Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
93.98% covered (success)
93.98%
125 / 133
66.67% covered (warning)
66.67%
8 / 12
CRAP
0.00% covered (danger)
0.00%
0 / 1
ObjectParser
93.98% covered (success)
93.98%
125 / 133
66.67% covered (warning)
66.67%
8 / 12
59.76
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 parseValue
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 parseIndirectObject
100.00% covered (success)
100.00%
17 / 17
100.00% covered (success)
100.00%
1 / 1
6
 parseTokenValue
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
12
 parseIntegerOrReference
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
3
 parseDictionaryOrStream
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 parseDictionary
93.33% covered (success)
93.33%
14 / 15
0.00% covered (danger)
0.00%
0 / 1
6.01
 parseArray
88.89% covered (warning)
88.89%
8 / 9
0.00% covered (danger)
0.00%
0 / 1
4.02
 parseStream
87.50% covered (warning)
87.50%
14 / 16
0.00% covered (danger)
0.00%
0 / 1
6.07
 scanForEndstream
86.67% covered (warning)
86.67%
26 / 30
0.00% covered (danger)
0.00%
0 / 1
10.24
 skipStreamTrailer
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
6
 expect
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
1<?php
2
3declare(strict_types=1);
4
5namespace Phpdftk\Pdf\Reader\Parser;
6
7use Phpdftk\Pdf\Core\PdfArray;
8use Phpdftk\Pdf\Core\PdfBoolean;
9use Phpdftk\Pdf\Core\PdfDictionary;
10use Phpdftk\Pdf\Core\PdfName;
11use Phpdftk\Pdf\Core\PdfNull;
12use Phpdftk\Pdf\Core\PdfNumber;
13use Phpdftk\Pdf\Core\PdfReference;
14use Phpdftk\Pdf\Core\PdfStream;
15use Phpdftk\Pdf\Core\PdfString;
16use Phpdftk\Pdf\Core\Serializable;
17use Phpdftk\Pdf\Reader\Exception\InvalidPdfException;
18use Phpdftk\Pdf\Reader\Tokenizer\Source;
19use Phpdftk\Pdf\Reader\Tokenizer\Token;
20use Phpdftk\Pdf\Reader\Tokenizer\Tokenizer;
21use Phpdftk\Pdf\Reader\Tokenizer\TokenType;
22
/**
 * Recursive-descent PDF object parser.
 *
 * Consumes tokens from a {@see Tokenizer} and builds the core
 * `PdfDictionary`, `PdfArray`, `PdfName`, `PdfString`, `PdfNumber`,
 * `PdfBoolean`, `PdfNull`, `PdfReference`, and `PdfStream` instances.
 *
 * The parser is intentionally tolerant of malformed input: unclosed
 * dictionaries/arrays at EOF, stray tokens before `endobj`, unknown
 * keywords, and unresolved stream lengths are recovered from instead of
 * raising. Raw stream bytes are read directly from the {@see Source}.
 */
final class ObjectParser
{
    /** Maximum number of stray tokens skipped while looking for `endobj`. */
    private const MAX_ENDOBJ_SKIP = 5;

    /** Upper bound (in bytes) for the `endstream` fallback scan: 64 MB. */
    private const MAX_ENDSTREAM_SCAN = 64 * 1024 * 1024;

    /** Keyword that terminates stream data. */
    private const ENDSTREAM_MARKER = 'endstream';

    /**
     * @param Tokenizer $tokenizer Token stream the object syntax is read from.
     * @param Source    $source    Byte source used for raw stream data.
     */
    public function __construct(
        private readonly Tokenizer $tokenizer,
        private readonly Source $source,
    ) {}

    /**
     * Parse any PDF value starting at the tokenizer's next token.
     *
     * @throws InvalidPdfException If the next token cannot start a value.
     */
    public function parseValue(): Serializable
    {
        return $this->parseTokenValue($this->tokenizer->nextToken());
    }

    /**
     * Parse a complete indirect object: `X Y obj <value> endobj`.
     *
     * A missing `endobj` is tolerated: up to {@see self::MAX_ENDOBJ_SKIP}
     * stray tokens are skipped while looking for it.
     *
     * @return array{int, int, Serializable} [objNum, genNum, value]
     *
     * @throws InvalidPdfException If the `X Y obj` header is malformed.
     */
    public function parseIndirectObject(): array
    {
        $objNumToken = $this->tokenizer->nextToken();
        $this->expect($objNumToken, TokenType::Integer, 'object number');

        $genNumToken = $this->tokenizer->nextToken();
        $this->expect($genNumToken, TokenType::Integer, 'generation number');

        $objToken = $this->tokenizer->nextToken();
        $this->expect($objToken, TokenType::ObjKeyword, 'obj keyword');

        // A dictionary followed by `stream` has already been turned into a
        // PdfStream by parseValue(), so `endobj` is expected next either way.
        $value = $this->parseValue();

        $this->skipToEndObj();

        return [(int) $objNumToken->value, (int) $genNumToken->value, $value];
    }

    // -----------------------------------------------------------------------
    // Internal
    // -----------------------------------------------------------------------

    /**
     * Consume tokens up to and including `endobj`.
     *
     * Some generators put extra data between the value and `endobj`, so a
     * bounded number of stray tokens is skipped. If `endobj` still is not
     * found the method simply returns: the object value is already parsed,
     * and the original author's note is that the xref table resyncs the
     * position for the next object.
     */
    private function skipToEndObj(): void
    {
        $token = $this->tokenizer->nextToken();
        $skipped = 0;

        while (
            $token->type !== TokenType::EndObjKeyword
            && $token->type !== TokenType::Eof
            && $skipped < self::MAX_ENDOBJ_SKIP
        ) {
            $token = $this->tokenizer->nextToken();
            $skipped++;
        }
    }

    /**
     * Build the value that starts with the given, already consumed, token.
     *
     * @throws InvalidPdfException If the token cannot start a PDF value.
     */
    private function parseTokenValue(Token $token): Serializable
    {
        return match ($token->type) {
            TokenType::DictStart      => $this->parseDictionaryOrStream(),
            TokenType::ArrayStart     => $this->parseArray(),
            TokenType::Name           => new PdfName($token->value),
            TokenType::LiteralString  => new PdfString($token->value),
            TokenType::HexString      => new PdfString($token->value, hex: true),
            TokenType::Integer        => $this->parseIntegerOrReference($token),
            TokenType::Real           => new PdfNumber((float) $token->value),
            TokenType::Boolean        => new PdfBoolean($token->value === 'true'),
            TokenType::Null           => new PdfNull(),
            // Unknown keywords: skip and try the next token
            TokenType::Unknown        => $this->parseValue(),
            default                   => throw new InvalidPdfException(
                "Unexpected token {$token->type->name} ('{$token->value}') at offset {$token->offset}",
            ),
        };
    }

    /**
     * After reading an integer, look ahead for `<int> R` (indirect
     * reference) or just return the integer.
     *
     * When the look-ahead does not complete a reference, the tokenizer is
     * rewound to where it stood right after the first integer.
     */
    private function parseIntegerOrReference(Token $intToken): Serializable
    {
        $savedPos = $this->tokenizer->tell();
        $next = $this->tokenizer->peek();

        if ($next->type === TokenType::Integer) {
            $this->tokenizer->nextToken(); // consume the gen number
            if ($this->tokenizer->peek()->type === TokenType::RKeyword) {
                $this->tokenizer->nextToken(); // consume R
                return new PdfReference((int) $intToken->value, (int) $next->value);
            }
            // Not a reference — push back by seeking to saved position.
            $this->tokenizer->seek($savedPos);
        }

        return new PdfNumber((int) $intToken->value);
    }

    /**
     * Parse a dictionary (the `<<` is already consumed) and, when it is
     * directly followed by the `stream` keyword, the stream data as well.
     */
    private function parseDictionaryOrStream(): Serializable
    {
        $dict = $this->parseDictionary();

        if ($this->tokenizer->peek()->type !== TokenType::StreamKeyword) {
            return $dict;
        }

        $this->tokenizer->nextToken(); // consume 'stream'
        return $this->parseStream($dict);
    }

    /**
     * Parse dictionary entries up to the closing `>>`.
     *
     * Tolerant by design: an unclosed dictionary at EOF is accepted, and any
     * token found where a key is expected that is not a name (including
     * unknown keywords) is skipped.
     */
    private function parseDictionary(): PdfDictionary
    {
        $dict = new PdfDictionary();

        while (true) {
            $token = $this->tokenizer->nextToken();

            if ($token->type === TokenType::DictEnd || $token->type === TokenType::Eof) {
                break;
            }
            if ($token->type !== TokenType::Name) {
                // Not a key: skip the unexpected token and try to continue.
                continue;
            }

            $dict->set($token->value, $this->parseValue());
        }

        return $dict;
    }

    /**
     * Parse array items up to the closing `]` (the `[` is already consumed).
     * An unclosed array at EOF is tolerated.
     *
     * @throws InvalidPdfException If an item starts with an unexpected token.
     */
    private function parseArray(): PdfArray
    {
        $items = [];

        while (true) {
            $token = $this->tokenizer->nextToken();
            if ($token->type === TokenType::ArrayEnd || $token->type === TokenType::Eof) {
                break;
            }
            $items[] = $this->parseTokenValue($token);
        }

        return new PdfArray($items);
    }

    /**
     * Read stream data after the `stream` keyword has been consumed.
     * The `stream` keyword must be followed by a single EOL (LF or CR+LF).
     * The data length comes from `/Length` in the dictionary; when it is not
     * directly available the data is delimited by scanning for `endstream`.
     */
    private function parseStream(PdfDictionary $dict): PdfStream
    {
        // Skip the mandatory EOL after 'stream'. One byte is always consumed:
        // LF is the whole EOL, CR may be followed by LF, anything else is
        // tolerated.
        $byte = $this->source->readByte();
        if ($byte === "\r" && $this->source->peek() === "\n") {
            $this->source->readByte();
        }

        $length = $dict->get('Length');
        if ($length instanceof PdfNumber) {
            $streamLength = (int) $length->toPdf();
        } elseif (is_int($length)) {
            $streamLength = $length;
        } else {
            // If Length is an indirect reference, we cannot resolve it here
            // because we don't have the resolver yet. Fall back to scanning
            // for 'endstream'.
            $streamLength = $this->scanForEndstream();
        }

        // A negative declared length is treated as an empty stream.
        $data = $streamLength >= 0 ? $this->source->read($streamLength) : '';

        // Consume the trailing EOL + endstream keyword.
        // The spec says data is followed by an EOL then 'endstream'.
        // Tolerate missing EOL.
        $this->skipStreamTrailer();

        return new PdfStream($dict, $data);
    }

    /**
     * Fallback: scan forward for `endstream` to determine stream length.
     *
     * Limits scan to 64 MB to prevent OOM on corrupted/truncated streams.
     * The source position is always restored to where the scan started.
     *
     * @return int Length of the stream data in bytes, excluding the single
     *             EOL marker that precedes `endstream`; 0 if no marker is
     *             found within the scan limit.
     */
    private function scanForEndstream(): int
    {
        $start = $this->source->tell();
        $markerLen = strlen(self::ENDSTREAM_MARKER);

        // Sliding window: the marker plus one preceding byte is all that is
        // needed, so memory stays constant regardless of stream size.
        $windowSize = $markerLen + 1;
        $window = '';
        $scanned = 0;

        while (!$this->source->isEof() && $scanned < self::MAX_ENDSTREAM_SCAN) {
            $byte = $this->source->readByte();
            if ($byte === null) {
                break;
            }
            $scanned++;
            $window .= $byte;

            if (strlen($window) > $windowSize) {
                $window = substr($window, -$windowSize);
            }

            if (!str_ends_with($window, self::ENDSTREAM_MARKER)) {
                continue;
            }

            // Validate boundary: "endstream" must be preceded by
            // whitespace (CR, LF, or space) or be at the start of data.
            $markerStart = strlen($window) - $markerLen;
            if ($markerStart > 0) {
                $preceding = $window[$markerStart - 1];
                if ($preceding !== "\n" && $preceding !== "\r" && $preceding !== ' ') {
                    // False match inside binary data — keep scanning
                    continue;
                }
            }

            $endPos = $this->source->tell() - $markerLen;
            $length = $endPos - $start;

            // Only the single EOL marker before `endstream` is not part of
            // the data. Stripping every trailing CR/LF would truncate binary
            // data that happens to end in those bytes, so inspect just the
            // last two bytes instead of loading the whole stream.
            $tailLen = min(2, $length);
            if ($tailLen > 0) {
                $this->source->seek($endPos - $tailLen);
                $tail = $this->source->read($tailLen);

                if (str_ends_with($tail, "\r\n")) {
                    $length -= 2;
                } elseif (str_ends_with($tail, "\n") || str_ends_with($tail, "\r")) {
                    $length -= 1;
                }
            }

            $this->source->seek($start);
            return $length;
        }

        $this->source->seek($start);
        return 0;
    }

    /**
     * Skip the EOL/whitespace that follows the stream data and consume the
     * `endstream` keyword when it is the next token. A missing keyword is
     * tolerated.
     */
    private function skipStreamTrailer(): void
    {
        // Skip whitespace/EOL between stream data and 'endstream'
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            if ($byte !== "\r" && $byte !== "\n" && $byte !== ' ') {
                break;
            }
            $this->source->readByte();
        }

        // Try to consume 'endstream' keyword via the tokenizer
        if ($this->tokenizer->peek()->type === TokenType::EndStreamKeyword) {
            $this->tokenizer->nextToken();
        }
    }

    /**
     * Assert that a token has the expected type.
     *
     * @param Token     $token    Token that was read.
     * @param TokenType $expected Type the grammar requires at this point.
     * @param string    $context  Human-readable name of the expected element,
     *                            used in the error message.
     *
     * @throws InvalidPdfException If the token type differs from $expected.
     */
    private function expect(Token $token, TokenType $expected, string $context): void
    {
        if ($token->type === $expected) {
            return;
        }

        throw new InvalidPdfException(
            "Expected {$context} ({$expected->name}) at offset {$token->offset}"
            . ", got {$token->type->name} ('{$token->value}')",
        );
    }
}