Code Coverage for /home/runner/work/phpdftk/phpdftk/packages/pdf/reader/src/Parser/ObjectParser.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	93.98% covered (success)	93.98%	125 / 133	66.67% covered (warning)	66.67%	8 / 12	CRAP	0.00% covered (danger)	0.00%	0 / 1
ObjectParser	93.98% covered (success)	93.98%	125 / 133	66.67% covered (warning)	66.67%	8 / 12	59.76	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
parseValue	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
parseIndirectObject	100.00% covered (success)	100.00%	17 / 17	100.00% covered (success)	100.00%	1 / 1	6
parseTokenValue	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	12
parseIntegerOrReference	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	3
parseDictionaryOrStream	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	2
parseDictionary	93.33% covered (success)	93.33%	14 / 15	0.00% covered (danger)	0.00%	0 / 1	6.01
parseArray	88.89% covered (warning)	88.89%	8 / 9	0.00% covered (danger)	0.00%	0 / 1	4.02
parseStream	87.50% covered (warning)	87.50%	14 / 16	0.00% covered (danger)	0.00%	0 / 1	6.07
scanForEndstream	86.67% covered (warning)	86.67%	26 / 30	0.00% covered (danger)	0.00%	0 / 1	10.24
skipStreamTrailer	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	6
expect	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	2

1	<?php
2
3	declare(strict_types=1);
4
5	namespace Phpdftk\Pdf\Reader\Parser;
6
7	use Phpdftk\Pdf\Core\PdfArray;
8	use Phpdftk\Pdf\Core\PdfBoolean;
9	use Phpdftk\Pdf\Core\PdfDictionary;
10	use Phpdftk\Pdf\Core\PdfName;
11	use Phpdftk\Pdf\Core\PdfNull;
12	use Phpdftk\Pdf\Core\PdfNumber;
13	use Phpdftk\Pdf\Core\PdfReference;
14	use Phpdftk\Pdf\Core\PdfStream;
15	use Phpdftk\Pdf\Core\PdfString;
16	use Phpdftk\Pdf\Core\Serializable;
17	use Phpdftk\Pdf\Reader\Exception\InvalidPdfException;
18	use Phpdftk\Pdf\Reader\Tokenizer\Source;
19	use Phpdftk\Pdf\Reader\Tokenizer\Token;
20	use Phpdftk\Pdf\Reader\Tokenizer\Tokenizer;
21	use Phpdftk\Pdf\Reader\Tokenizer\TokenType;
22
/**
 * Recursive-descent PDF object parser.
 *
 * Consumes tokens from a {@see Tokenizer} and builds the core
 * `PdfDictionary`, `PdfArray`, `PdfName`, `PdfString`, `PdfNumber`,
 * `PdfBoolean`, `PdfNull`, `PdfReference`, and `PdfStream` instances.
 *
 * Raw stream bytes are read directly from the {@see Source}; everything
 * else goes through the tokenizer. The parser is deliberately lenient:
 * stray tokens and unterminated containers are skipped where possible
 * instead of aborting the parse.
 */
final class ObjectParser
{
    /** Maximum number of stray tokens skipped while looking for `endobj`. */
    private const MAX_ENDOBJ_SKIP = 5;

    /** Upper bound, in bytes, on the fallback scan for `endstream` (64 MB). */
    private const MAX_ENDSTREAM_SCAN = 64 * 1024 * 1024;

    public function __construct(
        private readonly Tokenizer $tokenizer,
        private readonly Source $source,
    ) {}

    /**
     * Parse any PDF value starting at the tokenizer's next token.
     *
     * @throws InvalidPdfException when the next token cannot start a value
     */
    public function parseValue(): Serializable
    {
        return $this->parseTokenValue($this->tokenizer->nextToken());
    }

    /**
     * Parse a complete indirect object: `X Y obj <value> endobj`.
     *
     * A missing or displaced `endobj` is tolerated (see skipToEndObj()).
     *
     * @return array{int, int, Serializable} [objNum, genNum, value]
     *
     * @throws InvalidPdfException when the `X Y obj` header is malformed
     */
    public function parseIndirectObject(): array
    {
        $objNumToken = $this->tokenizer->nextToken();
        $this->expect($objNumToken, TokenType::Integer, 'object number');

        $genNumToken = $this->tokenizer->nextToken();
        $this->expect($genNumToken, TokenType::Integer, 'generation number');

        $objToken = $this->tokenizer->nextToken();
        $this->expect($objToken, TokenType::ObjKeyword, 'obj keyword');

        // A dictionary followed by `stream` is returned as a PdfStream, so in
        // every case the next thing we expect to see is `endobj`.
        $value = $this->parseValue();
        $this->skipToEndObj();

        return [(int) $objNumToken->value, (int) $genNumToken->value, $value];
    }

    // -----------------------------------------------------------------------
    // Internal
    // -----------------------------------------------------------------------

    /**
     * Consume the closing `endobj`, tolerating a few stray tokens before it.
     *
     * Reads one token and, if it is neither `endobj` nor EOF, reads up to
     * MAX_ENDOBJ_SKIP further tokens looking for one of them. Giving up is
     * not an error: the value has already been parsed, and the caller is
     * expected to reposition the tokenizer for the next object.
     */
    private function skipToEndObj(): void
    {
        $token = $this->tokenizer->nextToken();
        $skipped = 0;

        while (
            $token->type !== TokenType::EndObjKeyword
            && $token->type !== TokenType::Eof
            && $skipped < self::MAX_ENDOBJ_SKIP
        ) {
            $token = $this->tokenizer->nextToken();
            $skipped++;
        }
    }

    /**
     * Build the value that starts with an already-consumed token.
     *
     * @throws InvalidPdfException for tokens that cannot start a value
     */
    private function parseTokenValue(Token $token): Serializable
    {
        return match ($token->type) {
            TokenType::DictStart => $this->parseDictionaryOrStream(),
            TokenType::ArrayStart => $this->parseArray(),
            TokenType::Name => new PdfName($token->value),
            TokenType::LiteralString => new PdfString($token->value),
            TokenType::HexString => new PdfString($token->value, hex: true),
            TokenType::Integer => $this->parseIntegerOrReference($token),
            TokenType::Real => new PdfNumber((float) $token->value),
            TokenType::Boolean => new PdfBoolean($token->value === 'true'),
            TokenType::Null => new PdfNull(),
            // Unknown keywords: skip and try the next token
            TokenType::Unknown => $this->parseValue(),
            default => throw new InvalidPdfException(
                "Unexpected token {$token->type->name} ('{$token->value}') at offset {$token->offset}",
            ),
        };
    }

    /**
     * After reading an integer, look ahead for `<int> R` (indirect
     * reference) or just return the integer.
     *
     * When the look-ahead finds a second integer that is not followed by
     * `R`, the tokenizer is rewound so that integer is parsed again as a
     * value of its own.
     */
    private function parseIntegerOrReference(Token $intToken): Serializable
    {
        $savedPos = $this->tokenizer->tell();
        $next = $this->tokenizer->peek();

        if ($next->type === TokenType::Integer) {
            $this->tokenizer->nextToken(); // consume the gen number
            if ($this->tokenizer->peek()->type === TokenType::RKeyword) {
                $this->tokenizer->nextToken(); // consume R
                return new PdfReference((int) $intToken->value, (int) $next->value);
            }
            // Not a reference — push back by seeking to saved position.
            $this->tokenizer->seek($savedPos);
        }

        return new PdfNumber((int) $intToken->value);
    }

    /**
     * Parse a dictionary (its `<<` already consumed) and, when a `stream`
     * keyword follows it, the stream data that belongs to it.
     */
    private function parseDictionaryOrStream(): Serializable
    {
        $dict = $this->parseDictionary();

        if ($this->tokenizer->peek()->type !== TokenType::StreamKeyword) {
            return $dict;
        }

        $this->tokenizer->nextToken(); // consume 'stream'
        return $this->parseStream($dict);
    }

    /**
     * Parse `/Key value` pairs up to the closing `>>`.
     *
     * An unclosed dictionary is tolerated at EOF, and any token found where
     * a key is expected that is not a name is skipped.
     */
    private function parseDictionary(): PdfDictionary
    {
        $dict = new PdfDictionary();

        while (true) {
            $token = $this->tokenizer->nextToken();
            if ($token->type === TokenType::DictEnd || $token->type === TokenType::Eof) {
                break;
            }
            if ($token->type !== TokenType::Name) {
                // Unknown or otherwise unexpected token in key position:
                // skip it and try to continue.
                continue;
            }

            $dict->set($token->value, $this->parseValue());
        }

        return $dict;
    }

    /**
     * Parse array items up to the closing `]`.
     *
     * An unclosed array is tolerated at EOF. Unknown tokens are skipped here
     * rather than delegated to parseTokenValue(): delegating would read the
     * following token as a value and throw if that token were the closing
     * `]` or EOF.
     */
    private function parseArray(): PdfArray
    {
        $items = [];

        while (true) {
            $token = $this->tokenizer->nextToken();
            if ($token->type === TokenType::ArrayEnd || $token->type === TokenType::Eof) {
                break;
            }
            if ($token->type === TokenType::Unknown) {
                continue;
            }
            $items[] = $this->parseTokenValue($token);
        }

        return new PdfArray($items);
    }

    /**
     * Read stream data after the `stream` keyword has been consumed.
     *
     * The data length comes from `/Length` in the dictionary. When that
     * entry is not a direct number (for example an indirect reference,
     * which cannot be resolved here), the length is found by scanning
     * forward for `endstream`.
     */
    private function parseStream(PdfDictionary $dict): PdfStream
    {
        $this->skipStreamKeywordEol();

        $length = $dict->get('Length');
        if ($length instanceof PdfNumber) {
            $streamLength = (int) $length->toPdf();
        } elseif (is_int($length)) {
            $streamLength = $length;
        } else {
            $streamLength = $this->scanForEndstream();
        }

        // A zero or negative length yields empty data without asking the
        // source for a zero-byte read.
        $data = $streamLength > 0 ? $this->source->read($streamLength) : '';

        // The data should be followed by an EOL and then `endstream`;
        // a missing EOL is tolerated.
        $this->skipStreamTrailer();

        return new PdfStream($dict, $data);
    }

    /**
     * Skip the EOL that follows the `stream` keyword (CR+LF, LF, or a lone
     * CR). Any other byte is left in place so that a missing EOL does not
     * cost the stream its first data byte.
     */
    private function skipStreamKeywordEol(): void
    {
        $byte = $this->source->peek();

        if ($byte === "\r") {
            $this->source->readByte();
            if ($this->source->peek() === "\n") {
                $this->source->readByte();
            }
        } elseif ($byte === "\n") {
            $this->source->readByte();
        }
    }

    /**
     * Fallback: scan forward for `endstream` to determine stream length.
     *
     * The scan is limited to MAX_ENDSTREAM_SCAN bytes and keeps only a small
     * sliding window in memory. A match counts only when `endstream` is
     * preceded by CR, LF or a space, or sits at the very start of the data.
     * A single EOL (CR+LF, LF or CR) directly before the keyword is excluded
     * from the length; earlier CR/LF bytes are kept because they may be
     * part of the payload.
     *
     * The source is always left at the position where the scan started.
     *
     * @return int the data length in bytes, or 0 when no `endstream` is found
     */
    private function scanForEndstream(): int
    {
        $start = $this->source->tell();
        $marker = 'endstream';
        $markerLen = strlen($marker);
        // The marker plus two preceding bytes: enough to check the boundary
        // and to recognise a CR+LF pair in front of the keyword.
        $windowLen = $markerLen + 2;

        $scanned = 0;
        $window = '';

        while (!$this->source->isEof() && $scanned < self::MAX_ENDSTREAM_SCAN) {
            $byte = $this->source->readByte();
            if ($byte === null) {
                break;
            }
            $scanned++;

            $window .= $byte;
            if (strlen($window) > $windowLen) {
                $window = substr($window, -$windowLen);
            }

            if (!str_ends_with($window, $marker)) {
                continue;
            }

            // Bytes held in the window in front of the marker (0 to 2).
            $before = substr($window, 0, -$markerLen);
            if ($before !== '' && !in_array($before[-1], ["\n", "\r", ' '], true)) {
                // False match inside binary data — keep scanning
                continue;
            }

            $eolLen = match (true) {
                str_ends_with($before, "\r\n") => 2,
                str_ends_with($before, "\n"), str_ends_with($before, "\r") => 1,
                default => 0,
            };

            $length = $this->source->tell() - $markerLen - $eolLen - $start;
            $this->source->seek($start);

            return $length;
        }

        $this->source->seek($start);
        return 0;
    }

    /**
     * Skip CR, LF and space bytes after the stream data, then consume the
     * `endstream` keyword if it is the next token. Anything else is left
     * untouched.
     */
    private function skipStreamTrailer(): void
    {
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            if ($byte !== "\r" && $byte !== "\n" && $byte !== ' ') {
                break;
            }
            $this->source->readByte();
        }

        if ($this->tokenizer->peek()->type === TokenType::EndStreamKeyword) {
            $this->tokenizer->nextToken();
        }
    }

    /**
     * Assert that a token has the expected type.
     *
     * @param string $context human-readable name of what was expected,
     *                        used in the exception message
     *
     * @throws InvalidPdfException when the token type differs
     */
    private function expect(Token $token, TokenType $expected, string $context): void
    {
        if ($token->type !== $expected) {
            throw new InvalidPdfException(
                "Expected $context ({$expected->name}) at offset {$token->offset}, "
                . "got {$token->type->name} ('{$token->value}')",
            );
        }
    }
}