Code Coverage for /home/runner/work/phpdftk/phpdftk/packages/pdf/reader/src/Parser/XrefParser.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	90.77% covered (success)	90.77%	59 / 65	80.00% covered (warning)	80.00%	4 / 5	CRAP	0.00% covered (danger)	0.00%	0 / 1
XrefParser	90.77% covered (success)	90.77%	59 / 65	80.00% covered (warning)	80.00%	4 / 5	38.08	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
parseClassicXref	85.00% covered (warning)	85.00%	34 / 40	0.00% covered (danger)	0.00%	0 / 1	11.41
readLine	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	4
skipWhitespace	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	9
readWord	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	12

1	<?php
2
3	declare(strict_types=1);
4
5	namespace Phpdftk\Pdf\Reader\Parser;
6
7	use Phpdftk\Pdf\Core\PdfDictionary;
8	use Phpdftk\Pdf\Reader\Exception\InvalidPdfException;
9	use Phpdftk\Pdf\Reader\Tokenizer\Source;
10	use Phpdftk\Pdf\Reader\Tokenizer\Tokenizer;
11	use Phpdftk\Pdf\Reader\XrefEntry;
12
/**
 * Parses a classic cross-reference table and its trailer dictionary.
 *
 * The fixed-format xref section is read with raw byte reads on the Source
 * (to avoid interleaving tokenized and raw reads on the same Source); the
 * Tokenizer is then re-synced to the Source position and the ObjectParser
 * is used for the trailer dictionary only.
 */
final class XrefParser
{
    /** Bytes treated as whitespace while scanning the xref section. */
    private const WHITESPACE_BYTES = ["\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20"];

    /** Delimiter bytes that end a word even without preceding whitespace (e.g. "trailer<<"). */
    private const WORD_DELIMITER_BYTES = ['<', '/', '['];

    /** Maximum bytes read per entry line: 20 per spec, plus slack for sloppy producers. */
    private const MAX_ENTRY_LINE_BYTES = 24;

    public function __construct(
        private readonly Tokenizer $tokenizer,
        private readonly Source $source,
        private readonly ObjectParser $objectParser,
    ) {}

    /**
     * Parse a classic xref table at the given byte offset.
     *
     * In strict mode any malformed subsection header or entry raises an
     * exception. In lenient mode malformed entries are skipped and a message
     * is appended to $warnings; a malformed subsection header is reported in
     * $warnings and its words are cast to integers as-is.
     *
     * @param int          $offset   Byte offset at which the "xref" keyword is expected.
     * @param bool         $strict   Whether malformed content throws instead of being skipped.
     * @param list<string> $warnings Receives one message per problem tolerated in lenient mode.
     * @return array{0: array<int, XrefEntry>, 1: PdfDictionary} Entries keyed by object number, and the trailer.
     *
     * @throws InvalidPdfException If the "xref" keyword is missing, the table ends before
     *                             "trailer", an entry or header is malformed (strict mode),
     *                             or the trailer is not a dictionary.
     */
    public function parseClassicXref(int $offset, bool $strict = true, array &$warnings = []): array
    {
        $this->source->seek($offset);

        // Read and verify the "xref" keyword.
        $this->skipWhitespace();
        $keyword = $this->readWord();
        if ($keyword !== 'xref') {
            throw new InvalidPdfException(
                "Expected 'xref' at offset $offset, got '$keyword'",
            );
        }

        $entries = [];

        // Parse subsections until we hit "trailer".
        while (true) {
            $this->skipWhitespace();
            $word = $this->readWord();
            if ($word === 'trailer') {
                break;
            }
            if ($word === '' || $this->source->isEof()) {
                throw new InvalidPdfException(
                    "Unexpected end of xref table at offset " . $this->source->tell() . ": expected 'trailer'",
                );
            }

            $this->skipWhitespace();
            $countWord = $this->readWord();

            // A subsection header is "<first object number> <entry count>", both
            // unsigned decimal integers. Do not let garbage silently become 0.
            if (preg_match('/^\d+$/D', $word) !== 1 || preg_match('/^\d+$/D', $countWord) !== 1) {
                $problem = "Malformed xref subsection header near offset " . $this->source->tell()
                    . ": '$word $countWord'";
                if ($strict) {
                    throw new InvalidPdfException($problem);
                }
                $warnings[] = $problem;
            }

            $firstObj = (int) $word;
            $count = (int) $countWord;

            for ($i = 0; $i < $count; $i++) {
                // Entries always start with a digit, so skipping whitespace here is
                // safe and tolerates stray blank lines / extra EOL bytes between entries.
                $this->skipWhitespace();

                // Spec says exactly 20 bytes per entry, but some producers write 21
                // (extra space before CRLF). Be tolerant: read a bounded line, then parse.
                $line = $this->readLine(self::MAX_ENTRY_LINE_BYTES);
                if (!preg_match('/^(\d{10})\s+(\d{5})\s+([nf])/', $line, $em)) {
                    if ($strict) {
                        throw new InvalidPdfException(
                            "Malformed xref entry at offset " . $this->source->tell() . ": '$line'",
                        );
                    }
                    // Nothing left to read: stop now instead of looping (and piling up
                    // warnings) for the rest of a possibly huge declared count.
                    if ($line === '' && $this->source->isEof()) {
                        throw new InvalidPdfException(
                            "Unexpected end of xref table at offset " . $this->source->tell() . ": expected 'trailer'",
                        );
                    }
                    $warnings[] = "Skipped malformed xref entry for object " . ($firstObj + $i) . ": '$line'";
                    continue;
                }

                $entryOffset = (int) $em[1];
                $gen = (int) $em[2];
                $type = ($em[3] === 'f') ? XrefEntry::TYPE_FREE : XrefEntry::TYPE_IN_USE;
                $entries[$firstObj + $i] = new XrefEntry($type, $entryOffset, $gen);
            }
        }

        // The source is now positioned right after "trailer".
        // Sync the tokenizer to this position and parse the trailer dict.
        $this->tokenizer->seek($this->source->tell());
        $trailer = $this->objectParser->parseValue();
        if (!$trailer instanceof PdfDictionary) {
            throw new InvalidPdfException('Trailer is not a dictionary');
        }

        return [$entries, $trailer];
    }

    /**
     * Read up to $maxBytes, stopping at (and consuming) the first end-of-line.
     *
     * LF, CR LF and a bare CR are all accepted as terminators, because xref
     * entries may legally end in "SP CR" with no LF at all. The terminator is
     * not included in the returned string.
     */
    private function readLine(int $maxBytes): string
    {
        $line = '';
        for ($i = 0; $i < $maxBytes; $i++) {
            $byte = $this->source->readByte();
            if ($byte === null || $byte === "\n") {
                break;
            }
            if ($byte === "\r") {
                // Treat CR LF as a single terminator; a bare CR stands on its own.
                if (!$this->source->isEof() && $this->source->peek() === "\n") {
                    $this->source->readByte();
                }
                break;
            }
            $line .= $byte;
        }

        return $line;
    }

    /**
     * Advance the source past any run of whitespace bytes (NUL, TAB, LF, FF, CR, SP).
     */
    private function skipWhitespace(): void
    {
        while (!$this->source->isEof()) {
            if (!in_array($this->source->peek(), self::WHITESPACE_BYTES, true)) {
                return;
            }
            $this->source->readByte();
        }
    }

    /**
     * Read a contiguous run of non-whitespace, non-delimiter bytes.
     *
     * Returns an empty string when the source is at EOF or the next byte is
     * whitespace or one of the delimiters '<', '/', '['.
     */
    private function readWord(): string
    {
        $word = '';
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            if (
                $byte === ''
                || in_array($byte, self::WHITESPACE_BYTES, true)
                || in_array($byte, self::WORD_DELIMITER_BYTES, true)
            ) {
                break;
            }
            $word .= $this->source->readByte();
        }

        return $word;
    }
}