Code Coverage for /home/runner/work/phpdftk/phpdftk/packages/pdf/reader/src/Tokenizer/Tokenizer.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	95.24% covered (success)	95.24%	160 / 168	73.68% covered (warning)	73.68%	14 / 19	CRAP	0.00% covered (danger)	0.00%	0 / 1
Tokenizer	95.24% covered (success)	95.24%	160 / 168	73.68% covered (warning)	73.68%	14 / 19	111	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
getSource	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
nextToken	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	2
peek	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2
seek	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
tell	66.67% covered (warning)	66.67%	2 / 3	0.00% covered (danger)	0.00%	0 / 1	2.15
readToken	100.00% covered (success)	100.00%	19 / 19	100.00% covered (success)	100.00%	1 / 1	13
skipWhitespaceAndComments	100.00% covered (success)	100.00%	16 / 16	100.00% covered (success)	100.00%	1 / 1	13
readName	100.00% covered (success)	100.00%	12 / 12	100.00% covered (success)	100.00%	1 / 1	6
readLiteralString	94.12% covered (success)	94.12%	16 / 17	0.00% covered (danger)	0.00%	0 / 1	8.01
readEscapeSequence	80.00% covered (warning)	80.00%	12 / 15	0.00% covered (danger)	0.00%	0 / 1	14.35
handleLineContinuation	66.67% covered (warning)	66.67%	2 / 3	0.00% covered (danger)	0.00%	0 / 1	2.15
readOctalOrLiteral	77.78% covered (warning)	77.78%	7 / 9	0.00% covered (danger)	0.00%	0 / 1	6.40
readAngleBracketToken	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	2
readHexString	100.00% covered (success)	100.00%	13 / 13	100.00% covered (success)	100.00%	1 / 1	12
readDictEnd	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	2
readNumber	100.00% covered (success)	100.00%	15 / 15	100.00% covered (success)	100.00%	1 / 1	7
readKeyword	100.00% covered (success)	100.00%	19 / 19	100.00% covered (success)	100.00%	1 / 1	15
isDelimiterOrWhitespace	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	3

1	<?php
2
3	declare(strict_types=1);
4
5	namespace Phpdftk\Pdf\Reader\Tokenizer;
6
7	use Phpdftk\Pdf\Reader\Exception\InvalidPdfException;
8
/**
 * PDF tokenizer — converts a byte stream into a sequence of typed tokens.
 *
 * Handles PDF syntax per ISO 32000-2 §7.2–7.3: whitespace, comments,
 * names (with `#XX` escaping), literal strings (balanced parens,
 * backslash escapes, octal), hex strings, integers, reals, booleans,
 * null, delimiters (`[`, `]`, `<<`, `>>`), and keywords (`obj`,
 * `endobj`, `stream`, `endstream`, `R`, `xref`, `trailer`, `startxref`).
 *
 * The tokenizer is deliberately lenient: nothing in this class throws on
 * malformed input. Unterminated strings end at EOF, a lone `>` is reported
 * as `>>`, and unrecognised words are returned as TokenType::Unknown.
 *
 * A single token of lookahead is supported through peek().
 */
final class Tokenizer
{
    /** Characters accepted as hexadecimal digits in `#XX` escapes and hex strings. */
    private const HEX_DIGITS = '0123456789abcdefABCDEF';

    /** Lookahead token produced by peek() and not yet returned by nextToken(). */
    private ?Token $peeked = null;

    public function __construct(private readonly Source $source) {}

    /**
     * Returns the underlying byte source this tokenizer reads from.
     */
    public function getSource(): Source
    {
        return $this->source;
    }

    /**
     * Returns the next token and advances past it.
     *
     * If a token was buffered by peek(), that token is returned (and the
     * buffer cleared) instead of reading from the source again.
     */
    public function nextToken(): Token
    {
        if ($this->peeked !== null) {
            $token = $this->peeked;
            $this->peeked = null;
            return $token;
        }
        return $this->readToken();
    }

    /**
     * Returns the next token without consuming it.
     *
     * Repeated calls return the same buffered token until nextToken() or
     * seek() is called.
     */
    public function peek(): Token
    {
        if ($this->peeked === null) {
            $this->peeked = $this->readToken();
        }
        return $this->peeked;
    }

    /**
     * Repositions the source at $offset and discards any buffered lookahead.
     */
    public function seek(int $offset): void
    {
        $this->peeked = null;
        $this->source->seek($offset);
    }

    /**
     * Returns the logical read position.
     *
     * While a lookahead token is buffered the source has already moved past
     * it, so the buffered token's own offset is reported instead of the
     * source position.
     */
    public function tell(): int
    {
        if ($this->peeked !== null) {
            return $this->peeked->offset;
        }
        return $this->source->tell();
    }

    // -----------------------------------------------------------------------
    // Internal
    // -----------------------------------------------------------------------

    /**
     * Reads one token from the source, dispatching on its first byte.
     *
     * Leading whitespace and comments are skipped first. At end of input an
     * Eof token with an empty value is returned.
     */
    private function readToken(): Token
    {
        $this->skipWhitespaceAndComments();

        if ($this->source->isEof()) {
            return new Token(TokenType::Eof, '', $this->source->tell());
        }

        $offset = $this->source->tell();
        $byte = $this->source->readByte();
        if ($byte === null) {
            return new Token(TokenType::Eof, '', $offset);
        }

        return match ($byte) {
            '/' => $this->readName($offset),
            '(' => $this->readLiteralString($offset),
            '<' => $this->readAngleBracketToken($offset),
            '>' => $this->readDictEnd($offset),
            '[' => new Token(TokenType::ArrayStart, '[', $offset),
            ']' => new Token(TokenType::ArrayEnd, ']', $offset),
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
            '+', '-', '.' => $this->readNumber($byte, $offset),
            default => $this->readKeyword($byte, $offset),
        };
    }

    /**
     * Advances the source past any run of whitespace and `%` comments.
     *
     * A comment extends from `%` up to and including the first LF or CR
     * (or to EOF).
     */
    private function skipWhitespaceAndComments(): void
    {
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            if ($byte === '') {
                return;
            }

            if ($this->isWhitespace($byte)) {
                $this->source->readByte();
                continue;
            }

            // Comment: skip to end of line
            if ($byte === '%') {
                $this->source->readByte();
                while (!$this->source->isEof()) {
                    $c = $this->source->readByte();
                    if ($c === "\x0A" || $c === "\x0D") {
                        break;
                    }
                }
                continue;
            }

            return;
        }
    }

    /**
     * Reads a name object; the leading `/` has already been consumed.
     *
     * The name runs until the next delimiter, whitespace byte or EOF.
     * `#XX` sequences with two hexadecimal digits are decoded to the byte
     * they denote. A `#` that is NOT followed by two hex digits is kept as
     * a literal `#` and the bytes after it are re-scanned normally, so a
     * malformed escape can never swallow delimiters or whitespace that
     * belong to the following tokens.
     *
     * @param int $offset Offset of the leading `/`, recorded on the token.
     */
    private function readName(int $offset): Token
    {
        $name = '';
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            if ($byte === '' || $this->isDelimiterOrWhitespace($byte)) {
                break;
            }
            $this->source->readByte();

            if ($byte !== '#') {
                $name .= $byte;
                continue;
            }

            // Remember where the candidate hex digits start so the read can
            // be undone if they turn out not to be a valid escape.
            $afterHash = $this->source->tell();
            $hex = $this->source->read(2);
            if (strlen($hex) === 2 && strspn($hex, self::HEX_DIGITS) === 2) {
                $name .= chr((int) hexdec($hex));
                continue;
            }

            // Malformed escape: rewind and treat '#' as an ordinary character.
            $this->source->seek($afterHash);
            $name .= '#';
        }
        return new Token(TokenType::Name, $name, $offset);
    }

    /**
     * Reads a literal string; the opening `(` has already been consumed.
     *
     * Unescaped parentheses must balance and are kept in the value (except
     * the final closing one). Backslash escapes are decoded. If EOF is hit
     * before the string is closed, whatever was read so far is returned.
     *
     * NOTE(review): unescaped CR / CRLF bytes inside the string are kept
     * verbatim here; ISO 32000 §7.3.4.2 describes reading an unescaped
     * end-of-line marker as a single LF — confirm whether callers expect
     * that normalisation.
     *
     * @param int $offset Offset of the opening `(`, recorded on the token.
     */
    private function readLiteralString(int $offset): Token
    {
        $result = '';
        $depth = 1;
        while ($depth > 0 && !$this->source->isEof()) {
            $byte = $this->source->readByte();
            if ($byte === null) {
                break;
            }

            if ($byte === '(') {
                $depth++;
                $result .= '(';
            } elseif ($byte === ')') {
                $depth--;
                // The parenthesis that closes the string itself is not part of the value.
                if ($depth > 0) {
                    $result .= ')';
                }
            } elseif ($byte === '\\') {
                $result .= $this->readEscapeSequence();
            } else {
                $result .= $byte;
            }
        }
        return new Token(TokenType::LiteralString, $result, $offset);
    }

    /**
     * Decodes one escape sequence; the backslash has already been consumed.
     *
     * @return string The decoded bytes: empty for a line continuation or
     *                when the backslash is the last byte of the input.
     */
    private function readEscapeSequence(): string
    {
        $next = $this->source->readByte();
        if ($next === null) {
            return '';
        }
        return match ($next) {
            'n' => "\n",
            'r' => "\r",
            't' => "\t",
            'b' => "\x08",
            'f' => "\x0C",
            '(' => '(',
            ')' => ')',
            '\\' => '\\',
            "\r" => $this->handleLineContinuation(),
            "\n" => '', // line continuation
            default => $this->readOctalOrLiteral($next),
        };
    }

    /**
     * Finishes a backslash-CR line continuation.
     *
     * Consumes a directly following LF so that `\` + CRLF counts as one
     * continuation, and contributes nothing to the string value.
     */
    private function handleLineContinuation(): string
    {
        if ($this->source->peek() === "\n") {
            $this->source->readByte();
        }
        return '';
    }

    /**
     * Decodes a `\ddd` octal escape, or returns the character after an
     * unrecognised backslash unchanged.
     *
     * @param string $firstChar The byte that followed the backslash.
     */
    private function readOctalOrLiteral(string $firstChar): string
    {
        if ($firstChar >= '0' && $firstChar <= '7') {
            $octal = $firstChar;
            // Up to two more octal digits may follow the first one.
            for ($i = 0; $i < 2; $i++) {
                $next = $this->source->peek();
                if ($next >= '0' && $next <= '7') {
                    $octal .= $this->source->readByte();
                } else {
                    break;
                }
            }
            // Three digits can encode up to 0o777 (511). The spec says
            // high-order overflow is ignored, so mask to a single byte
            // explicitly instead of handing chr() an out-of-range value.
            return chr(((int) octdec($octal)) & 0xFF);
        }
        // Unknown escape: the spec says the backslash is ignored
        return $firstChar;
    }

    /**
     * Disambiguates `<<` (dictionary start) from `<` (hex string start);
     * the first `<` has already been consumed.
     *
     * @param int $offset Offset of the first `<`, recorded on the token.
     */
    private function readAngleBracketToken(int $offset): Token
    {
        $next = $this->source->peek();
        if ($next === '<') {
            $this->source->readByte();
            return new Token(TokenType::DictStart, '<<', $offset);
        }
        return $this->readHexString($offset);
    }

    /**
     * Reads a hex string up to the closing `>` (or EOF) and decodes it.
     *
     * Whitespace between digits is ignored and an odd number of digits is
     * padded with a trailing `0`. If any non-hexadecimal byte is present the
     * token value is the empty string.
     *
     * @param int $offset Offset of the opening `<`, recorded on the token.
     */
    private function readHexString(int $offset): Token
    {
        $hex = '';
        while (!$this->source->isEof()) {
            $byte = $this->source->readByte();
            if ($byte === null || $byte === '>') {
                break;
            }
            // Skip whitespace inside hex strings
            if ($this->isWhitespace($byte)) {
                continue;
            }
            $hex .= $byte;
        }
        // Odd length: append trailing 0
        if (strlen($hex) % 2 !== 0) {
            $hex .= '0';
        }
        // hex2bin() raises E_WARNING on non-hex input before returning false.
        // Validate up front so malformed strings yield '' without a warning.
        if (strspn($hex, self::HEX_DIGITS) !== strlen($hex)) {
            return new Token(TokenType::HexString, '', $offset);
        }
        $decoded = hex2bin($hex);
        return new Token(TokenType::HexString, $decoded === false ? '' : $decoded, $offset);
    }

    /**
     * Produces the `>>` token; the first `>` has already been consumed.
     *
     * A second `>` is consumed when present. A lone `>` is tolerated and
     * still reported as `>>` (some malformed PDFs).
     *
     * @param int $offset Offset of the first `>`, recorded on the token.
     */
    private function readDictEnd(int $offset): Token
    {
        if ($this->source->peek() === '>') {
            $this->source->readByte();
        }
        return new Token(TokenType::DictEnd, '>>', $offset);
    }

    /**
     * Reads the remainder of a numeric token.
     *
     * Digits are accumulated, and at most one `.` is accepted; its presence
     * makes the token a Real instead of an Integer. The token value is the
     * raw text, including any leading sign. No check is made that at least
     * one digit was read, so a lone `+`, `-` or `.` is returned as-is.
     *
     * @param string $first  Already-consumed first byte (digit, sign or `.`).
     * @param int    $offset Offset of $first, recorded on the token.
     */
    private function readNumber(string $first, int $offset): Token
    {
        $num = $first;
        $isReal = ($first === '.');
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            if ($byte >= '0' && $byte <= '9') {
                $num .= $this->source->readByte();
            } elseif ($byte === '.' && !$isReal) {
                $isReal = true;
                $num .= $this->source->readByte();
            } else {
                break;
            }
        }
        return new Token(
            $isReal ? TokenType::Real : TokenType::Integer,
            $num,
            $offset,
        );
    }

    /**
     * Reads a bare word and classifies it.
     *
     * The word extends to the next delimiter, whitespace byte or EOF. Words
     * that are not one of the recognised keywords are returned as Unknown.
     *
     * @param string $first  Already-consumed first byte of the word.
     * @param int    $offset Offset of $first, recorded on the token.
     */
    private function readKeyword(string $first, int $offset): Token
    {
        $word = $first;
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            if ($byte === '' || $this->isDelimiterOrWhitespace($byte)) {
                break;
            }
            $word .= $this->source->readByte();
        }
        $type = match ($word) {
            'true', 'false' => TokenType::Boolean,
            'null' => TokenType::Null,
            'obj' => TokenType::ObjKeyword,
            'endobj' => TokenType::EndObjKeyword,
            'stream' => TokenType::StreamKeyword,
            'endstream' => TokenType::EndStreamKeyword,
            'R' => TokenType::RKeyword,
            'xref' => TokenType::XrefKeyword,
            'trailer' => TokenType::TrailerKeyword,
            'startxref' => TokenType::StartXrefKeyword,
            default => TokenType::Unknown,
        };
        return new Token($type, $word, $offset);
    }

    /**
     * Tells whether $byte is one of the six PDF whitespace bytes:
     * NUL, HT, LF, FF, CR or SP.
     */
    private function isWhitespace(string $byte): bool
    {
        return match ($byte) {
            "\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20" => true,
            default => false,
        };
    }

    /**
     * Tells whether $byte terminates a name or keyword, i.e. is PDF
     * whitespace or one of the delimiter characters `( ) < > [ ] { } / %`.
     */
    private function isDelimiterOrWhitespace(string $byte): bool
    {
        if ($this->isWhitespace($byte)) {
            return true;
        }
        return match ($byte) {
            '(', ')', '<', '>', '[', ']', '{', '}', '/', '%' => true,
            default => false,
        };
    }
}