Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
95.24% covered (success)
95.24%
160 / 168
73.68% covered (warning)
73.68%
14 / 19
CRAP
0.00% covered (danger)
0.00%
0 / 1
Tokenizer
95.24% covered (success)
95.24%
160 / 168
73.68% covered (warning)
73.68%
14 / 19
111
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getSource
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 nextToken
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 peek
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 seek
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 tell
66.67% covered (warning)
66.67%
2 / 3
0.00% covered (danger)
0.00%
0 / 1
2.15
 readToken
100.00% covered (success)
100.00%
19 / 19
100.00% covered (success)
100.00%
1 / 1
13
 skipWhitespaceAndComments
100.00% covered (success)
100.00%
16 / 16
100.00% covered (success)
100.00%
1 / 1
13
 readName
100.00% covered (success)
100.00%
12 / 12
100.00% covered (success)
100.00%
1 / 1
6
 readLiteralString
94.12% covered (success)
94.12%
16 / 17
0.00% covered (danger)
0.00%
0 / 1
8.01
 readEscapeSequence
80.00% covered (warning)
80.00%
12 / 15
0.00% covered (danger)
0.00%
0 / 1
14.35
 handleLineContinuation
66.67% covered (warning)
66.67%
2 / 3
0.00% covered (danger)
0.00%
0 / 1
2.15
 readOctalOrLiteral
77.78% covered (warning)
77.78%
7 / 9
0.00% covered (danger)
0.00%
0 / 1
6.40
 readAngleBracketToken
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 readHexString
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
12
 readDictEnd
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 readNumber
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
7
 readKeyword
100.00% covered (success)
100.00%
19 / 19
100.00% covered (success)
100.00%
1 / 1
15
 isDelimiterOrWhitespace
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
3
1<?php
2
3declare(strict_types=1);
4
5namespace Phpdftk\Pdf\Reader\Tokenizer;
6
7use Phpdftk\Pdf\Reader\Exception\InvalidPdfException;
8
/**
 * PDF tokenizer — converts a byte stream into a sequence of typed tokens.
 *
 * Handles PDF syntax per ISO 32000-2 §7.2–7.3: whitespace, comments,
 * names (with `#XX` escaping), literal strings (balanced parens,
 * backslash escapes, octal), hex strings, integers, reals, booleans,
 * null, delimiters (`[`, `]`, `<<`, `>>`), and keywords (`obj`,
 * `endobj`, `stream`, `endstream`, `R`, `xref`, `trailer`, `startxref`).
 *
 * Nothing in this class throws on malformed syntax: unterminated strings,
 * a lone `>`, invalid escapes and unrecognised words all produce a
 * best-effort token (the latter as TokenType::Unknown).
 */
final class Tokenizer
{
    /** Bytes accepted as hexadecimal digits in `#XX` name escapes and hex strings. */
    private const HEX_DIGITS = '0123456789ABCDEFabcdef';

    /** One-token lookahead buffer: filled by peek(), drained by nextToken(), cleared by seek(). */
    private ?Token $peeked = null;

    public function __construct(private readonly Source $source) {}

    /**
     * Returns the underlying byte source this tokenizer reads from.
     */
    public function getSource(): Source
    {
        return $this->source;
    }

    /**
     * Consumes and returns the next token.
     *
     * If a token was buffered by peek(), that token is returned (and the
     * buffer cleared) instead of reading from the source again.
     */
    public function nextToken(): Token
    {
        if ($this->peeked === null) {
            return $this->readToken();
        }

        $token = $this->peeked;
        $this->peeked = null;
        return $token;
    }

    /**
     * Returns the next token without consuming it.
     *
     * Repeated calls return the same buffered token until nextToken() or
     * seek() is called.
     */
    public function peek(): Token
    {
        return $this->peeked ??= $this->readToken();
    }

    /**
     * Repositions the underlying source and discards any buffered lookahead.
     */
    public function seek(int $offset): void
    {
        $this->peeked = null;
        $this->source->seek($offset);
    }

    /**
     * Returns the logical read position.
     *
     * While a lookahead token is buffered this is that token's offset (the
     * source itself has already advanced past it); otherwise it is the
     * source's current position.
     */
    public function tell(): int
    {
        return $this->peeked !== null
            ? $this->peeked->offset
            : $this->source->tell();
    }

    // -----------------------------------------------------------------------
    // Internal
    // -----------------------------------------------------------------------

    /**
     * Skips insignificant bytes, then reads one token and dispatches on its
     * first byte. Returns an Eof token when no byte is available.
     */
    private function readToken(): Token
    {
        $this->skipWhitespaceAndComments();

        if ($this->source->isEof()) {
            return new Token(TokenType::Eof, '', $this->source->tell());
        }

        $offset = $this->source->tell();
        $byte = $this->source->readByte();
        if ($byte === null) {
            return new Token(TokenType::Eof, '', $offset);
        }

        return match ($byte) {
            '/' => $this->readName($offset),
            '(' => $this->readLiteralString($offset),
            '<' => $this->readAngleBracketToken($offset),
            '>' => $this->readDictEnd($offset),
            '[' => new Token(TokenType::ArrayStart, '[', $offset),
            ']' => new Token(TokenType::ArrayEnd, ']', $offset),
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '-', '.'
                => $this->readNumber($byte, $offset),
            default => $this->readKeyword($byte, $offset),
        };
    }

    /**
     * Advances the source past any run of PDF whitespace and `%` comments,
     * stopping at the first significant byte or at end of input.
     */
    private function skipWhitespaceAndComments(): void
    {
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            if ($byte === '') {
                return;
            }

            if ($this->isWhitespace($byte)) {
                $this->source->readByte();
                continue;
            }

            if ($byte !== '%') {
                return;
            }

            // Comment: consume '%' and everything up to and including the
            // first LF or CR. A CR LF pair leaves the LF behind, which the
            // next iteration skips as ordinary whitespace.
            $this->source->readByte();
            while (!$this->source->isEof()) {
                $c = $this->source->readByte();
                if ($c === "\x0A" || $c === "\x0D") {
                    break;
                }
            }
        }
    }

    /**
     * Reads a name object; the leading `/` has already been consumed.
     *
     * The name ends at the first whitespace or delimiter byte (not consumed).
     * `#XX` escapes are decoded to the byte they denote.
     */
    private function readName(int $offset): Token
    {
        $name = '';
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            if ($byte === '' || $this->isDelimiterOrWhitespace($byte)) {
                break;
            }
            $this->source->readByte();
            $name .= $byte === '#' ? $this->readNameEscape() : $byte;
        }
        return new Token(TokenType::Name, $name, $offset);
    }

    /**
     * Decodes the two hex digits following a `#` inside a name.
     *
     * When the next two bytes are not both hex digits (or fewer than two
     * bytes remain) the escape is malformed: the source is rewound so that
     * nothing after the `#` is consumed, and the `#` is kept literally. This
     * avoids swallowing a delimiter or whitespace byte that terminates the
     * name, and avoids feeding non-hex input to hexdec().
     */
    private function readNameEscape(): string
    {
        $mark = $this->source->tell();
        $hex = $this->source->read(2);

        if (strlen($hex) === 2 && strspn($hex, self::HEX_DIGITS) === 2) {
            return chr((int) hexdec($hex));
        }

        $this->source->seek($mark);
        return '#';
    }

    /**
     * Reads a literal string; the opening `(` has already been consumed.
     *
     * Nested unescaped parentheses are kept in the value and must balance;
     * the closing parenthesis of the outermost level is not part of the
     * value. An unterminated string yields whatever was read before EOF.
     */
    private function readLiteralString(int $offset): Token
    {
        $result = '';
        $depth = 1;
        while ($depth > 0 && !$this->source->isEof()) {
            $byte = $this->source->readByte();
            if ($byte === null) {
                break;
            }

            if ($byte === '\\') {
                $result .= $this->readEscapeSequence();
                continue;
            }

            if ($byte === '(') {
                $depth++;
            } elseif ($byte === ')') {
                $depth--;
                if ($depth === 0) {
                    // Outermost closing paren terminates the string.
                    break;
                }
            }
            $result .= $byte;
        }
        return new Token(TokenType::LiteralString, $result, $offset);
    }

    /**
     * Decodes one backslash escape; the backslash has already been consumed.
     *
     * Returns the decoded byte(s), or an empty string for a line
     * continuation or when the input ends right after the backslash.
     */
    private function readEscapeSequence(): string
    {
        $next = $this->source->readByte();
        if ($next === null) {
            return '';
        }
        return match ($next) {
            'n' => "\n",
            'r' => "\r",
            't' => "\t",
            'b' => "\x08",
            'f' => "\x0C",
            '(' => '(',
            ')' => ')',
            '\\' => '\\',
            "\r" => $this->handleLineContinuation(),
            "\n" => '',  // line continuation
            default => $this->readOctalOrLiteral($next),
        };
    }

    /**
     * Completes a backslash-CR line continuation: if the CR is followed by
     * LF, the LF is consumed too so CR LF counts as one line ending.
     */
    private function handleLineContinuation(): string
    {
        if ($this->source->peek() === "\n") {
            $this->source->readByte();
        }
        return '';
    }

    /**
     * Decodes an octal escape of one to three digits whose first digit is
     * $firstChar, or returns $firstChar unchanged when it is not an octal
     * digit (unknown escape: the backslash is simply ignored).
     */
    private function readOctalOrLiteral(string $firstChar): string
    {
        if ($firstChar < '0' || $firstChar > '7') {
            return $firstChar;
        }

        $octal = $firstChar;
        for ($i = 0; $i < 2; $i++) {
            $next = $this->source->peek();
            if ($next < '0' || $next > '7') {
                break;
            }
            $octal .= $this->source->readByte();
        }

        // Three octal digits can encode up to 0o777 (511). Keep only the low
        // eight bits explicitly instead of relying on how chr() treats
        // out-of-range arguments.
        return chr(((int) octdec($octal)) & 0xFF);
    }

    /**
     * Disambiguates `<<` (dictionary start) from `<` (hex string start);
     * the first `<` has already been consumed.
     */
    private function readAngleBracketToken(int $offset): Token
    {
        if ($this->source->peek() !== '<') {
            return $this->readHexString($offset);
        }

        $this->source->readByte();
        return new Token(TokenType::DictStart, '<<', $offset);
    }

    /**
     * Reads a hex string up to and including the closing `>`; the opening
     * `<` has already been consumed.
     *
     * Whitespace between digits is ignored and an odd digit count is padded
     * with a trailing `0`. If any non-hex byte is present the token value is
     * the empty string.
     */
    private function readHexString(int $offset): Token
    {
        $digits = '';
        while (!$this->source->isEof()) {
            $byte = $this->source->readByte();
            if ($byte === null || $byte === '>') {
                break;
            }
            if (!$this->isWhitespace($byte)) {
                $digits .= $byte;
            }
        }

        if (strlen($digits) % 2 !== 0) {
            $digits .= '0';
        }

        // hex2bin() raises E_WARNING on non-hex input, so validate first and
        // fall back to an empty value without triggering the warning.
        if (strspn($digits, self::HEX_DIGITS) !== strlen($digits)) {
            return new Token(TokenType::HexString, '', $offset);
        }

        $decoded = hex2bin($digits);
        return new Token(TokenType::HexString, $decoded === false ? '' : $decoded, $offset);
    }

    /**
     * Reads a dictionary end; the first `>` has already been consumed.
     *
     * A second `>` is consumed when present. A lone `>` is tolerated and
     * still reported as `>>` (some malformed PDFs).
     */
    private function readDictEnd(int $offset): Token
    {
        if ($this->source->peek() === '>') {
            $this->source->readByte();
        }
        return new Token(TokenType::DictEnd, '>>', $offset);
    }

    /**
     * Reads an integer or real whose first byte ($first: digit, sign or `.`)
     * has already been consumed.
     *
     * Digits are appended greedily; the first `.` switches the token to
     * Real and a second `.` ends the number. No check is made that at least
     * one digit was read, so a lone sign or `.` is still emitted as a
     * number token.
     */
    private function readNumber(string $first, int $offset): Token
    {
        $num = $first;
        $isReal = ($first === '.');
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            $isDigit = $byte >= '0' && $byte <= '9';
            if (!$isDigit && ($byte !== '.' || $isReal)) {
                break;
            }
            if (!$isDigit) {
                $isReal = true;
            }
            $num .= $this->source->readByte();
        }
        return new Token(
            $isReal ? TokenType::Real : TokenType::Integer,
            $num,
            $offset,
        );
    }

    /**
     * Reads a bare word starting with $first (already consumed) up to the
     * next whitespace or delimiter, and classifies it as a boolean, null,
     * one of the structural keywords, or Unknown.
     */
    private function readKeyword(string $first, int $offset): Token
    {
        $word = $first;
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            if ($byte === '' || $this->isDelimiterOrWhitespace($byte)) {
                break;
            }
            $word .= $this->source->readByte();
        }
        $type = match ($word) {
            'true', 'false' => TokenType::Boolean,
            'null'          => TokenType::Null,
            'obj'           => TokenType::ObjKeyword,
            'endobj'        => TokenType::EndObjKeyword,
            'stream'        => TokenType::StreamKeyword,
            'endstream'     => TokenType::EndStreamKeyword,
            'R'             => TokenType::RKeyword,
            'xref'          => TokenType::XrefKeyword,
            'trailer'       => TokenType::TrailerKeyword,
            'startxref'     => TokenType::StartXrefKeyword,
            default         => TokenType::Unknown,
        };
        return new Token($type, $word, $offset);
    }

    /**
     * True when $byte ends a name or keyword: PDF whitespace or one of the
     * delimiter characters `( ) < > [ ] { } / %`.
     */
    private function isDelimiterOrWhitespace(string $byte): bool
    {
        if ($this->isWhitespace($byte)) {
            return true;
        }
        return match ($byte) {
            '(', ')', '<', '>', '[', ']', '{', '}', '/', '%' => true,
            default => false,
        };
    }

    /**
     * True for the six PDF whitespace bytes: NUL, HT, LF, FF, CR, SP.
     */
    private function isWhitespace(string $byte): bool
    {
        return match ($byte) {
            "\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20" => true,
            default => false,
        };
    }
}