Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
95.24% |
160 / 168 |
|
73.68% |
14 / 19 |
CRAP | |
0.00% |
0 / 1 |
| Tokenizer | |
95.24% |
160 / 168 |
|
73.68% |
14 / 19 |
111 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getSource | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| nextToken | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
| peek | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
| seek | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| tell | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
| readToken | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
13 | |||
| skipWhitespaceAndComments | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
13 | |||
| readName | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
6 | |||
| readLiteralString | |
94.12% |
16 / 17 |
|
0.00% |
0 / 1 |
8.01 | |||
| readEscapeSequence | |
80.00% |
12 / 15 |
|
0.00% |
0 / 1 |
14.35 | |||
| handleLineContinuation | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
| readOctalOrLiteral | |
77.78% |
7 / 9 |
|
0.00% |
0 / 1 |
6.40 | |||
| readAngleBracketToken | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
| readHexString | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
12 | |||
| readDictEnd | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
| readNumber | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
7 | |||
| readKeyword | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
15 | |||
| isDelimiterOrWhitespace | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\Pdf\Reader\Tokenizer; |
| 6 | |
| 7 | use Phpdftk\Pdf\Reader\Exception\InvalidPdfException; |
| 8 | |
| 9 | /** |
| 10 | * PDF tokenizer — converts a byte stream into a sequence of typed tokens. |
| 11 | * |
| 12 | * Handles all PDF syntax per ISO 32000-2 §7.2–7.3: whitespace, comments, |
| 13 | * names (with `#XX` escaping), literal strings (balanced parens, |
| 14 | * backslash escapes, octal), hex strings, integers, reals, booleans, |
| 15 | * null, delimiters (`[`, `]`, `<<`, `>>`), and keywords (`obj`, |
| 16 | * `endobj`, `stream`, `endstream`, `R`, `xref`, `trailer`, `startxref`). |
| 17 | */ |
| 18 | final class Tokenizer |
| 19 | { |
/** Look-ahead token buffered by peek() and not yet handed out; null when nothing is buffered. */
private ?Token $peeked = null;

/**
 * @param Source $source Byte source that tokens are read from.
 */
public function __construct(
    private readonly Source $source,
) {}
| 23 | |
/**
 * Exposes the underlying byte source this tokenizer was constructed with.
 */
public function getSource(): Source
{
    return $this->source;
}
| 28 | |
/**
 * Consumes and returns the next token.
 *
 * A token previously buffered by peek() is returned first (and the buffer
 * is cleared); otherwise a fresh token is read from the source.
 */
public function nextToken(): Token
{
    $token = $this->peeked ?? $this->readToken();
    $this->peeked = null;

    return $token;
}
| 38 | |
/**
 * Returns the next token without consuming it.
 *
 * The token is read once and buffered; repeated calls return the same
 * instance until nextToken() or seek() clears the buffer.
 */
public function peek(): Token
{
    return $this->peeked ??= $this->readToken();
}
| 46 | |
/**
 * Repositions the underlying source at the given offset.
 *
 * Any buffered look-ahead token is discarded first, since it was read from
 * the previous position.
 *
 * @param int $offset Offset passed through to the source's seek().
 */
public function seek(int $offset): void
{
    $this->peeked = null;

    $this->source->seek($offset);
}
| 52 | |
/**
 * Reports the logical read position.
 *
 * While a look-ahead token is buffered, this is that token's offset (the
 * source itself has already advanced past it); otherwise it is the
 * source's current position.
 */
public function tell(): int
{
    return $this->peeked === null
        ? $this->source->tell()
        : $this->peeked->offset;
}
| 60 | |
| 61 | // ----------------------------------------------------------------------- |
| 62 | // Internal |
| 63 | // ----------------------------------------------------------------------- |
| 64 | |
/**
 * Reads one token from the source, skipping leading whitespace and comments.
 *
 * The first significant byte selects the reader: names, literal strings,
 * angle-bracket tokens, array brackets and numbers have dedicated arms;
 * every other byte is handed to readKeyword().
 *
 * @return Token An Eof token (empty value) when no byte is left to read.
 */
private function readToken(): Token
{
    $this->skipWhitespaceAndComments();

    $start = $this->source->tell();
    $lead = $this->source->isEof() ? null : $this->source->readByte();
    if ($lead === null) {
        return new Token(TokenType::Eof, '', $start);
    }

    return match ($lead) {
        '[' => new Token(TokenType::ArrayStart, '[', $start),
        ']' => new Token(TokenType::ArrayEnd, ']', $start),
        '/' => $this->readName($start),
        '(' => $this->readLiteralString($start),
        '<' => $this->readAngleBracketToken($start),
        '>' => $this->readDictEnd($start),
        // Digits, a sign, or a leading decimal point all start a number.
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '-', '.'
            => $this->readNumber($lead, $start),
        default => $this->readKeyword($lead, $start),
    };
}
| 93 | |
/**
 * Advances the source past any run of whitespace bytes and `%` comments.
 *
 * Stops (without consuming) at the first byte that is neither, or when the
 * source is exhausted.
 */
private function skipWhitespaceAndComments(): void
{
    // PDF whitespace: NUL, HT, LF, FF, CR, SP
    $whitespace = ["\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20"];

    while (!$this->source->isEof()) {
        $current = $this->source->peek();

        if (in_array($current, $whitespace, true)) {
            $this->source->readByte();
            continue;
        }

        // Anything other than a comment start (including an empty peek)
        // ends the skip.
        if ($current !== '%') {
            return;
        }

        // Comment: consume the '%' and everything up to and including the
        // first LF or CR.
        $this->source->readByte();
        while (!$this->source->isEof()) {
            $consumed = $this->source->readByte();
            if ($consumed === "\x0A" || $consumed === "\x0D") {
                break;
            }
        }
    }
}
| 124 | |
/**
 * Reads a name object; the leading '/' has already been consumed.
 *
 * Bytes are collected until a delimiter, whitespace, or end of input.
 * A `#` followed by exactly two hexadecimal digits is decoded to the byte
 * it encodes. A `#` that is NOT followed by two hex digits is kept as a
 * literal `#` and nothing after it is consumed, so a malformed escape can
 * no longer swallow the delimiter or whitespace that terminates the name.
 *
 * @param int $offset Offset of the token start, stored on the returned token.
 *
 * @return Token A Name token whose value is the decoded name (without '/').
 */
private function readName(int $offset): Token
{
    $name = '';
    while (!$this->source->isEof()) {
        $byte = $this->source->peek();
        if ($byte === '' || $this->isDelimiterOrWhitespace($byte)) {
            break;
        }
        $this->source->readByte();

        if ($byte !== '#') {
            $name .= $byte;
            continue;
        }

        // Remember where the two escape digits should start so that a
        // malformed escape can be rewound.
        $escapeStart = $this->source->tell();
        $hex = $this->source->read(2);
        if (strlen($hex) === 2 && strspn($hex, '0123456789abcdefABCDEF') === 2) {
            $name .= chr((int) hexdec($hex));
            continue;
        }

        // Not a valid #XX escape: keep the '#' verbatim and re-read the
        // following bytes through the normal loop.
        $this->source->seek($escapeStart);
        $name .= '#';
    }

    return new Token(TokenType::Name, $name, $offset);
}
| 146 | |
/**
 * Reads a literal string; the opening '(' has already been consumed.
 *
 * Nested, balanced parentheses are kept in the value; the closing
 * parenthesis that balances the opening one ends the string and is not
 * included. Backslash sequences are decoded via readEscapeSequence().
 * If the input ends first, whatever was collected so far is returned.
 *
 * @param int $offset Offset of the token start, stored on the returned token.
 */
private function readLiteralString(int $offset): Token
{
    $buffer = '';
    $nesting = 1;

    while ($nesting > 0 && !$this->source->isEof()) {
        $char = $this->source->readByte();
        if ($char === null) {
            break;
        }

        if ($char === '\\') {
            $buffer .= $this->readEscapeSequence();
            continue;
        }

        if ($char === '(') {
            $nesting++;
        } elseif ($char === ')' && --$nesting === 0) {
            // The outermost ')' terminates the string and is dropped.
            break;
        }

        $buffer .= $char;
    }

    return new Token(TokenType::LiteralString, $buffer, $offset);
}
| 173 | |
/**
 * Decodes one backslash escape inside a literal string; the backslash
 * itself has already been consumed.
 *
 * @return string The decoded byte(s); empty for a line continuation or
 *                when the input ends right after the backslash.
 */
private function readEscapeSequence(): string
{
    $code = $this->source->readByte();
    if ($code === null) {
        return '';
    }

    $simpleEscapes = [
        'n' => "\n",
        'r' => "\r",
        't' => "\t",
        'b' => "\x08",
        'f' => "\x0C",
        '(' => '(',
        ')' => ')',
        '\\' => '\\',
    ];
    $literal = $simpleEscapes[$code] ?? null;
    if ($literal !== null) {
        return $literal;
    }

    // Backslash followed by an end-of-line is a line continuation.
    if ($code === "\n") {
        return '';
    }
    if ($code === "\r") {
        return $this->handleLineContinuation();
    }

    return $this->readOctalOrLiteral($code);
}
| 194 | |
/**
 * Finishes a backslash-CR line continuation.
 *
 * If the CR is immediately followed by LF, that LF is consumed as well so
 * that CR LF counts as a single line continuation.
 *
 * @return string Always the empty string: a continuation contributes no bytes.
 */
private function handleLineContinuation(): string
{
    $followedByLf = $this->source->peek() === "\n";
    if ($followedByLf) {
        $this->source->readByte();
    }

    return '';
}
| 203 | |
/**
 * Decodes an octal escape (`\d`, `\dd` or `\ddd`) or passes an unknown
 * escape character through unchanged.
 *
 * @param string $firstChar The byte that followed the backslash.
 *
 * @return string For an octal digit: the single byte whose value is the
 *                low 8 bits of the 1–3 digit octal number. Otherwise
 *                $firstChar itself (the backslash is simply dropped).
 */
private function readOctalOrLiteral(string $firstChar): string
{
    if ($firstChar >= '0' && $firstChar <= '7') {
        $octal = $firstChar;
        // Up to two more octal digits may follow the first one.
        for ($i = 0; $i < 2; $i++) {
            $next = $this->source->peek();
            if ($next >= '0' && $next <= '7') {
                $octal .= $this->source->readByte();
            } else {
                break;
            }
        }

        // Three octal digits can encode up to 0777 (511). Keep only the low
        // byte explicitly instead of relying on chr() wrapping out-of-range
        // values, which newer PHP versions deprecate.
        return chr(((int) octdec($octal)) & 0xFF);
    }

    // Unknown escape: the spec says the backslash is ignored
    return $firstChar;
}
| 221 | |
/**
 * Disambiguates a token starting with '<' (already consumed).
 *
 * A second '<' makes it a dictionary start; anything else means the '<'
 * opened a hex string.
 *
 * @param int $offset Offset of the token start, stored on the returned token.
 */
private function readAngleBracketToken(int $offset): Token
{
    if ($this->source->peek() !== '<') {
        return $this->readHexString($offset);
    }

    $this->source->readByte();

    return new Token(TokenType::DictStart, '<<', $offset);
}
| 231 | |
/**
 * Reads a hex string; the opening '<' has already been consumed.
 *
 * Bytes are collected up to the closing '>' (or end of input), ignoring
 * whitespace. An odd number of digits is padded with a trailing '0'.
 * If the collected text contains anything other than hexadecimal digits,
 * the token value is the empty string; the text is validated up front so
 * that hex2bin() is never called on invalid input and cannot emit a warning.
 *
 * @param int $offset Offset of the token start, stored on the returned token.
 *
 * @return Token A HexString token holding the decoded bytes.
 */
private function readHexString(int $offset): Token
{
    $hex = '';
    while (!$this->source->isEof()) {
        $byte = $this->source->readByte();
        if ($byte === null || $byte === '>') {
            break;
        }
        // Skip whitespace inside hex strings
        if ($byte === "\x00" || $byte === "\x09" || $byte === "\x0A"
            || $byte === "\x0C" || $byte === "\x0D" || $byte === "\x20") {
            continue;
        }
        $hex .= $byte;
    }

    // Odd length: append trailing 0
    if (strlen($hex) % 2 !== 0) {
        $hex .= '0';
    }

    // hex2bin() raises E_WARNING on non-hex input; reject such input here.
    $isValidHex = strspn($hex, '0123456789abcdefABCDEF') === strlen($hex);
    $decoded = $isValidHex ? hex2bin($hex) : false;

    return new Token(TokenType::HexString, $decoded === false ? '' : $decoded, $offset);
}
| 254 | |
/**
 * Produces a dictionary-end token for a '>' that has already been consumed.
 *
 * A following '>' is consumed as the second half of `>>`. A lone '>' is
 * tolerated and still reported as a dictionary end (some malformed PDFs).
 *
 * @param int $offset Offset of the token start, stored on the returned token.
 */
private function readDictEnd(int $offset): Token
{
    if ($this->source->peek() === '>') {
        $this->source->readByte();
    }

    return new Token(TokenType::DictEnd, '>>', $offset);
}
| 265 | |
/**
 * Reads the remainder of a numeric token.
 *
 * Digits are appended greedily; a single decimal point is accepted and
 * turns the token into a Real. A second '.' or any other byte ends the
 * number without being consumed.
 *
 * @param string $first  The already-consumed first byte (digit, sign, or '.').
 * @param int    $offset Offset of the token start, stored on the returned token.
 *
 * @return Token A Real token if a decimal point was seen, otherwise Integer.
 */
private function readNumber(string $first, int $offset): Token
{
    $text = $first;
    $sawPoint = $first === '.';

    while (!$this->source->isEof()) {
        $ahead = $this->source->peek();
        $isDigit = $ahead >= '0' && $ahead <= '9';

        // Stop on anything that is neither a digit nor the first '.'.
        if (!$isDigit && ($ahead !== '.' || $sawPoint)) {
            break;
        }
        if (!$isDigit) {
            $sawPoint = true;
        }

        $text .= $this->source->readByte();
    }

    $type = $sawPoint ? TokenType::Real : TokenType::Integer;

    return new Token($type, $text, $offset);
}
| 287 | |
/**
 * Reads a bare word and classifies it.
 *
 * Bytes are appended until a delimiter, whitespace, or end of input. The
 * resulting word is mapped to its token type; words that are not in the
 * table are returned as Unknown.
 *
 * @param string $first  The already-consumed first byte of the word.
 * @param int    $offset Offset of the token start, stored on the returned token.
 */
private function readKeyword(string $first, int $offset): Token
{
    $word = $first;
    while (!$this->source->isEof()) {
        $ahead = $this->source->peek();
        if ($ahead === '' || $this->isDelimiterOrWhitespace($ahead)) {
            break;
        }
        $word .= $this->source->readByte();
    }

    $knownWords = [
        'true' => TokenType::Boolean,
        'false' => TokenType::Boolean,
        'null' => TokenType::Null,
        'obj' => TokenType::ObjKeyword,
        'endobj' => TokenType::EndObjKeyword,
        'stream' => TokenType::StreamKeyword,
        'endstream' => TokenType::EndStreamKeyword,
        'R' => TokenType::RKeyword,
        'xref' => TokenType::XrefKeyword,
        'trailer' => TokenType::TrailerKeyword,
        'startxref' => TokenType::StartXrefKeyword,
    ];

    return new Token($knownWords[$word] ?? TokenType::Unknown, $word, $offset);
}
| 313 | |
/**
 * Tells whether a single byte terminates a name or keyword.
 *
 * True for the whitespace bytes NUL, HT, LF, FF, CR, SP and for the
 * delimiters ( ) < > [ ] { } / %. Any string that is not exactly one byte
 * long is never a match.
 */
private function isDelimiterOrWhitespace(string $byte): bool
{
    // Whitespace bytes first, then the delimiter characters.
    $terminators = "\x00\x09\x0A\x0C\x0D\x20" . '()<>[]{}/%';

    return strlen($byte) === 1 && str_contains($terminators, $byte);
}
| 325 | } |