Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
93.98% |
125 / 133 |
|
66.67% |
8 / 12 |
CRAP | |
0.00% |
0 / 1 |
| ObjectParser | |
93.98% |
125 / 133 |
|
66.67% |
8 / 12 |
59.76 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| parseValue | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| parseIndirectObject | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
6 | |||
| parseTokenValue | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
12 | |||
| parseIntegerOrReference | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
3 | |||
| parseDictionaryOrStream | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
| parseDictionary | |
93.33% |
14 / 15 |
|
0.00% |
0 / 1 |
6.01 | |||
| parseArray | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
4.02 | |||
| parseStream | |
87.50% |
14 / 16 |
|
0.00% |
0 / 1 |
6.07 | |||
| scanForEndstream | |
86.67% |
26 / 30 |
|
0.00% |
0 / 1 |
10.24 | |||
| skipStreamTrailer | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
6 | |||
| expect | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\Pdf\Reader\Parser; |
| 6 | |
| 7 | use Phpdftk\Pdf\Core\PdfArray; |
| 8 | use Phpdftk\Pdf\Core\PdfBoolean; |
| 9 | use Phpdftk\Pdf\Core\PdfDictionary; |
| 10 | use Phpdftk\Pdf\Core\PdfName; |
| 11 | use Phpdftk\Pdf\Core\PdfNull; |
| 12 | use Phpdftk\Pdf\Core\PdfNumber; |
| 13 | use Phpdftk\Pdf\Core\PdfReference; |
| 14 | use Phpdftk\Pdf\Core\PdfStream; |
| 15 | use Phpdftk\Pdf\Core\PdfString; |
| 16 | use Phpdftk\Pdf\Core\Serializable; |
| 17 | use Phpdftk\Pdf\Reader\Exception\InvalidPdfException; |
| 18 | use Phpdftk\Pdf\Reader\Tokenizer\Source; |
| 19 | use Phpdftk\Pdf\Reader\Tokenizer\Token; |
| 20 | use Phpdftk\Pdf\Reader\Tokenizer\Tokenizer; |
| 21 | use Phpdftk\Pdf\Reader\Tokenizer\TokenType; |
| 22 | |
/**
 * Recursive-descent PDF object parser.
 *
 * Consumes tokens from a {@see Tokenizer} and builds the core
 * `PdfDictionary`, `PdfArray`, `PdfName`, `PdfString`, `PdfNumber`,
 * `PdfBoolean`, `PdfNull`, `PdfReference`, and `PdfStream` instances.
 *
 * The parser is deliberately tolerant: unknown tokens, unclosed containers
 * at EOF and a missing `endobj` are skipped over rather than rejected.
 * Only a token that cannot start a value raises {@see InvalidPdfException}.
 */
final class ObjectParser
{
    /** Maximum number of stray tokens skipped while looking for `endobj`. */
    private const MAX_ENDOBJ_SKIP = 5;

    /** Upper bound (in bytes) for the `endstream` fallback scan: 64 MB. */
    private const MAX_ENDSTREAM_SCAN = 64 * 1024 * 1024;

    /**
     * @param Tokenizer $tokenizer Token supplier for structural parsing.
     * @param Source    $source    Raw byte access used for stream payloads.
     *                             NOTE(review): presumably the same source the
     *                             tokenizer reads from — confirm at the call site.
     */
    public function __construct(
        private readonly Tokenizer $tokenizer,
        private readonly Source $source,
    ) {}

    /**
     * Parse any PDF value starting at the tokenizer's current position.
     *
     * @throws InvalidPdfException When the next meaningful token cannot start a value.
     */
    public function parseValue(): Serializable
    {
        return $this->parseTokenValue($this->tokenizer->nextToken());
    }

    /**
     * Parse a complete indirect object: `X Y obj <value> endobj`.
     *
     * A missing `endobj` is tolerated: up to MAX_ENDOBJ_SKIP further tokens
     * are consumed while looking for it, after which parsing simply returns
     * the value that was already read.
     *
     * @return array{int, int, Serializable} [objNum, genNum, value]
     *
     * @throws InvalidPdfException When the `X Y obj` header is malformed or the
     *                             value cannot be parsed.
     */
    public function parseIndirectObject(): array
    {
        $objNumToken = $this->tokenizer->nextToken();
        $this->expect($objNumToken, TokenType::Integer, 'object number');

        $genNumToken = $this->tokenizer->nextToken();
        $this->expect($genNumToken, TokenType::Integer, 'generation number');

        $objToken = $this->tokenizer->nextToken();
        $this->expect($objToken, TokenType::ObjKeyword, 'obj keyword');

        // A dictionary followed by `stream` has already been turned into a
        // PdfStream by parseValue(), so `endobj` is expected next either way.
        $value = $this->parseValue();

        $end = $this->tokenizer->nextToken();
        if ($end->type !== TokenType::EndObjKeyword && $end->type !== TokenType::Eof) {
            // Tolerant: some generators put extra data between the value and
            // endobj. Skip a bounded number of tokens looking for it. If it
            // is never found we continue anyway — the value is already parsed.
            for ($skipped = 0; $skipped < self::MAX_ENDOBJ_SKIP; $skipped++) {
                $retry = $this->tokenizer->nextToken();
                if ($retry->type === TokenType::EndObjKeyword || $retry->type === TokenType::Eof) {
                    break;
                }
            }
        }

        return [(int) $objNumToken->value, (int) $genNumToken->value, $value];
    }

    // -----------------------------------------------------------------------
    // Internal
    // -----------------------------------------------------------------------

    /**
     * Build the value that starts with the given, already consumed, token.
     *
     * Unknown keywords are skipped: the next token is tried instead. This is
     * done iteratively so a long run of garbage does not deepen the call stack.
     *
     * @throws InvalidPdfException When the token cannot start a value.
     */
    private function parseTokenValue(Token $token): Serializable
    {
        while ($token->type === TokenType::Unknown) {
            $token = $this->tokenizer->nextToken();
        }

        return match ($token->type) {
            TokenType::DictStart => $this->parseDictionaryOrStream(),
            TokenType::ArrayStart => $this->parseArray(),
            TokenType::Name => new PdfName($token->value),
            TokenType::LiteralString => new PdfString($token->value),
            TokenType::HexString => new PdfString($token->value, hex: true),
            TokenType::Integer => $this->parseIntegerOrReference($token),
            TokenType::Real => new PdfNumber((float) $token->value),
            TokenType::Boolean => new PdfBoolean($token->value === 'true'),
            TokenType::Null => new PdfNull(),
            default => throw new InvalidPdfException(
                "Unexpected token {$token->type->name} ('{$token->value}') at offset {$token->offset}",
            ),
        };
    }

    /**
     * After reading an integer, look ahead for `<int> R` (indirect
     * reference) or just return the integer.
     *
     * When the look-ahead does not complete a reference, the tokenizer is
     * rewound to the position recorded right after the first integer.
     */
    private function parseIntegerOrReference(Token $intToken): Serializable
    {
        $savedPos = $this->tokenizer->tell();
        $genToken = $this->tokenizer->peek();

        if ($genToken->type === TokenType::Integer) {
            $this->tokenizer->nextToken(); // consume the generation number

            if ($this->tokenizer->peek()->type === TokenType::RKeyword) {
                $this->tokenizer->nextToken(); // consume R
                return new PdfReference((int) $intToken->value, (int) $genToken->value);
            }

            // Two integers in a row but no `R`: not a reference, rewind.
            $this->tokenizer->seek($savedPos);
        }

        return new PdfNumber((int) $intToken->value);
    }

    /**
     * Parse a dictionary (the `<<` has been consumed) and, when it is
     * immediately followed by the `stream` keyword, the stream payload too.
     */
    private function parseDictionaryOrStream(): Serializable
    {
        $dict = $this->parseDictionary();

        if ($this->tokenizer->peek()->type !== TokenType::StreamKeyword) {
            return $dict;
        }

        $this->tokenizer->nextToken(); // consume 'stream'
        return $this->parseStream($dict);
    }

    /**
     * Read `/Key value` pairs until `>>`.
     *
     * An unclosed dictionary at EOF is tolerated, and any token that is not
     * a name where a key is expected (including unknown keywords) is skipped.
     *
     * @throws InvalidPdfException When an entry's value cannot be parsed.
     */
    private function parseDictionary(): PdfDictionary
    {
        $dict = new PdfDictionary();

        while (true) {
            $token = $this->tokenizer->nextToken();

            if ($token->type === TokenType::DictEnd || $token->type === TokenType::Eof) {
                break;
            }
            if ($token->type !== TokenType::Name) {
                // Not a key: skip it and try to resynchronise on the next token.
                continue;
            }

            $dict->set($token->value, $this->parseValue());
        }

        return $dict;
    }

    /**
     * Read array items until `]` (the `[` has been consumed).
     *
     * An unclosed array at EOF is tolerated. Unknown keywords are skipped
     * here rather than handed to parseTokenValue(), which would consume the
     * following token — possibly the closing `]` — and fail on it.
     *
     * @throws InvalidPdfException When an item cannot be parsed.
     */
    private function parseArray(): PdfArray
    {
        $items = [];

        while (true) {
            $token = $this->tokenizer->nextToken();

            if ($token->type === TokenType::ArrayEnd || $token->type === TokenType::Eof) {
                break;
            }
            if ($token->type === TokenType::Unknown) {
                continue;
            }

            $items[] = $this->parseTokenValue($token);
        }

        return new PdfArray($items);
    }

    /**
     * Read stream data after the `stream` keyword has been consumed.
     *
     * The `stream` keyword must be followed by a single EOL (LF or CR+LF);
     * a lone CR is accepted as well. If no EOL is present nothing is
     * consumed, so the first payload byte is preserved.
     *
     * The data length comes from `/Length` in the dictionary. When that is
     * not a direct number (e.g. an indirect reference, which cannot be
     * resolved here), the length is found by scanning for `endstream`.
     */
    private function parseStream(PdfDictionary $dict): PdfStream
    {
        // Skip the EOL after 'stream'. Peek first: a non-EOL byte already
        // belongs to the payload and must not be swallowed.
        $first = $this->source->peek();
        if ($first === "\r") {
            $this->source->readByte();
            if ($this->source->peek() === "\n") {
                $this->source->readByte();
            }
        } elseif ($first === "\n") {
            $this->source->readByte();
        }

        $length = $dict->get('Length');
        if ($length instanceof PdfNumber) {
            $streamLength = (int) $length->toPdf();
        } elseif (is_int($length)) {
            $streamLength = $length;
        } else {
            $streamLength = $this->scanForEndstream();
        }

        // A negative /Length is treated as an empty payload.
        $data = $streamLength >= 0 ? $this->source->read($streamLength) : '';

        // Consume the trailing EOL + 'endstream' keyword (EOL may be missing).
        $this->skipStreamTrailer();

        return new PdfStream($dict, $data);
    }

    /**
     * Fallback: scan forward for `endstream` to determine stream length.
     *
     * The scan is limited to MAX_ENDSTREAM_SCAN bytes and keeps only a small
     * sliding window, so the candidate payload is never buffered here. The
     * source is always rewound to where the scan started.
     *
     * A match only counts when `endstream` is preceded by CR, LF or a space,
     * or sits at the very start of the data. Exactly one EOL marker (CR+LF,
     * LF or CR) directly before the keyword is excluded from the length;
     * any further trailing CR/LF bytes are kept as payload.
     *
     * @return int Payload length in bytes, or 0 when no `endstream` was found.
     */
    private function scanForEndstream(): int
    {
        $start = $this->source->tell();
        $marker = 'endstream';
        $markerLen = strlen($marker);
        // Marker plus the two bytes before it (enough to recognise CR+LF).
        $windowLen = $markerLen + 2;

        $scanned = 0;
        $window = '';

        while (!$this->source->isEof() && $scanned < self::MAX_ENDSTREAM_SCAN) {
            $byte = $this->source->readByte();
            if ($byte === null) {
                break;
            }
            $scanned++;

            $window .= $byte;
            if (strlen($window) > $windowLen) {
                $window = substr($window, -$windowLen);
            }

            if (!str_ends_with($window, $marker)) {
                continue;
            }

            // Up to two bytes of payload immediately preceding the marker.
            $before = substr($window, 0, -$markerLen);
            $previous = $before === '' ? null : $before[strlen($before) - 1];

            if ($previous !== null && $previous !== "\n" && $previous !== "\r" && $previous !== ' ') {
                // False match inside binary data — keep scanning.
                continue;
            }

            $length = $this->source->tell() - $markerLen - $start;

            // Drop the single EOL that separates the data from 'endstream'.
            if (str_ends_with($before, "\r\n")) {
                $length -= 2;
            } elseif ($previous === "\n" || $previous === "\r") {
                $length -= 1;
            }

            $this->source->seek($start);
            return $length;
        }

        $this->source->seek($start);
        return 0;
    }

    /**
     * Skip CR, LF and space bytes after the stream data, then consume the
     * `endstream` keyword if it is the next token. A missing keyword is
     * tolerated and leaves the token unconsumed.
     */
    private function skipStreamTrailer(): void
    {
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            if ($byte !== "\r" && $byte !== "\n" && $byte !== ' ') {
                break;
            }
            $this->source->readByte();
        }

        if ($this->tokenizer->peek()->type === TokenType::EndStreamKeyword) {
            $this->tokenizer->nextToken();
        }
    }

    /**
     * Assert that a token has the expected type.
     *
     * @param string $context Human-readable description used in the error message.
     *
     * @throws InvalidPdfException When the token type differs from $expected.
     */
    private function expect(Token $token, TokenType $expected, string $context): void
    {
        if ($token->type === $expected) {
            return;
        }

        throw new InvalidPdfException(
            "Expected $context ({$expected->name}) at offset {$token->offset}, "
            . "got {$token->type->name} ('{$token->value}')",
        );
    }
}