Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
90.77% |
59 / 65 |
|
80.00% |
4 / 5 |
CRAP | |
0.00% |
0 / 1 |
| XrefParser | |
90.77% |
59 / 65 |
|
80.00% |
4 / 5 |
38.08 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| parseClassicXref | |
85.00% |
34 / 40 |
|
0.00% |
0 / 1 |
11.41 | |||
| readLine | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
| skipWhitespace | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
9 | |||
| readWord | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
12 | |||
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\Pdf\Reader\Parser; |
| 6 | |
| 7 | use Phpdftk\Pdf\Core\PdfDictionary; |
| 8 | use Phpdftk\Pdf\Reader\Exception\InvalidPdfException; |
| 9 | use Phpdftk\Pdf\Reader\Tokenizer\Source; |
| 10 | use Phpdftk\Pdf\Reader\Tokenizer\Tokenizer; |
| 11 | use Phpdftk\Pdf\Reader\XrefEntry; |
| 12 | |
/**
 * Parses a classic cross-reference table and its trailer dictionary.
 *
 * Uses raw byte reads for the fixed-format xref section (to avoid
 * interleaving tokenized and raw reads on the same Source), then hands
 * off to the ObjectParser for the trailer dictionary only.
 */
final class XrefParser
{
    /** Bytes treated as whitespace by the raw readers: NUL, HT, LF, FF, CR, SP. */
    private const WHITESPACE_BYTES = ["\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20"];

    /** Delimiter bytes that terminate a word without being consumed. */
    private const WORD_DELIMITERS = ['<', '/', '['];

    /**
     * Read window for one entry line. The spec says exactly 20 bytes, but
     * some producers write 21 (extra space before CRLF), so be tolerant.
     */
    private const MAX_ENTRY_LINE_BYTES = 24;

    public function __construct(
        private readonly Tokenizer $tokenizer,
        private readonly Source $source,
        private readonly ObjectParser $objectParser,
    ) {}

    /**
     * Parse a classic xref table at the given byte offset.
     *
     * In strict mode any malformed subsection header or entry aborts parsing
     * with an exception. In lenient mode the problem is appended to $warnings
     * and parsing continues; malformed entries are skipped.
     *
     * @param int $offset Byte offset at which the "xref" keyword is expected
     *                    (leading whitespace is skipped).
     * @param bool $strict Whether malformed data throws instead of warning.
     * @param list<string> $warnings Receives human-readable warnings (lenient mode).
     * @return array{0: array<int, XrefEntry>, 1: PdfDictionary} Entries keyed by
     *         object number, and the trailer dictionary.
     *
     * @throws InvalidPdfException If the "xref" keyword is missing, the table ends
     *         before "trailer", the trailer is not a dictionary, or (strict mode)
     *         a subsection header or entry is malformed.
     */
    public function parseClassicXref(int $offset, bool $strict = true, array &$warnings = []): array
    {
        $this->source->seek($offset);

        // Read and verify "xref" keyword
        $this->skipWhitespace();
        $keyword = $this->readWord();
        if ($keyword !== 'xref') {
            throw new InvalidPdfException(
                "Expected 'xref' at offset $offset, got '$keyword'",
            );
        }

        $entries = [];

        // Parse subsections until we hit "trailer"
        while (true) {
            $this->skipWhitespace();
            $word = $this->readWord();
            if ($word === 'trailer') {
                break;
            }
            if ($word === '' || $this->source->isEof()) {
                throw new InvalidPdfException(
                    "Unexpected end of xref table at offset " . $this->source->tell() . ": expected 'trailer'",
                );
            }

            $this->skipWhitespace();
            $countWord = $this->readWord();

            // A subsection header is "<first object number> <entry count>", both
            // plain decimal integers. Do not let an (int) cast silently turn
            // garbage into 0 (or "12abc" into 12).
            if (preg_match('/^\d+$/D', $word) !== 1 || preg_match('/^\d+$/D', $countWord) !== 1) {
                $message = "Invalid xref subsection header '$word $countWord' at offset " . $this->source->tell();
                if ($strict) {
                    throw new InvalidPdfException($message);
                }
                $warnings[] = $message;
            }

            $firstObj = (int) $word;
            $count = (int) $countWord;

            $this->skipWhitespace();

            for ($i = 0; $i < $count; $i++) {
                $line = $this->readLine(self::MAX_ENTRY_LINE_BYTES);
                if (!preg_match('/^(\d{10})\s+(\d{5})\s+([nf])/', $line, $em)) {
                    if ($strict) {
                        throw new InvalidPdfException(
                            "Malformed xref entry at offset " . $this->source->tell() . ": '$line'",
                        );
                    }
                    $warnings[] = "Skipped malformed xref entry for object " . ($firstObj + $i) . ": '$line'";
                    if ($this->source->isEof()) {
                        // Nothing left to read: a bogus (possibly huge) count must
                        // not produce one warning per missing entry. The outer
                        // loop reports the missing trailer.
                        break;
                    }
                    continue;
                }
                $entryOffset = (int) $em[1];
                $gen = (int) $em[2];
                $type = ($em[3] === 'f') ? XrefEntry::TYPE_FREE : XrefEntry::TYPE_IN_USE;
                $entries[$firstObj + $i] = new XrefEntry($type, $entryOffset, $gen);
            }
        }

        // Now the source is positioned right after "trailer".
        // Sync the tokenizer to this position and parse the trailer dict.
        $this->tokenizer->seek($this->source->tell());
        $trailer = $this->objectParser->parseValue();
        if (!$trailer instanceof PdfDictionary) {
            throw new InvalidPdfException('Trailer is not a dictionary');
        }

        return [$entries, $trailer];
    }

    /**
     * Read one line of at most $maxBytes content bytes and consume its
     * end-of-line marker.
     *
     * The line ends at LF, at CR, or at CR followed by LF. Xref entries may
     * legitimately be terminated by "SP CR" alone, so stopping only at LF
     * would run into the following entry. Extra CRs directly after the first
     * one are swallowed as part of the same marker. The marker itself is never
     * part of the returned string.
     */
    private function readLine(int $maxBytes): string
    {
        $line = '';
        for ($i = 0; $i < $maxBytes; $i++) {
            $byte = $this->source->readByte();
            if ($byte === null || $byte === "\n") {
                break;
            }
            if ($byte === "\r") {
                while (!$this->source->isEof() && $this->source->peek() === "\r") {
                    $this->source->readByte();
                }
                if (!$this->source->isEof() && $this->source->peek() === "\n") {
                    $this->source->readByte();
                }
                break;
            }
            $line .= $byte;
        }

        return $line;
    }

    /**
     * Advance the source past any run of whitespace bytes, stopping at the
     * first non-whitespace byte or at end of input.
     */
    private function skipWhitespace(): void
    {
        while (!$this->source->isEof()) {
            // Strict in_array() also rejects an empty peek result.
            if (!in_array($this->source->peek(), self::WHITESPACE_BYTES, true)) {
                return;
            }
            $this->source->readByte();
        }
    }

    /**
     * Read a contiguous run of non-whitespace, non-delimiter bytes.
     *
     * The terminating byte is left unread. Returns an empty string when the
     * source is already at a whitespace byte, a delimiter, or end of input.
     */
    private function readWord(): string
    {
        $word = '';
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            if (
                $byte === ''
                || in_array($byte, self::WHITESPACE_BYTES, true)
                || in_array($byte, self::WORD_DELIMITERS, true)
            ) {
                break;
            }
            $word .= $this->source->readByte();
        }

        return $word;
    }
}