Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
83.91% |
73 / 87 |
|
73.33% |
11 / 15 |
CRAP | |
0.00% |
0 / 1 |
| ObjectResolver | |
83.91% |
73 / 87 |
|
73.33% |
11 / 15 |
56.20 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| setStrict | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| mergeOlderEntries | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
| resolve | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
6.04 | |||
| resolveReference | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| has | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
| getEntry | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getObjectNumbers | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getEntries | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| resolveInUse | |
73.91% |
17 / 23 |
|
0.00% |
0 / 1 |
11.78 | |||
| recoverByRescan | |
78.95% |
15 / 19 |
|
0.00% |
0 / 1 |
10.93 | |||
| rescanFile | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
4 | |||
| scanObjectMap | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| readRaw | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| resolveCompressed | |
75.00% |
9 / 12 |
|
0.00% |
0 / 1 |
4.25 | |||
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\Pdf\Reader; |
| 6 | |
| 7 | use Phpdftk\Pdf\Core\PdfNull; |
| 8 | use Phpdftk\Pdf\Core\PdfReference; |
| 9 | use Phpdftk\Pdf\Core\PdfStream; |
| 10 | use Phpdftk\Pdf\Core\Serializable; |
| 11 | use Phpdftk\Pdf\Reader\Exception\InvalidPdfException; |
| 12 | use Phpdftk\Pdf\Reader\Parser\ObjectParser; |
| 13 | use Phpdftk\Pdf\Reader\Parser\ObjectScanner; |
| 14 | use Phpdftk\Pdf\Reader\Parser\ObjectStreamParser; |
| 15 | use Phpdftk\Pdf\Reader\Parser\StreamParser; |
| 16 | use Phpdftk\Pdf\Reader\Tokenizer\Source; |
| 17 | use Phpdftk\Pdf\Reader\Tokenizer\Tokenizer; |
| 18 | |
/**
 * Lazy-loading object cache. Resolves indirect references by seeking
 * to the xref-recorded byte offset, parsing the object, decrypting it
 * when a decryptor is configured, decoding any stream data, and caching
 * the result under its object number.
 *
 * In lenient mode (see setStrict()) a failed lookup triggers a one-off
 * rescan of the whole source for indirect-object headers.
 */
final class ObjectResolver
{
    /** @var array<int, Serializable> Resolved objects keyed by object number. */
    private array $cache = [];

    /** When true, xref inconsistencies are raised instead of recovered from. */
    private bool $strict = true;

    /** Set after the whole-file rescan has completed so it is not repeated. */
    private bool $rescanned = false;

    /**
     * @param array<int, XrefEntry> $entries Xref entries keyed by object number.
     */
    public function __construct(
        private array $entries,
        private readonly Tokenizer $tokenizer,
        private readonly Source $source,
        private readonly ObjectParser $objectParser,
        private readonly StreamParser $streamParser,
        private readonly ?PdfDecryptor $decryptor = null,
    ) {}

    /**
     * Configure whether the resolver should attempt to recover from
     * corrupted xref entries (lenient mode = false).
     */
    public function setStrict(bool $strict): void
    {
        $this->strict = $strict;
    }

    /**
     * Merge additional xref entries (older entries do NOT overwrite
     * newer ones — newer entries from later xref sections take
     * precedence). Used for `/Prev` chain following.
     *
     * @param array<int, XrefEntry> $olderEntries
     */
    public function mergeOlderEntries(array $olderEntries): void
    {
        foreach ($olderEntries as $objNum => $entry) {
            // Fill the slot only when no (newer) entry already occupies it.
            $this->entries[$objNum] ??= $entry;
        }
    }

    /**
     * Resolve an object number to its parsed value.
     *
     * Returns a PdfNull for unknown object numbers, free entries and
     * unrecognised entry types.
     *
     * @param int $objNum Object number to look up.
     * @param int $genNum Not consulted by this method: the cache and the
     *                    xref table are both keyed by object number only.
     *
     * @throws InvalidPdfException When the xref data is inconsistent and
     *                             cannot be (or must not be) recovered from.
     */
    public function resolve(int $objNum, int $genNum = 0): Serializable
    {
        if (isset($this->cache[$objNum])) {
            return $this->cache[$objNum];
        }

        $entry = $this->entries[$objNum] ?? null;
        if ($entry === null) {
            return new PdfNull();
        }

        return match ($entry->type) {
            XrefEntry::TYPE_IN_USE => $this->resolveInUse($objNum, $entry),
            XrefEntry::TYPE_COMPRESSED => $this->resolveCompressed($objNum, $entry),
            // Free entries and anything unrecognised resolve to null.
            default => new PdfNull(),
        };
    }

    /**
     * Convenience wrapper around resolve() for a PdfReference.
     */
    public function resolveReference(PdfReference $ref): Serializable
    {
        return $this->resolve($ref->objectNumber, $ref->generationNumber);
    }

    /**
     * Whether a non-free xref entry exists for the given object number.
     */
    public function has(int $objNum): bool
    {
        $entry = $this->entries[$objNum] ?? null;

        return $entry !== null && $entry->type !== XrefEntry::TYPE_FREE;
    }

    /**
     * Return the xref entry for an object number, or null when there is none.
     */
    public function getEntry(int $objNum): ?XrefEntry
    {
        return $this->entries[$objNum] ?? null;
    }

    /** @return list<int> */
    public function getObjectNumbers(): array
    {
        return array_keys($this->entries);
    }

    /** @return array<int, XrefEntry> */
    public function getEntries(): array
    {
        return $this->entries;
    }

    /**
     * Resolve an in-use entry by parsing the indirect object found at the
     * entry's byte offset.
     *
     * Strict mode rethrows parse failures and raises when the object found
     * at the offset carries a different object number. Lenient mode falls
     * back to recoverByRescan() in both situations.
     *
     * @throws InvalidPdfException In strict mode, when another object is
     *                             found at the recorded offset.
     * @throws \Throwable          Whatever the parser raised, when the
     *                             failure cannot be recovered from.
     */
    private function resolveInUse(int $objNum, XrefEntry $entry): Serializable
    {
        $this->tokenizer->seek($entry->offset);

        try {
            [$parsedObjNum, $parsedGenNum, $value] = $this->objectParser->parseIndirectObject();
        } catch (\Throwable $e) {
            if ($this->strict) {
                throw $e;
            }

            // Lenient mode — look the object up by re-scanning the file and
            // surface the original parse error only if that fails as well.
            return $this->recoverByRescan($objNum) ?? throw $e;
        }

        if ($parsedObjNum !== $objNum) {
            if ($this->strict) {
                throw new InvalidPdfException(
                    "Xref says object $objNum is at offset {$entry->offset}, "
                    . "but found object $parsedObjNum there",
                );
            }

            // Lenient mode — try to locate the correct offset for $objNum.
            // When that fails too, the mismatched object is discarded and a
            // PdfNull is returned (uncached) so the caller can keep going
            // and treat the result as a soft failure.
            return $this->recoverByRescan($objNum) ?? new PdfNull();
        }

        return $this->finalizeAndCache($objNum, $parsedGenNum, $value);
    }

    /**
     * Lenient-mode fallback: rescan the entire source for indirect-object
     * headers and rebuild xref entries. Returns the requested object's
     * value if it can be parsed at the rescanned offset, or null if not
     * found / unparseable.
     */
    private function recoverByRescan(int $objNum): ?Serializable
    {
        if (!$this->rescanned) {
            $this->rescanFile();
            $this->rescanned = true;
        }

        $entry = $this->entries[$objNum] ?? null;
        if ($entry === null || $entry->type !== XrefEntry::TYPE_IN_USE) {
            return null;
        }

        $this->tokenizer->seek($entry->offset);

        try {
            [$parsedObjNum, $parsedGenNum, $value] = $this->objectParser->parseIndirectObject();
        } catch (\Throwable) {
            return null;
        }

        if ($parsedObjNum !== $objNum) {
            return null;
        }

        return $this->finalizeAndCache($objNum, $parsedGenNum, $value);
    }

    /**
     * Shared tail of the in-use resolution paths: decrypt the freshly parsed
     * object when a decryptor is configured, decode stream data on a
     * best-effort basis, then cache the value under its object number.
     *
     * @param int          $objNum Object number, already verified against the parsed header.
     * @param int          $genNum Generation number read from the parsed header.
     * @param Serializable $value  Parsed object body.
     */
    private function finalizeAndCache(int $objNum, int $genNum, Serializable $value): Serializable
    {
        if ($this->decryptor !== null) {
            $value = $this->decryptor->decryptObject($value, $objNum, $genNum);
        }

        if ($value instanceof PdfStream && $value->data !== '') {
            try {
                $value->data = $this->streamParser->decode($value->data, $value->dictionary);
            } catch (\Throwable) {
                // Deliberate best effort: if decoding fails (e.g., image-only
                // stream), keep the raw data instead of failing the lookup.
            }
        }

        $this->cache[$objNum] = $value;

        return $value;
    }

    /**
     * Rescan the entire source for indirect-object headers and
     * overwrite the in-use entries with the discovered offsets.
     * Entries recorded as compressed are left untouched; rebuilt
     * entries are created with generation 0.
     */
    private function rescanFile(): void
    {
        foreach ($this->scanObjectMap() as $num => $offset) {
            $existing = $this->entries[$num] ?? null;
            if ($existing === null || $existing->type !== XrefEntry::TYPE_COMPRESSED) {
                $this->entries[$num] = new XrefEntry(XrefEntry::TYPE_IN_USE, $offset, 0);
            }
        }
    }

    /**
     * Scan the file once and return the discovered object map. Used by
     * the reader to find catalogs / pages roots when the trailer /Root
     * cannot be resolved.
     *
     * @return array<int, int>
     */
    public function scanObjectMap(): array
    {
        $this->source->seek(0);
        $bytes = $this->source->read($this->source->size());

        return ObjectScanner::scan($bytes);
    }

    /**
     * Read a window of raw bytes from the source. Used by the reader's
     * catalog-recovery code to peek at object bodies.
     */
    public function readRaw(int $offset, int $length): string
    {
        $this->source->seek($offset);

        return $this->source->read($length);
    }

    /**
     * Resolve a compressed object from an ObjStm.
     *   entry->offset     = containing ObjStm object number
     *   entry->generation = index within the ObjStm
     *
     * Every object unpacked from the ObjStm is cached unless a value is
     * already cached for its number. Returns a PdfNull when the requested
     * object is not present after unpacking.
     *
     * @throws InvalidPdfException When the containing ObjStm is itself
     *                             recorded as compressed, or does not
     *                             resolve to a stream.
     */
    private function resolveCompressed(int $objNum, XrefEntry $entry): Serializable
    {
        $objStmNum = $entry->offset;

        // The container must be a regular (type 1) object. Without this
        // guard, an xref whose compressed entries point at each other (or
        // at themselves) would make resolve() recurse without bound.
        $containerEntry = $this->entries[$objStmNum] ?? null;
        if (
            !isset($this->cache[$objStmNum])
            && $containerEntry !== null
            && $containerEntry->type === XrefEntry::TYPE_COMPRESSED
        ) {
            throw new InvalidPdfException(
                "ObjStm $objStmNum is itself stored in an object stream",
            );
        }

        // Resolve the containing ObjStm itself
        $objStm = $this->resolve($objStmNum);
        if (!$objStm instanceof PdfStream) {
            throw new InvalidPdfException(
                "ObjStm $objStmNum is not a stream",
            );
        }

        // Unpack all objects from the ObjStm and cache them, never
        // replacing a value that is already cached.
        $parser = new ObjectStreamParser($this->streamParser);
        foreach ($parser->unpack($objStm) as $num => $value) {
            $this->cache[$num] ??= $value;
        }

        return $this->cache[$objNum] ?? new PdfNull();
    }
}