Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
83.91% covered (warning)
83.91%
73 / 87
73.33% covered (warning)
73.33%
11 / 15
CRAP
0.00% covered (danger)
0.00%
0 / 1
ObjectResolver
83.91% covered (warning)
83.91%
73 / 87
73.33% covered (warning)
73.33%
11 / 15
56.20
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 setStrict
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 mergeOlderEntries
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
3
 resolve
90.00% covered (success)
90.00%
9 / 10
0.00% covered (danger)
0.00%
0 / 1
6.04
 resolveReference
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 has
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 getEntry
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getObjectNumbers
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getEntries
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 resolveInUse
73.91% covered (warning)
73.91%
17 / 23
0.00% covered (danger)
0.00%
0 / 1
11.78
 recoverByRescan
78.95% covered (warning)
78.95%
15 / 19
0.00% covered (danger)
0.00%
0 / 1
10.93
 rescanFile
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
4
 scanObjectMap
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 readRaw
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 resolveCompressed
75.00% covered (warning)
75.00%
9 / 12
0.00% covered (danger)
0.00%
0 / 1
4.25
1<?php
2
3declare(strict_types=1);
4
5namespace Phpdftk\Pdf\Reader;
6
7use Phpdftk\Pdf\Core\PdfNull;
8use Phpdftk\Pdf\Core\PdfReference;
9use Phpdftk\Pdf\Core\PdfStream;
10use Phpdftk\Pdf\Core\Serializable;
11use Phpdftk\Pdf\Reader\Exception\InvalidPdfException;
12use Phpdftk\Pdf\Reader\Parser\ObjectParser;
13use Phpdftk\Pdf\Reader\Parser\ObjectScanner;
14use Phpdftk\Pdf\Reader\Parser\ObjectStreamParser;
15use Phpdftk\Pdf\Reader\Parser\StreamParser;
16use Phpdftk\Pdf\Reader\Tokenizer\Source;
17use Phpdftk\Pdf\Reader\Tokenizer\Tokenizer;
18
/**
 * Lazy-loading object cache for indirect PDF objects.
 *
 * An object is resolved by looking up its cross-reference entry and then
 * either seeking to the recorded byte offset and parsing it (in-use
 * entries) or unpacking the object stream that contains it (compressed
 * entries). Parsed objects are optionally decrypted, stream data is
 * decoded on a best-effort basis, and the result is cached by object
 * number.
 */
final class ObjectResolver
{
    /** @var array<int, Serializable> Resolved objects keyed by object number. */
    private array $cache = [];

    /** When true, xref inconsistencies raise instead of triggering recovery. */
    private bool $strict = true;

    /** Set once the full-file rescan has run so it is never repeated. */
    private bool $rescanned = false;

    /**
     * @param array<int, XrefEntry> $entries Xref entries keyed by object number
     */
    public function __construct(
        private array $entries,
        private readonly Tokenizer $tokenizer,
        private readonly Source $source,
        private readonly ObjectParser $objectParser,
        private readonly StreamParser $streamParser,
        private readonly ?PdfDecryptor $decryptor = null,
    ) {}

    /**
     * Select strict or lenient resolution.
     *
     * In strict mode (the default) a parse failure or an object-number
     * mismatch at the xref offset is raised to the caller. In lenient
     * mode (`false`) the resolver rescans the file and tries to recover.
     */
    public function setStrict(bool $strict): void
    {
        $this->strict = $strict;
    }

    /**
     * Merge additional xref entries without overwriting existing ones:
     * entries already known (from later xref sections) take precedence
     * over the older ones passed in. Used for `/Prev` chain following.
     *
     * @param array<int, XrefEntry> $olderEntries
     */
    public function mergeOlderEntries(array $olderEntries): void
    {
        // Array union keeps the left-hand value for keys present in both.
        $this->entries += $olderEntries;
    }

    /**
     * Resolve an object by number, returning the cached value if present.
     *
     * Unknown and free objects resolve to PdfNull. Note that `$genNum` is
     * accepted for interface symmetry with PdfReference but is not used
     * for lookup: both the xref table and the cache are keyed by object
     * number only.
     *
     * @throws \Throwable when parsing fails in strict mode, or in lenient
     *                    mode when recovery also fails
     */
    public function resolve(int $objNum, int $genNum = 0): Serializable
    {
        if (isset($this->cache[$objNum])) {
            return $this->cache[$objNum];
        }

        $entry = $this->entries[$objNum] ?? null;
        if ($entry === null) {
            return new PdfNull();
        }

        return match ($entry->type) {
            XrefEntry::TYPE_IN_USE => $this->resolveInUse($objNum, $entry),
            XrefEntry::TYPE_COMPRESSED => $this->resolveCompressed($objNum, $entry),
            // Free entries and any unrecognised entry type.
            default => new PdfNull(),
        };
    }

    /**
     * Resolve the object a reference points at.
     */
    public function resolveReference(PdfReference $ref): Serializable
    {
        return $this->resolve($ref->objectNumber, $ref->generationNumber);
    }

    /**
     * Whether a non-free xref entry exists for the object number.
     */
    public function has(int $objNum): bool
    {
        $entry = $this->entries[$objNum] ?? null;

        return $entry !== null && $entry->type !== XrefEntry::TYPE_FREE;
    }

    /**
     * Return the xref entry for an object number, or null if unknown.
     */
    public function getEntry(int $objNum): ?XrefEntry
    {
        return $this->entries[$objNum] ?? null;
    }

    /** @return list<int> Every object number that has an xref entry. */
    public function getObjectNumbers(): array
    {
        return array_keys($this->entries);
    }

    /** @return array<int, XrefEntry> */
    public function getEntries(): array
    {
        return $this->entries;
    }

    /**
     * Resolve an in-use object by parsing it at its xref offset.
     *
     * @throws InvalidPdfException in strict mode when a different object
     *                             number is found at the offset
     * @throws \Throwable          when parsing fails and (in lenient mode)
     *                             the rescan cannot recover the object
     */
    private function resolveInUse(int $objNum, XrefEntry $entry): Serializable
    {
        $this->tokenizer->seek($entry->offset);

        try {
            [$parsedObjNum, $parsedGenNum, $value] = $this->objectParser->parseIndirectObject();
        } catch (\Throwable $e) {
            if ($this->strict) {
                throw $e;
            }

            // Lenient mode: look for the object by rescanning the file and
            // surface the original parse error only if that fails too.
            return $this->recoverByRescan($objNum) ?? throw $e;
        }

        if ($parsedObjNum !== $objNum) {
            if ($this->strict) {
                throw new InvalidPdfException(
                    "Xref says object $objNum is at offset {$entry->offset}"
                    . " but found object $parsedObjNum there",
                );
            }

            // Lenient mode: try to locate the real offset for $objNum. If
            // that fails, the mismatched object is discarded and PdfNull
            // is returned (uncached) so the caller can keep going.
            return $this->recoverByRescan($objNum) ?? new PdfNull();
        }

        return $this->finalizeParsedObject($objNum, $parsedGenNum, $value);
    }

    /**
     * Lenient-mode fallback: rescan the entire source for indirect-object
     * headers (once per resolver), rebuild the xref entries, and parse the
     * requested object at its rescanned offset.
     *
     * @return Serializable|null the object's value, or null when the
     *                           object is not found or cannot be parsed
     */
    private function recoverByRescan(int $objNum): ?Serializable
    {
        if (!$this->rescanned) {
            $this->rescanFile();
            $this->rescanned = true;
        }

        $entry = $this->entries[$objNum] ?? null;
        if ($entry === null || $entry->type !== XrefEntry::TYPE_IN_USE) {
            return null;
        }

        $this->tokenizer->seek($entry->offset);

        try {
            [$parsedObjNum, $parsedGenNum, $value] = $this->objectParser->parseIndirectObject();
        } catch (\Throwable) {
            return null;
        }

        if ($parsedObjNum !== $objNum) {
            return null;
        }

        return $this->finalizeParsedObject($objNum, $parsedGenNum, $value);
    }

    /**
     * Post-process a freshly parsed object: decrypt it when a decryptor
     * is configured, decode non-empty stream data, and cache the result.
     */
    private function finalizeParsedObject(int $objNum, int $genNum, Serializable $value): Serializable
    {
        if ($this->decryptor !== null) {
            $value = $this->decryptor->decryptObject($value, $objNum, $genNum);
        }

        if ($value instanceof PdfStream && $value->data !== '') {
            try {
                $value->data = $this->streamParser->decode($value->data, $value->dictionary);
            } catch (\Throwable) {
                // Deliberate best-effort: if decoding fails (e.g., an
                // image-only stream), keep the raw data.
            }
        }

        $this->cache[$objNum] = $value;

        return $value;
    }

    /**
     * Rescan the entire source for indirect-object headers and overwrite
     * the xref entries with the discovered offsets. Existing compressed
     * entries are left untouched.
     */
    private function rescanFile(): void
    {
        foreach ($this->scanObjectMap() as $num => $offset) {
            $existing = $this->entries[$num] ?? null;
            if ($existing?->type !== XrefEntry::TYPE_COMPRESSED) {
                $this->entries[$num] = new XrefEntry(XrefEntry::TYPE_IN_USE, $offset, 0);
            }
        }
    }

    /**
     * Scan the file once and return the discovered object map. Used by
     * the reader to find catalogs / pages roots when the trailer /Root
     * cannot be resolved.
     *
     * @return array<int, int> presumably object number => byte offset
     *                         (that is how rescanFile() consumes it)
     */
    public function scanObjectMap(): array
    {
        $this->source->seek(0);

        return ObjectScanner::scan($this->source->read($this->source->size()));
    }

    /**
     * Read a window of raw bytes from the source. Used by the reader's
     * catalog-recovery code to peek at object bodies.
     */
    public function readRaw(int $offset, int $length): string
    {
        $this->source->seek($offset);

        return $this->source->read($length);
    }

    /**
     * Resolve a compressed object from an ObjStm.
     * entry->offset = containing ObjStm object number
     * entry->generation = index within the ObjStm
     *
     * Every object unpacked from the ObjStm is cached (without replacing
     * already-cached values); PdfNull is returned when the requested
     * object is not among them.
     *
     * @throws InvalidPdfException when the containing ObjStm is itself a
     *                             compressed entry or is not a stream
     */
    private function resolveCompressed(int $objNum, XrefEntry $entry): Serializable
    {
        $objStmNum = $entry->offset;

        // The containing ObjStm must be a regular (type 1) object. Without
        // this guard an xref entry that names itself, or a cycle of
        // compressed entries, as its container would recurse through
        // resolve() without end on a malformed or hostile file.
        $containerEntry = $this->entries[$objStmNum] ?? null;
        if (
            !isset($this->cache[$objStmNum])
            && $containerEntry !== null
            && $containerEntry->type === XrefEntry::TYPE_COMPRESSED
        ) {
            throw new InvalidPdfException(
                "ObjStm $objStmNum must not itself be stored in an object stream",
            );
        }

        $objStm = $this->resolve($objStmNum);
        if (!$objStm instanceof PdfStream) {
            throw new InvalidPdfException(
                "ObjStm $objStmNum is not a stream",
            );
        }

        // Unpack all objects from the ObjStm and cache them, keeping any
        // value that was already cached for the same object number.
        $parser = new ObjectStreamParser($this->streamParser);
        foreach ($parser->unpack($objStm) as $num => $value) {
            $this->cache[$num] ??= $value;
        }

        return $this->cache[$objNum] ?? new PdfNull();
    }
}
282}