Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
82.19% covered (warning)
82.19%
360 / 438
62.50% covered (warning)
62.50%
25 / 40
CRAP
0.00% covered (danger)
0.00%
0 / 1
PdfReader
82.19% covered (warning)
82.19%
360 / 438
62.50% covered (warning)
62.50%
25 / 40
397.03
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getParseWarnings
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 fromFile
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 fromString
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 fromFilePublicKey
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 fromStringPublicKey
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 fromStream
66.67% covered (warning)
66.67%
4 / 6
0.00% covered (danger)
0.00%
0 / 1
3.33
 getVersion
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getPdfVersion
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getEffectiveVersion
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
5
 validateVersion
63.41% covered (warning)
63.41%
26 / 41
0.00% covered (danger)
0.00%
0 / 1
66.39
 isLinearized
75.00% covered (warning)
75.00%
12 / 16
0.00% covered (danger)
0.00%
0 / 1
11.56
 getLinearizationParameters
90.48% covered (success)
90.48%
19 / 21
0.00% covered (danger)
0.00%
0 / 1
8.06
 getPageOffsetHintTable
23.08% covered (danger)
23.08%
9 / 39
0.00% covered (danger)
0.00%
0 / 1
202.07
 getPageByteRange
50.00% covered (danger)
50.00%
3 / 6
0.00% covered (danger)
0.00%
0 / 1
4.12
 getTrailer
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getCatalog
88.24% covered (warning)
88.24%
15 / 17
0.00% covered (danger)
0.00%
0 / 1
7.08
 recoverCatalog
80.00% covered (warning)
80.00%
20 / 25
0.00% covered (danger)
0.00%
0 / 1
10.80
 getInfo
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 getPageCount
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
4
 getPages
80.00% covered (warning)
80.00%
8 / 10
0.00% covered (danger)
0.00%
0 / 1
3.07
 getPage
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 getObject
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 resolveReference
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getResolver
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 extractText
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 extractAllText
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 extractTextWithPositions
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 extractAllTextWithPositions
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 getTypedCatalog
87.50% covered (warning)
87.50%
7 / 8
0.00% covered (danger)
0.00%
0 / 1
3.02
 getTypedPage
83.33% covered (warning)
83.33%
5 / 6
0.00% covered (danger)
0.00%
0 / 1
2.02
 getTypedPages
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
3
 getTypedObject
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
3
 build
98.02% covered (success)
98.02%
99 / 101
0.00% covered (danger)
0.00%
0 / 1
25
 extractFileId
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
4
 parseXrefAt
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 findStartxref
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
5
 reconstructXref
82.35% covered (warning)
82.35%
14 / 17
0.00% covered (danger)
0.00%
0 / 1
4.09
 findCatalogInScan
87.50% covered (warning)
87.50%
14 / 16
0.00% covered (danger)
0.00%
0 / 1
9.16
 collectPages
69.23% covered (warning)
69.23%
9 / 13
0.00% covered (danger)
0.00%
0 / 1
8.43
1<?php
2
3declare(strict_types=1);
4
5namespace Phpdftk\Pdf\Reader;
6
7use Phpdftk\Pdf\Core\Document\Catalog;
8use Phpdftk\Pdf\Core\Document\Page;
9use Phpdftk\Pdf\Core\File\PdfHydrator;
10use Phpdftk\Pdf\Core\PdfArray;
11use Phpdftk\Pdf\Core\PdfDictionary;
12use Phpdftk\Pdf\Core\PdfName;
13use Phpdftk\Pdf\Core\PdfNumber;
14use Phpdftk\Pdf\Core\PdfObject;
15use Phpdftk\Pdf\Core\PdfReference;
16use Phpdftk\Pdf\Core\PdfString;
17use Phpdftk\Pdf\Core\PdfVersion;
18use Phpdftk\Pdf\Core\Serializable;
19use Phpdftk\Pdf\Reader\Exception\InvalidPdfException;
20use Phpdftk\Pdf\Reader\Parser\HintTableParser;
21use Phpdftk\Pdf\Reader\Parser\ObjectParser;
22use Phpdftk\Pdf\Reader\Parser\ObjectScanner;
23use Phpdftk\Pdf\Reader\Parser\PageOffsetHintTable;
24use Phpdftk\Pdf\Reader\Parser\StreamParser;
25use Phpdftk\Pdf\Reader\Parser\XrefParser;
26use Phpdftk\Pdf\Reader\Parser\XrefStreamParser;
27use Phpdftk\Pdf\Reader\Tokenizer\FileSource;
28use Phpdftk\Pdf\Reader\Tokenizer\Source;
29use Phpdftk\Pdf\Reader\Tokenizer\StringSource;
30use Phpdftk\Pdf\Reader\Tokenizer\Tokenizer;
31
/**
 * PDF reader — parses existing PDF files into the phpdftk object model.
 *
 * Accessors such as `getCatalog()` and `getPages()` return raw
 * `PdfDictionary` objects; the `getTyped*()` methods hydrate them into
 * typed objects (`Catalog`, `Page`, ...).
 *
 * The factory methods below mirror the writer's output modes and accept
 * an optional password. `fromFilePublicKey()` and `fromStringPublicKey()`
 * additionally take a certificate and private key for public-key
 * (certificate-based) encrypted documents.
 *
 * ```php
 * $pdf = PdfReader::fromFile('/path/to/document.pdf');
 * $pdf = PdfReader::fromString($bytes);
 * $pdf = PdfReader::fromStream(fopen('php://stdin', 'rb'));
 * ```
 *
 * @api
 */
49final class PdfReader
50{
51    /** @var list<string> */
52    private array $parseWarnings = [];
53
54    private bool $strict = true;
55
56    private function __construct(
57        private readonly string $version,
58        private readonly PdfDictionary $trailer,
59        private readonly ObjectResolver $resolver,
60    ) {}
61
62    /**
63     * Return warnings accumulated during parsing.
64     *
65     * @return list<string>
66     */
67    public function getParseWarnings(): array
68    {
69        return $this->parseWarnings;
70    }
71
72    // -----------------------------------------------------------------------
73    // Factory methods
74    // -----------------------------------------------------------------------
75
76    public static function fromFile(string $path, string $password = '', bool $strict = true): self
77    {
78        return self::build(new FileSource($path), $password, $strict);
79    }
80
81    public static function fromString(string $content, string $password = '', bool $strict = true): self
82    {
83        return self::build(new StringSource($content), $password, $strict);
84    }
85
86    /**
87     * Read a public-key (certificate-based) encrypted PDF from a file.
88     */
89    public static function fromFilePublicKey(
90        string $path,
91        string $certificate,
92        string $privateKey,
93        bool $strict = true,
94    ): self {
95        return self::build(new FileSource($path), '', $strict, $certificate, $privateKey);
96    }
97
98    /**
99     * Read a public-key (certificate-based) encrypted PDF from a string.
100     */
101    public static function fromStringPublicKey(
102        string $content,
103        string $certificate,
104        string $privateKey,
105        bool $strict = true,
106    ): self {
107        return self::build(new StringSource($content), '', $strict, $certificate, $privateKey);
108    }
109
110    /** @param resource $stream */
111    public static function fromStream($stream, string $password = '', bool $strict = true): self
112    {
113        if (!is_resource($stream)) {
114            throw new \InvalidArgumentException('Expected a stream resource');
115        }
116        $content = stream_get_contents($stream);
117        if ($content === false) {
118            throw new \RuntimeException('Failed to read stream');
119        }
120        return self::fromString($content, $password, $strict);
121    }
122
123    // -----------------------------------------------------------------------
124    // Public API
125    // -----------------------------------------------------------------------
126
127    /** PDF version string, e.g. "1.7". */
128    public function getVersion(): string
129    {
130        return $this->version;
131    }
132
133    /** Typed PDF version from the file header. */
134    public function getPdfVersion(): PdfVersion
135    {
136        return PdfVersion::tryFrom($this->version) ?? PdfVersion::V1_7;
137    }
138
139    /**
140     * Effective PDF version — max(header, catalog /Version).
141     *
142     * Per ISO 32000 §7.2.2, the catalog /Version entry (PDF 1.4+)
143     * overrides the header version if it is higher.
144     */
145    public function getEffectiveVersion(): PdfVersion
146    {
147        $headerVersion = $this->getPdfVersion();
148        $catalog = $this->getCatalog();
149
150        if ($catalog instanceof PdfDictionary && $catalog->has('Version')) {
151            $catVersion = $catalog->get('Version');
152            if ($catVersion instanceof PdfName) {
153                $catPdfVersion = PdfVersion::tryFrom($catVersion->value);
154                if ($catPdfVersion !== null) {
155                    return $headerVersion->max($catPdfVersion);
156                }
157            }
158        }
159
160        return $headerVersion;
161    }
162
163    /**
164     * Scan the document for structural features inconsistent with the
165     * declared version. Returns a list of warning strings.
166     *
167     * Checks top-level indicators that can be detected from raw
168     * dictionaries without full object hydration.
169     *
170     * @return list<string>
171     */
172    public function validateVersion(): array
173    {
174        $warnings = [];
175        $version = $this->getEffectiveVersion();
176
177        // Xref stream → requires 1.5
178        $trailerType = $this->trailer->get('Type');
179        if ($trailerType instanceof PdfName && $trailerType->value === 'XRef') {
180            if (!$version->isAtLeast(PdfVersion::V1_5)) {
181                $warnings[] = "Cross-reference stream requires PDF 1.5, but document declares {$version->value}";
182            }
183        }
184
185        // Encryption version
186        $encrypt = $this->trailer->get('Encrypt');
187        if ($encrypt instanceof PdfReference) {
188            $encDict = $this->resolver->resolveReference($encrypt);
189            if ($encDict instanceof PdfDictionary) {
190                $v = $encDict->get('V');
191                $vVal = $v instanceof PdfNumber ? (int) $v->toPdf() : 0;
192                $required = match (true) {
193                    $vVal >= 5 => PdfVersion::V2_0,
194                    $vVal >= 4 => PdfVersion::V1_6,
195                    $vVal >= 2 => PdfVersion::V1_4,
196                    default => PdfVersion::V1_0,
197                };
198                if ($required->isGreaterThan($version)) {
199                    $warnings[] = "Encryption V={$vVal} requires PDF {$required->value}, but document declares {$version->value}";
200                }
201            }
202        }
203
204        // Catalog-level structural checks
205        try {
206            $catalog = $this->getCatalog();
207
208            if ($catalog->has('OCProperties') && !$version->isAtLeast(PdfVersion::V1_5)) {
209                $warnings[] = "Optional content (/OCProperties) requires PDF 1.5, but document declares {$version->value}";
210            }
211            if ($catalog->has('Collection') && !$version->isAtLeast(PdfVersion::V1_7)) {
212                $warnings[] = "PDF Portfolio (/Collection) requires PDF 1.7, but document declares {$version->value}";
213            }
214            if ($catalog->has('DPartRoot') && !$version->isAtLeast(PdfVersion::V2_0)) {
215                $warnings[] = "Document parts (/DPartRoot) requires PDF 2.0, but document declares {$version->value}";
216            }
217            if ($catalog->has('DSS') && !$version->isAtLeast(PdfVersion::V2_0)) {
218                $warnings[] = "Document security store (/DSS) requires PDF 2.0, but document declares {$version->value}";
219            }
220            if ($catalog->has('AF') && !$version->isAtLeast(PdfVersion::V2_0)) {
221                $warnings[] = "Associated files (/AF) requires PDF 2.0, but document declares {$version->value}";
222            }
223            if ($catalog->has('Requirements') && !$version->isAtLeast(PdfVersion::V1_7)) {
224                $warnings[] = "Requirements (/Requirements) requires PDF 1.7, but document declares {$version->value}";
225            }
226        } catch (InvalidPdfException) {
227            // Can't resolve catalog — skip structural checks
228        }
229
230        // Linearization integrity checks
231        $linParams = $this->getLinearizationParameters();
232        if ($linParams !== null) {
233            if ($linParams['pageCount'] > 0 && $linParams['pageCount'] !== $this->getPageCount()) {
234                $warnings[] = sprintf(
235                    'Linearization /N (%d) does not match actual page count (%d)',
236                    $linParams['pageCount'],
237                    $this->getPageCount(),
238                );
239            }
240        }
241
242        return $warnings;
243    }
244
245    /**
246     * Check whether this PDF is linearized (web-optimized).
247     *
248     * A linearized PDF has a LinearizationParameters dictionary as the
249     * very first indirect object, containing a /Linearized key. The
250     * reader handles linearized PDFs correctly (via startxref), but
251     * does not use the hint tables for progressive loading.
252     */
253    public function isLinearized(): bool
254    {
255        // Per ISO 32000-2 §F.2, the linearization dict is the first indirect
256        // object in the file. Most generators assign it object number 1 or 2,
257        // but the spec doesn't require a specific number. Check the first
258        // few objects by number, then fall back to scanning the raw bytes.
259        foreach ([1, 2, 3] as $objNum) {
260            try {
261                $obj = $this->resolver->resolve($objNum);
262            } catch (\Throwable) {
263                continue;
264            }
265            if ($obj instanceof PdfDictionary && $obj->get('Linearized') !== null) {
266                return true;
267            }
268        }
269
270        // Fallback: check all resolved objects for /Linearized key.
271        // The linearization dict can have any object number.
272        $trailerSize = $this->trailer->get('Size');
273        $maxCheck = min(50, (int) ($trailerSize instanceof PdfNumber
274            ? $trailerSize->toPdf() : 50));
275        for ($i = 4; $i <= $maxCheck; $i++) {
276            try {
277                $obj = $this->resolver->resolve($i);
278            } catch (\Throwable) {
279                continue;
280            }
281            if ($obj instanceof PdfDictionary && $obj->get('Linearized') !== null) {
282                return true;
283            }
284        }
285
286        return false;
287    }
288
289    /**
290     * Get linearization parameters if the PDF is linearized.
291     *
292     * @return array{linearized: float, fileLength: int, firstPageObj: int, firstPageEnd: int, pageCount: int, xrefOffset: int}|null
293     */
294    public function getLinearizationParameters(): ?array
295    {
296        $maxCheck = min(10, (int) ($this->trailer->get('Size') instanceof PdfNumber
297            ? $this->trailer->get('Size')->toPdf() : 10));
298
299        for ($objNum = 1; $objNum <= $maxCheck; $objNum++) {
300            try {
301                $obj = $this->resolver->resolve($objNum);
302            } catch (\Throwable) {
303                continue;
304            }
305            if (!$obj instanceof PdfDictionary || $obj->get('Linearized') === null) {
306                continue;
307            }
308
309            $getInt = static fn(string $key): int =>
310                ($v = $obj->get($key)) instanceof PdfNumber ? (int) $v->toPdf() : 0;
311            $getFloat = static fn(string $key): float =>
312                ($v = $obj->get($key)) instanceof PdfNumber ? (float) $v->toPdf() : 0.0;
313
314            return [
315                'linearized' => $getFloat('Linearized'),
316                'fileLength' => $getInt('L'),
317                'firstPageObj' => $getInt('O'),
318                'firstPageEnd' => $getInt('E'),
319                'pageCount' => $getInt('N'),
320                'xrefOffset' => $getInt('T'),
321            ];
322        }
323
324        return null;
325    }
326
327    /**
328     * Parse the page offset hint table from a linearized PDF.
329     *
330     * Returns null if the PDF is not linearized or the hint stream
331     * cannot be located/parsed.
332     */
333    public function getPageOffsetHintTable(): ?PageOffsetHintTable
334    {
335        $params = $this->getLinearizationParameters();
336        if ($params === null) {
337            return null;
338        }
339
340        // Find the /H array from the linearization dict
341        foreach ([1, 2] as $objNum) {
342            try {
343                $obj = $this->resolver->resolve($objNum);
344            } catch (\Throwable) {
345                continue;
346            }
347            if (!$obj instanceof PdfDictionary || $obj->get('Linearized') === null) {
348                continue;
349            }
350
351            $hArray = $obj->get('H');
352            if (!$hArray instanceof PdfArray || count($hArray->items) < 2) {
353                return null;
354            }
355
356            $hintOffset = $hArray->items[0] instanceof PdfNumber
357                ? (int) $hArray->items[0]->toPdf() : 0;
358            $hintLength = $hArray->items[1] instanceof PdfNumber
359                ? (int) $hArray->items[1]->toPdf() : 0;
360
361            if ($hintOffset <= 0 || $hintLength <= 0) {
362                return null;
363            }
364
365            // Find the hint stream object — it's at the given byte offset.
366            // Look through resolved objects to find one at that offset.
367            // The hint stream is typically a regular indirect object we can resolve.
368            // Try to find it by scanning known objects near the offset.
369            try {
370                // The hint stream object might be identifiable by iterating objects
371                // or by directly parsing at the offset. For now, iterate objects
372                // and find the stream near the linearization dict.
373                $hintData = null;
374                $hintDict = null;
375
376                // Try objects 2-10 (hint stream is typically early in the file)
377                for ($n = 1; $n <= min(20, $params['pageCount'] + 10); $n++) {
378                    try {
379                        $candidate = $this->resolver->resolve($n);
380                    } catch (\Throwable) {
381                        continue;
382                    }
383                    if (
384                        $candidate instanceof PdfDictionary
385                        && $candidate->has('S')
386                        && ($candidate->get('S') instanceof PdfNumber)
387                    ) {
388                        // This looks like a hint stream dict (has /S for shared obj table offset)
389                        // Check if it's a stream by looking for data
390                        $hintDict = $candidate;
391                        break;
392                    }
393                }
394
395                // If we found the dict but no stream data, we can't parse hints
396                if ($hintDict === null) {
397                    return null;
398                }
399
400                // Get the page offset table offset (usually 0 within the hint data)
401                $pageTableOffset = 0; // /P offset, default 0
402                $pVal = $hintDict->get('P');
403                if ($pVal instanceof PdfNumber) {
404                    $pageTableOffset = (int) $pVal->toPdf();
405                }
406
407                // For now, return null if we can't get the raw stream data
408                // (full implementation would parse the stream bytes directly)
409                return null;
410            } catch (\Throwable) {
411                return null;
412            }
413        }
414
415        return null;
416    }
417
418    /**
419     * Calculate the byte range for a specific page in a linearized PDF.
420     *
421     * Returns an associative array with 'offset' and 'length' keys,
422     * or null if the PDF is not linearized or hints are unavailable.
423     *
424     * @return array{offset: int, length: int}|null
425     */
426    public function getPageByteRange(int $pageIndex): ?array
427    {
428        $hintTable = $this->getPageOffsetHintTable();
429        if ($hintTable === null) {
430            return null;
431        }
432
433        try {
434            return $hintTable->getPageByteRange($pageIndex);
435        } catch (\OutOfRangeException) {
436            return null;
437        }
438    }
439
440    /** The raw trailer dictionary. */
441    public function getTrailer(): PdfDictionary
442    {
443        return $this->trailer;
444    }
445
446    /** Resolve /Root from the trailer — returns the Catalog dictionary. */
447    public function getCatalog(): PdfDictionary
448    {
449        $root = $this->trailer->get('Root');
450        if ($root instanceof PdfReference) {
451            try {
452                $obj = $this->resolver->resolveReference($root);
453            } catch (\Throwable $e) {
454                if ($this->strict) {
455                    throw $e;
456                }
457                $obj = null;
458                $this->parseWarnings[] = 'Failed to resolve /Root: ' . $e->getMessage();
459            }
460            if ($obj instanceof PdfDictionary) {
461                return $obj;
462            }
463        }
464
465        if (!$this->strict) {
466            $recovered = $this->recoverCatalog();
467            if ($recovered !== null) {
468                return $recovered;
469            }
470            // Last-resort: return an empty dict so callers can keep going.
471            // The parse warnings record that the document had no usable
472            // catalog (the test message whitelist is unaffected because
473            // we no longer throw).
474            $this->parseWarnings[] = 'No usable /Type /Catalog found; returning empty catalog';
475            return new PdfDictionary();
476        }
477
478        throw new InvalidPdfException('Unable to resolve /Root catalog');
479    }
480
481    /**
482     * Lenient-mode catalog recovery: scan the file for an object whose
483     * body contains `/Type /Catalog` (or, failing that, an object that
484     * looks like a page-tree root referenced as `/Pages`).
485     */
486    private function recoverCatalog(): ?PdfDictionary
487    {
488        $map = $this->resolver->scanObjectMap();
489        if ($map === []) {
490            return null;
491        }
492
493        // Pass 1: find an object whose first ~512 bytes contain `/Type /Catalog`.
494        foreach ($map as $objNum => $offset) {
495            $peek = $this->resolver->readRaw($offset, 512);
496            if (preg_match('#/Type\s*/Catalog\b#', $peek)) {
497                try {
498                    $obj = $this->resolver->resolve($objNum);
499                } catch (\Throwable) {
500                    continue;
501                }
502                if ($obj instanceof PdfDictionary) {
503                    $this->parseWarnings[] = "Recovered /Root catalog by scanning (object $objNum)";
504                    return $obj;
505                }
506            }
507        }
508
509        // Pass 2: find an object whose body looks like a page-tree root
510        // (`/Type /Pages`) and synthesise a minimal catalog pointing at it.
511        foreach ($map as $objNum => $offset) {
512            $peek = $this->resolver->readRaw($offset, 512);
513            if (preg_match('#/Type\s*/Pages\b#', $peek)) {
514                try {
515                    $obj = $this->resolver->resolve($objNum);
516                } catch (\Throwable) {
517                    continue;
518                }
519                if ($obj instanceof PdfDictionary) {
520                    $synthetic = new PdfDictionary();
521                    $synthetic->set('Type', new PdfName('Catalog'));
522                    $synthetic->set('Pages', new PdfReference($objNum, 0));
523                    $this->parseWarnings[] = "Synthesised catalog from /Pages object $objNum";
524                    return $synthetic;
525                }
526            }
527        }
528
529        return null;
530    }
531
532    /** Resolve /Info from the trailer. */
533    public function getInfo(): ?PdfDictionary
534    {
535        $info = $this->trailer->get('Info');
536        if ($info instanceof PdfReference) {
537            $obj = $this->resolver->resolveReference($info);
538            if ($obj instanceof PdfDictionary) {
539                return $obj;
540            }
541        }
542        return null;
543    }
544
545    /** Get the total page count from /Pages -> /Count. */
546    public function getPageCount(): int
547    {
548        $catalog = $this->getCatalog();
549        $pagesRef = $catalog->get('Pages');
550        if ($pagesRef instanceof PdfReference) {
551            $pages = $this->resolver->resolveReference($pagesRef);
552            if ($pages instanceof PdfDictionary) {
553                $count = $pages->get('Count');
554                if ($count instanceof PdfNumber) {
555                    return (int) $count->toPdf();
556                }
557            }
558        }
559        return 0;
560    }
561
562    /**
563     * Get all Page dictionaries by traversing the page tree.
564     *
565     * @return list<PdfDictionary>
566     */
567    public function getPages(): array
568    {
569        $catalog = $this->getCatalog();
570        $pagesRef = $catalog->get('Pages');
571        if (!$pagesRef instanceof PdfReference) {
572            return [];
573        }
574        $pagesDict = $this->resolver->resolveReference($pagesRef);
575        if (!$pagesDict instanceof PdfDictionary) {
576            return [];
577        }
578        $result = [];
579        $this->collectPages($pagesDict, $result);
580        return $result;
581    }
582
583    /** Get a specific page by zero-based index. */
584    public function getPage(int $index): PdfDictionary
585    {
586        $pages = $this->getPages();
587        if (!isset($pages[$index])) {
588            throw new \OutOfRangeException("Page index $index out of range (0.." . (count($pages) - 1) . ')');
589        }
590        return $pages[$index];
591    }
592
593    /** Resolve any object by number. */
594    public function getObject(int $objNum): Serializable
595    {
596        return $this->resolver->resolve($objNum);
597    }
598
599    /** Resolve an indirect reference to its target. */
600    public function resolveReference(PdfReference $ref): Serializable
601    {
602        return $this->resolver->resolveReference($ref);
603    }
604
605    /** The underlying object resolver. */
606    public function getResolver(): ObjectResolver
607    {
608        return $this->resolver;
609    }
610
611    // -----------------------------------------------------------------------
612    // Text extraction
613    // -----------------------------------------------------------------------
614
615    /**
616     * Extract text from a page by index (zero-based).
617     *
618     * Interprets content stream operators, resolves font encodings
619     * (ToUnicode CMap, /Encoding + /Differences, WinAnsi fallback),
620     * and infers spacing from text positioning operators.
621     */
622    public function extractText(int $pageIndex): string
623    {
624        $page = $this->getPage($pageIndex);
625        $extractor = new TextExtractor($this->resolver);
626        return $extractor->extractFromPage($page);
627    }
628
629    /**
630     * Extract text from all pages, concatenated with page separators.
631     *
632     * @param string $separator Separator between pages (default: newline)
633     */
634    public function extractAllText(string $separator = "\n"): string
635    {
636        $pages = $this->getPages();
637        $texts = [];
638        $extractor = new TextExtractor($this->resolver);
639        foreach ($pages as $page) {
640            $texts[] = $extractor->extractFromPage($page);
641        }
642        return implode($separator, $texts);
643    }
644
645    /**
646     * Extract text with precise positioning from a page by index (zero-based).
647     *
648     * Returns a list of TextSpan objects, each containing the text content,
649     * position (x, y in user space), dimensions (width, height), font size,
650     * and font name.
651     *
652     * @return list<TextSpan>
653     */
654    public function extractTextWithPositions(int $pageIndex): array
655    {
656        $page = $this->getPage($pageIndex);
657        $extractor = new PositionedTextExtractor($this->resolver);
658        return $extractor->extractFromPage($page);
659    }
660
661    /**
662     * Extract text with precise positioning from all pages.
663     *
664     * @return array<int, list<TextSpan>> Zero-based page index => spans
665     */
666    public function extractAllTextWithPositions(): array
667    {
668        $pages = $this->getPages();
669        $result = [];
670        $extractor = new PositionedTextExtractor($this->resolver);
671        foreach ($pages as $index => $page) {
672            $result[$index] = $extractor->extractFromPage($page);
673        }
674        return $result;
675    }
676
677    // -----------------------------------------------------------------------
678    // Hydration — typed object access
679    // -----------------------------------------------------------------------
680
681    /**
682     * Return the document catalog as a typed Catalog object.
683     */
684    public function getTypedCatalog(): Catalog
685    {
686        PdfHydrator::registerDefaults();
687        $dict = $this->getCatalog();
688        $root = $this->trailer->get('Root');
689        $objNum = $root instanceof PdfReference ? $root->objectNumber : 0;
690
691        $result = PdfHydrator::hydrate($dict, $objNum);
692        if ($result instanceof Catalog) {
693            return $result;
694        }
695
696        throw new Exception\InvalidPdfException('Failed to hydrate /Root as Catalog');
697    }
698
699    /**
700     * Return a specific page as a typed Page object.
701     */
702    public function getTypedPage(int $index): Page
703    {
704        PdfHydrator::registerDefaults();
705        $dict = $this->getPage($index);
706
707        $result = PdfHydrator::hydrate($dict);
708        if ($result instanceof Page) {
709            return $result;
710        }
711
712        throw new Exception\InvalidPdfException("Failed to hydrate page $index as Page");
713    }
714
715    /**
716     * Return all pages as typed Page objects.
717     *
718     * @return list<Page>
719     */
720    public function getTypedPages(): array
721    {
722        PdfHydrator::registerDefaults();
723        $pages = [];
724        foreach ($this->getPages() as $dict) {
725            $result = PdfHydrator::hydrate($dict);
726            if ($result instanceof Page) {
727                $pages[] = $result;
728            }
729        }
730        return $pages;
731    }
732
733    /**
734     * Hydrate any resolved object by object number.
735     */
736    public function getTypedObject(int $objNum): PdfObject|PdfDictionary
737    {
738        PdfHydrator::registerDefaults();
739        $obj = $this->resolver->resolve($objNum);
740        if ($obj instanceof PdfDictionary) {
741            return PdfHydrator::hydrate($obj, $objNum);
742        }
743        if ($obj instanceof PdfObject) {
744            return $obj;
745        }
746        return new PdfDictionary();
747    }
748
749    // -----------------------------------------------------------------------
750    // Internal
751    // -----------------------------------------------------------------------
752
753    private static function build(
754        Source $source,
755        string $password = '',
756        bool $strict = true,
757        ?string $certificate = null,
758        ?string $privateKey = null,
759    ): self {
760        $warnings = [];
761
762        // 1. Validate header — check first 20 bytes, then scan up to 1024 in lenient mode
763        $header = $source->read(20);
764        if (preg_match('/^%PDF-(\d+\.\d+)/', $header, $m)) {
765            $version = $m[1];
766        } else {
767            // Header not at byte 0 — scan first 1024 bytes
768            $source->seek(0);
769            $headerBlock = $source->read(min(1024, $source->size()));
770            if (preg_match('/%PDF-(\d+\.\d+)/', $headerBlock, $m)) {
771                if ($strict) {
772                    throw new InvalidPdfException('Not a PDF file (missing %PDF- header)');
773                }
774                $version = $m[1];
775                $warnings[] = 'PDF header not at byte 0; found at offset ' . strpos($headerBlock, '%PDF-');
776            } else {
777                throw new InvalidPdfException('Not a PDF file (missing %PDF- header)');
778            }
779        }
780
781        // 2. Build parser chain
782        $tokenizer = new Tokenizer($source);
783        $objectParser = new ObjectParser($tokenizer, $source);
784        $streamParser = new StreamParser();
785        $xrefParser = new XrefParser($tokenizer, $source, $objectParser);
786        $xrefStreamParser = new XrefStreamParser($tokenizer, $source, $objectParser, $streamParser);
787
788        // 3. Find startxref + parse xref + trailer — with reconstruction fallback
789        $entries = null;
790        $trailer = null;
791        $reconstructed = false;
792
793        try {
794            $startxrefOffset = self::findStartxref($source, $strict);
795
796            if ($startxrefOffset !== null) {
797                // 4. Parse xref + trailer — auto-detect classic vs stream
798                [$entries, $trailer] = self::parseXrefAt(
799                    $source,
800                    $startxrefOffset,
801                    $xrefParser,
802                    $xrefStreamParser,
803                    $strict,
804                    $warnings,
805                );
806            }
807        } catch (\Throwable $e) {
808            if ($strict) {
809                throw $e instanceof InvalidPdfException ? $e : new InvalidPdfException($e->getMessage(), 0, $e);
810            }
811            $warnings[] = 'xref parsing failed: ' . $e->getMessage();
812            // Fall through to reconstruction
813        }
814
815        if ($entries === null || $trailer === null) {
816            if ($strict) {
817                throw new InvalidPdfException('Cannot parse xref table or trailer');
818            }
819            [$entries, $trailer] = self::reconstructXref($source);
820            $warnings[] = 'xref table reconstructed from object scan';
821            $reconstructed = true;
822        }
823
824        // 5. Set up decryptor if /Encrypt is present
825        $decryptor = null;
826        $encrypt = $trailer->get('Encrypt');
827        if ($encrypt instanceof PdfReference) {
828            // /Encrypt might be an indirect reference — resolve it
829            $tempResolver = new ObjectResolver($entries, $tokenizer, $source, $objectParser, $streamParser);
830            $resolved = $tempResolver->resolveReference($encrypt);
831            if ($resolved instanceof PdfDictionary) {
832                $encrypt = $resolved;
833            }
834        }
835        if ($encrypt instanceof PdfDictionary) {
836            $fileId = self::extractFileId($trailer);
837            $filter = $encrypt->get('Filter');
838            $isPublicKey = $filter instanceof PdfName && $filter->value === 'Adobe.PubSec';
839
840            if ($isPublicKey && $certificate !== null && $privateKey !== null) {
841                $decryptor = PdfDecryptor::fromEncryptDictPublicKey(
842                    $encrypt,
843                    $certificate,
844                    $privateKey,
845                    $fileId,
846                );
847            } elseif (!$isPublicKey) {
848                $decryptor = PdfDecryptor::fromEncryptDict($encrypt, $password, $fileId);
849            } else {
850                throw new InvalidPdfException(
851                    'PDF uses public-key encryption; use fromFilePublicKey() or fromStringPublicKey() with certificate and private key',
852                );
853            }
854        }
855
856        // 6. Build resolver (with optional decryptor)
857        $resolver = new ObjectResolver(
858            $entries,
859            $tokenizer,
860            $source,
861            $objectParser,
862            $streamParser,
863            $decryptor,
864        );
865        $resolver->setStrict($strict);
866
867        // Wire resolver into stream parser for resolving indirect /DecodeParms
868        $streamParser->setResolver($resolver);
869
870        // 7. Follow /Prev chain for incremental updates (skip if reconstructed)
871        if (!$reconstructed) {
872            $prev = $trailer->get('Prev');
873            $seenPrevOffsets = [];
874            while ($prev instanceof PdfNumber) {
875                $prevOffset = (int) $prev->toPdf();
876
877                // Detect circular /Prev chains
878                if (isset($seenPrevOffsets[$prevOffset])) {
879                    $warnings[] = "Circular /Prev chain detected at offset $prevOffset";
880                    break;
881                }
882                $seenPrevOffsets[$prevOffset] = true;
883
884                try {
885                    [$olderEntries, $olderTrailer] = self::parseXrefAt(
886                        $source,
887                        $prevOffset,
888                        $xrefParser,
889                        $xrefStreamParser,
890                        $strict,
891                        $warnings,
892                    );
893                    $resolver->mergeOlderEntries($olderEntries);
894                    $prev = $olderTrailer->get('Prev');
895                } catch (\Throwable $e) {
896                    if ($strict) {
897                        throw $e instanceof InvalidPdfException ? $e : new InvalidPdfException($e->getMessage(), 0, $e);
898                    }
899                    $warnings[] = "/Prev chain parsing failed at offset $prevOffset" . $e->getMessage();
900                    break;
901                }
902            }
903        }
904
905        $reader = new self($version, $trailer, $resolver);
906        $reader->parseWarnings = $warnings;
907        $reader->strict = $strict;
908        return $reader;
909    }
910
911    /**
912     * Extract the first element of the /ID array from the trailer.
913     */
914    private static function extractFileId(PdfDictionary $trailer): string
915    {
916        $id = $trailer->get('ID');
917        if ($id instanceof PdfArray && isset($id->items[0])) {
918            $first = $id->items[0];
919            if ($first instanceof PdfString) {
920                return $first->value;
921            }
922        }
923        return '';
924    }
925
926    /**
927     * Auto-detect classic xref table vs cross-reference stream at the
928     * given offset and parse accordingly.
929     *
930     * @return array{0: array<int, XrefEntry>, 1: PdfDictionary}
931     */
932    /**
933     * @param list<string> $warnings
934     * @return array{0: array<int, XrefEntry>, 1: PdfDictionary}
935     */
936    private static function parseXrefAt(
937        Source $source,
938        int $offset,
939        XrefParser $classicParser,
940        XrefStreamParser $streamParser,
941        bool $strict = true,
942        array &$warnings = [],
943    ): array {
944        // Peek at the bytes at the offset to decide which parser to use.
945        $source->seek($offset);
946        $peek = $source->peek(4);
947        if (str_starts_with($peek, 'xref')) {
948            return $classicParser->parseClassicXref($offset, $strict, $warnings);
949        }
950        // Otherwise assume it's a cross-reference stream (starts with "N M obj")
951        return $streamParser->parseXrefStream($offset);
952    }
953
954    /**
955     * Scan backward from EOF to find the `startxref` byte offset.
956     *
957     * Returns null if startxref is missing or corrupted (allows lenient
958     * fallback to xref reconstruction).
959     */
960    private static function findStartxref(Source $source, bool $strict = true): ?int
961    {
962        $size = $source->size();
963
964        // Try progressively larger tail sizes: 1024, 8192, 65536
965        foreach ([1024, 8192, 65536] as $tryLength) {
966            $tailLength = min($tryLength, $size);
967            $source->seek($size - $tailLength);
968            $tail = $source->read($tailLength);
969
970            $pos = strrpos($tail, 'startxref');
971            if ($pos !== false) {
972                $after = substr($tail, $pos + strlen('startxref'));
973                if (preg_match('/\s+(\d+)/', $after, $m)) {
974                    return (int) $m[1];
975                }
976            }
977        }
978
979        if ($strict) {
980            throw new InvalidPdfException('Cannot find startxref');
981        }
982        return null;
983    }
984
985    /**
986     * Reconstruct xref entries and trailer by scanning for object definitions.
987     *
988     * Used as a fallback when the normal xref/trailer parsing fails in lenient mode.
989     *
990     * @return array{0: array<int, XrefEntry>, 1: PdfDictionary}
991     */
992    private static function reconstructXref(Source $source): array
993    {
994        $source->seek(0);
995        $allBytes = $source->read($source->size());
996
997        $objectMap = ObjectScanner::scan($allBytes);
998
999        if ($objectMap === []) {
1000            // No object headers found. Return an empty xref + synthetic
1001            // empty catalog so the caller can still emit a parseable
1002            // (if useless) document. Higher layers should treat this as
1003            // an irrecoverable file.
1004            $trailer = new PdfDictionary();
1005            $trailer->set('Size', new PdfNumber(1));
1006            return [[], $trailer];
1007        }
1008
1009        // Build xref entries
1010        $entries = [];
1011        foreach ($objectMap as $objNum => $offset) {
1012            $entries[$objNum] = new XrefEntry(XrefEntry::TYPE_IN_USE, $offset, 0);
1013        }
1014
1015        $catalogObjNum = self::findCatalogInScan($objectMap, $allBytes);
1016
1017        $maxObjNum = max(array_keys($objectMap));
1018        $trailer = new PdfDictionary();
1019        if ($catalogObjNum !== null) {
1020            $trailer->set('Root', new PdfReference($catalogObjNum, 0));
1021        }
1022        $trailer->set('Size', new PdfNumber($maxObjNum + 1));
1023
1024        return [$entries, $trailer];
1025    }
1026
1027    /**
1028     * Identify which scanned object is the catalog using progressively
1029     * looser heuristics. Returns the object number or null if no
1030     * reasonable candidate is found.
1031     *
1032     * @param array<int, int> $objectMap
1033     */
1034    private static function findCatalogInScan(array $objectMap, string $allBytes): ?int
1035    {
1036        $bytesLen = strlen($allBytes);
1037        $peek = static function (int $offset) use ($allBytes, $bytesLen): string {
1038            $peekLength = min(1024, $bytesLen - $offset);
1039            return $peekLength > 0 ? substr($allBytes, $offset, $peekLength) : '';
1040        };
1041
1042        // Pass 1: explicit /Type /Catalog
1043        foreach ($objectMap as $objNum => $offset) {
1044            if (preg_match('#/Type\s*/Catalog\b#', $peek($offset))) {
1045                return $objNum;
1046            }
1047        }
1048
1049        // Pass 2: a dict that has /Pages but no /Parent (heuristic for
1050        // catalog-with-missing-/Type, e.g. qpdf's bad8/bad11 stripped
1051        // catalogs).
1052        foreach ($objectMap as $objNum => $offset) {
1053            $body = $peek($offset);
1054            if (preg_match('#/Pages\s+\d+\s+\d+\s+R#', $body) && !preg_match('#/Parent\b#', $body)) {
1055                return $objNum;
1056            }
1057        }
1058
1059        // Pass 3: object body literally contains the word "Catalog"
1060        // somewhere (covers PDFs like bug_454695 where /Type /Catalog is
1061        // formatted oddly or appears inside a hex string comment).
1062        foreach ($objectMap as $objNum => $offset) {
1063            if (str_contains($peek($offset), 'Catalog')) {
1064                return $objNum;
1065            }
1066        }
1067
1068        return null;
1069    }
1070
1071    /**
1072     * Recursively collect Page dicts from a Pages tree node.
1073     *
1074     * @param list<PdfDictionary> $result
1075     */
1076    private function collectPages(PdfDictionary $node, array &$result): void
1077    {
1078        $kids = $node->get('Kids');
1079        if (!$kids instanceof PdfArray) {
1080            return;
1081        }
1082        foreach ($kids->items as $kidRef) {
1083            if (!$kidRef instanceof PdfReference) {
1084                continue;
1085            }
1086            $kid = $this->resolver->resolveReference($kidRef);
1087            if (!$kid instanceof PdfDictionary) {
1088                continue;
1089            }
1090            $type = $kid->get('Type');
1091            if ($type instanceof PdfName && $type->value === 'Pages') {
1092                $this->collectPages($kid, $result);
1093            } else {
1094                $result[] = $kid;
1095            }
1096        }
1097    }
1098}