Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
90.77% covered (success)
90.77%
59 / 65
80.00% covered (warning)
80.00%
4 / 5
CRAP
0.00% covered (danger)
0.00%
0 / 1
XrefParser
90.77% covered (success)
90.77%
59 / 65
80.00% covered (warning)
80.00%
4 / 5
38.08
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 parseClassicXref
85.00% covered (warning)
85.00%
34 / 40
0.00% covered (danger)
0.00%
0 / 1
11.41
 readLine
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
4
 skipWhitespace
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
9
 readWord
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
12
1<?php
2
3declare(strict_types=1);
4
5namespace Phpdftk\Pdf\Reader\Parser;
6
7use Phpdftk\Pdf\Core\PdfDictionary;
8use Phpdftk\Pdf\Reader\Exception\InvalidPdfException;
9use Phpdftk\Pdf\Reader\Tokenizer\Source;
10use Phpdftk\Pdf\Reader\Tokenizer\Tokenizer;
11use Phpdftk\Pdf\Reader\XrefEntry;
12
/**
 * Parses a classic cross-reference table and its trailer dictionary.
 *
 * Uses raw byte reads for the fixed-format xref section (to avoid
 * interleaving tokenized and raw reads on the same Source), then hands
 * off to the ObjectParser for the trailer dictionary only.
 */
final class XrefParser
{
    /**
     * Bytes treated as whitespace by the raw readers: NUL, HT, LF, FF, CR, SP.
     */
    private const WHITESPACE = ["\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20"];

    /**
     * Bytes that end a word: every whitespace byte plus the delimiters
     * '<', '/' and '['. Delimiters are left unread in the source.
     */
    private const WORD_TERMINATORS = ["\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20", '<', '/', '['];

    /**
     * Upper bound on bytes read for one entry line. The spec says exactly
     * 20 bytes, but some producers write 21 (extra space before CRLF),
     * so a few bytes of slack are allowed.
     */
    private const MAX_ENTRY_BYTES = 24;

    public function __construct(
        private readonly Tokenizer $tokenizer,
        private readonly Source $source,
        private readonly ObjectParser $objectParser,
    ) {}

    /**
     * Parse a classic xref table at the given byte offset.
     *
     * Reads the "xref" keyword, then every subsection (a "first count"
     * header followed by `count` entry lines) until the "trailer" keyword,
     * and finally parses the trailer dictionary through the ObjectParser.
     *
     * In strict mode any malformed subsection header or entry raises an
     * exception. In lenient mode the problem is appended to $warnings,
     * malformed entries are skipped, and parsing continues.
     *
     * @param int $offset Byte offset at which the "xref" keyword is expected
     *                    (leading whitespace is skipped).
     * @param bool $strict Whether malformed headers/entries are fatal.
     * @param list<string> $warnings Receives one message per tolerated problem.
     * @return array{0: array<int, XrefEntry>, 1: PdfDictionary}
     *
     * @throws InvalidPdfException If the "xref" keyword is missing, the table
     *                             ends before "trailer", a header or entry is
     *                             malformed in strict mode, or the trailer is
     *                             not a dictionary.
     */
    public function parseClassicXref(int $offset, bool $strict = true, array &$warnings = []): array
    {
        $this->source->seek($offset);

        // Read and verify the "xref" keyword.
        $this->skipWhitespace();
        $keyword = $this->readWord();
        if ($keyword !== 'xref') {
            throw new InvalidPdfException(
                "Expected 'xref' at offset $offset, got '$keyword'",
            );
        }

        $entries = [];

        // Parse subsections until we hit "trailer".
        while (true) {
            $this->skipWhitespace();
            $word = $this->readWord();
            if ($word === 'trailer') {
                break;
            }
            if ($word === '' || $this->source->isEof()) {
                throw new InvalidPdfException(
                    "Unexpected end of xref table at offset " . $this->source->tell() . ": expected 'trailer'",
                );
            }

            $this->skipWhitespace();
            $countWord = $this->readWord();

            // Both header fields must be plain unsigned decimal integers;
            // a bare (int) cast would silently turn garbage into 0.
            if (!ctype_digit($word) || !ctype_digit($countWord)) {
                $message = "Malformed xref subsection header at offset " . $this->source->tell()
                    . ": '$word $countWord'";
                if ($strict) {
                    throw new InvalidPdfException($message);
                }
                $warnings[] = $message;
            }

            // $word is the first object number of this subsection.
            $firstObj = (int) $word;
            $count = (int) $countWord;

            for ($i = 0; $i < $count; $i++) {
                // Entries start with a digit, so discarding leading whitespace
                // is safe. It also drops the LF of a CRLF pair left behind by
                // readLine() and any stray blank line between entries.
                $this->skipWhitespace();

                $line = $this->readLine(self::MAX_ENTRY_BYTES);
                if (!preg_match('/^(\d{10})\s+(\d{5})\s+([nf])/', $line, $em)) {
                    if ($strict) {
                        throw new InvalidPdfException(
                            "Malformed xref entry at offset " . $this->source->tell() . ": '$line'",
                        );
                    }
                    $warnings[] = "Skipped malformed xref entry for object " . ($firstObj + $i) . ": '$line'";

                    // Once the source is exhausted every remaining entry would
                    // fail the same way; stop here so a bogus count cannot
                    // flood $warnings. The outer loop reports the missing trailer.
                    if ($this->source->isEof()) {
                        break;
                    }
                    continue;
                }

                $entryOffset = (int) $em[1];
                $gen = (int) $em[2];
                $type = $em[3] === 'f' ? XrefEntry::TYPE_FREE : XrefEntry::TYPE_IN_USE;
                $entries[$firstObj + $i] = new XrefEntry($type, $entryOffset, $gen);
            }
        }

        // Now the source is positioned right after "trailer".
        // Sync the tokenizer to this position and parse the trailer dict.
        $this->tokenizer->seek($this->source->tell());
        $trailer = $this->objectParser->parseValue();
        if (!$trailer instanceof PdfDictionary) {
            throw new InvalidPdfException('Trailer is not a dictionary');
        }

        return [$entries, $trailer];
    }

    /**
     * Read up to $maxBytes, stopping at (and consuming) the first CR or LF.
     *
     * An xref entry may legally end in SP CR, SP LF or CR LF, so a lone CR
     * has to terminate the line as well. For CR LF the LF is left in the
     * source; the caller skips whitespace before reading the next entry.
     *
     * @return string The bytes read, without the line terminator.
     */
    private function readLine(int $maxBytes): string
    {
        $line = '';
        for ($i = 0; $i < $maxBytes; $i++) {
            $byte = $this->source->readByte();
            if ($byte === null || $byte === "\n" || $byte === "\r") {
                break;
            }
            $line .= $byte;
        }

        return $line;
    }

    /**
     * Advance the source past any run of whitespace bytes.
     *
     * Stops at the first non-whitespace byte (left unread) or at EOF.
     */
    private function skipWhitespace(): void
    {
        while (!$this->source->isEof()) {
            if (!in_array($this->source->peek(), self::WHITESPACE, true)) {
                return;
            }
            $this->source->readByte();
        }
    }

    /**
     * Read a contiguous run of non-whitespace, non-delimiter bytes.
     *
     * The terminating byte is not consumed. Returns an empty string when
     * the source is already at a terminator or at EOF.
     */
    private function readWord(): string
    {
        $word = '';
        while (!$this->source->isEof()) {
            $byte = $this->source->peek();
            if ($byte === '' || in_array($byte, self::WORD_TERMINATORS, true)) {
                break;
            }
            $word .= $this->source->readByte();
        }

        return $word;
    }
}