Code Coverage for /home/runner/work/phpdftk/phpdftk/packages/pdf/reader/src/PdfReader.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	82.19% covered (warning)	82.19%	360 / 438	62.50% covered (warning)	62.50%	25 / 40	CRAP	0.00% covered (danger)	0.00%	0 / 1
PdfReader	82.19% covered (warning)	82.19%	360 / 438	62.50% covered (warning)	62.50%	25 / 40	397.03	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
getParseWarnings	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
fromFile	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
fromString	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
fromFilePublicKey	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
fromStringPublicKey	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
fromStream	66.67% covered (warning)	66.67%	4 / 6	0.00% covered (danger)	0.00%	0 / 1	3.33
getVersion	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
getPdfVersion	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
getEffectiveVersion	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	5
validateVersion	63.41% covered (warning)	63.41%	26 / 41	0.00% covered (danger)	0.00%	0 / 1	66.39
isLinearized	75.00% covered (warning)	75.00%	12 / 16	0.00% covered (danger)	0.00%	0 / 1	11.56
getLinearizationParameters	90.48% covered (success)	90.48%	19 / 21	0.00% covered (danger)	0.00%	0 / 1	8.06
getPageOffsetHintTable	23.08% covered (danger)	23.08%	9 / 39	0.00% covered (danger)	0.00%	0 / 1	202.07
getPageByteRange	50.00% covered (danger)	50.00%	3 / 6	0.00% covered (danger)	0.00%	0 / 1	4.12
getTrailer	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
getCatalog	88.24% covered (warning)	88.24%	15 / 17	0.00% covered (danger)	0.00%	0 / 1	7.08
recoverCatalog	80.00% covered (warning)	80.00%	20 / 25	0.00% covered (danger)	0.00%	0 / 1	10.80
getInfo	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	3
getPageCount	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	4
getPages	80.00% covered (warning)	80.00%	8 / 10	0.00% covered (danger)	0.00%	0 / 1	3.07
getPage	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	2
getObject	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
resolveReference	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
getResolver	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
extractText	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
extractAllText	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	2
extractTextWithPositions	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
extractAllTextWithPositions	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	2
getTypedCatalog	87.50% covered (warning)	87.50%	7 / 8	0.00% covered (danger)	0.00%	0 / 1	3.02
getTypedPage	83.33% covered (warning)	83.33%	5 / 6	0.00% covered (danger)	0.00%	0 / 1	2.02
getTypedPages	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	3
getTypedObject	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	3
build	98.02% covered (success)	98.02%	99 / 101	0.00% covered (danger)	0.00%	0 / 1	25
extractFileId	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	4
parseXrefAt	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	2
findStartxref	100.00% covered (success)	100.00%	13 / 13	100.00% covered (success)	100.00%	1 / 1	5
reconstructXref	82.35% covered (warning)	82.35%	14 / 17	0.00% covered (danger)	0.00%	0 / 1	4.09
findCatalogInScan	87.50% covered (warning)	87.50%	14 / 16	0.00% covered (danger)	0.00%	0 / 1	9.16
collectPages	69.23% covered (warning)	69.23%	9 / 13	0.00% covered (danger)	0.00%	0 / 1	8.43

1	<?php
2
3	declare(strict_types=1);
4
5	namespace Phpdftk\Pdf\Reader;
6
7	use Phpdftk\Pdf\Core\Document\Catalog;
8	use Phpdftk\Pdf\Core\Document\Page;
9	use Phpdftk\Pdf\Core\File\PdfHydrator;
10	use Phpdftk\Pdf\Core\PdfArray;
11	use Phpdftk\Pdf\Core\PdfDictionary;
12	use Phpdftk\Pdf\Core\PdfName;
13	use Phpdftk\Pdf\Core\PdfNumber;
14	use Phpdftk\Pdf\Core\PdfObject;
15	use Phpdftk\Pdf\Core\PdfReference;
16	use Phpdftk\Pdf\Core\PdfString;
17	use Phpdftk\Pdf\Core\PdfVersion;
18	use Phpdftk\Pdf\Core\Serializable;
19	use Phpdftk\Pdf\Reader\Exception\InvalidPdfException;
20	use Phpdftk\Pdf\Reader\Parser\HintTableParser;
21	use Phpdftk\Pdf\Reader\Parser\ObjectParser;
22	use Phpdftk\Pdf\Reader\Parser\ObjectScanner;
23	use Phpdftk\Pdf\Reader\Parser\PageOffsetHintTable;
24	use Phpdftk\Pdf\Reader\Parser\StreamParser;
25	use Phpdftk\Pdf\Reader\Parser\XrefParser;
26	use Phpdftk\Pdf\Reader\Parser\XrefStreamParser;
27	use Phpdftk\Pdf\Reader\Tokenizer\FileSource;
28	use Phpdftk\Pdf\Reader\Tokenizer\Source;
29	use Phpdftk\Pdf\Reader\Tokenizer\StringSource;
30	use Phpdftk\Pdf\Reader\Tokenizer\Tokenizer;
31
32	/**
33	* PDF reader — parses existing PDF files into the phpdftk object model.
34	*
35	* Supports classic cross-reference tables and cross-reference streams, as well
36	* as password- and certificate-encrypted PDFs. Accessors such as getCatalog() return
37	* raw `PdfDictionary` objects; the getTyped*() methods hydrate them into `Catalog`, `Page`, etc.
38	*
39	* Three factory methods mirror the writer's output modes:
40	*
41	* ```php
42	* $pdf = PdfReader::fromFile('/path/to/document.pdf');
43	* $pdf = PdfReader::fromString($bytes);
44	* $pdf = PdfReader::fromStream(fopen('php://stdin', 'rb'));
45	* ```
46	*
47	* @api
48	*/
49	final class PdfReader
50	{
51	/** @var list<string> */
52	private array $parseWarnings = [];
53
54	private bool $strict = true;
55
/**
 * Private: instances are produced by the static factory methods.
 *
 * @param string         $version  Version string captured from the `%PDF-x.y` header.
 * @param PdfDictionary  $trailer  Trailer dictionary (parsed or reconstructed).
 * @param ObjectResolver $resolver Resolver used for all indirect-object lookups.
 */
private function __construct(
    private readonly string $version,
    private readonly PdfDictionary $trailer,
    private readonly ObjectResolver $resolver,
) {
}
61
/**
 * Warnings recorded so far: those collected while the document was built,
 * plus any appended later by lenient-mode catalog recovery.
 *
 * @return list<string>
 */
public function getParseWarnings(): array
{
    return $this->parseWarnings;
}
71
72	// -----------------------------------------------------------------------
73	// Factory methods
74	// -----------------------------------------------------------------------
75
/**
 * Read a PDF from a file on disk.
 *
 * @param string $path     Path handed to FileSource.
 * @param string $password Password used when the document is encrypted with a
 *                         non-public-key security handler.
 * @param bool   $strict   When false, recoverable parse problems are recorded as
 *                         warnings instead of being thrown.
 */
public static function fromFile(string $path, string $password = '', bool $strict = true): self
{
    $source = new FileSource($path);

    return self::build($source, $password, $strict);
}
80
/**
 * Read a PDF whose complete bytes are already in memory.
 *
 * @param string $content  Raw PDF bytes.
 * @param string $password Password used when the document is encrypted with a
 *                         non-public-key security handler.
 * @param bool   $strict   When false, recoverable parse problems are recorded as
 *                         warnings instead of being thrown.
 */
public static function fromString(string $content, string $password = '', bool $strict = true): self
{
    $source = new StringSource($content);

    return self::build($source, $password, $strict);
}
85
/**
 * Read a public-key (certificate-based) encrypted PDF from a file.
 *
 * The certificate and private key are forwarded unchanged to the decryptor;
 * no password is used on this path.
 *
 * @param string $path        Path handed to FileSource.
 * @param string $certificate Recipient certificate (format as expected by PdfDecryptor).
 * @param string $privateKey  Matching private key (format as expected by PdfDecryptor).
 * @param bool   $strict      When false, recoverable parse problems become warnings.
 */
public static function fromFilePublicKey(
    string $path,
    string $certificate,
    string $privateKey,
    bool $strict = true,
): self {
    $source = new FileSource($path);

    return self::build($source, '', $strict, $certificate, $privateKey);
}
97
/**
 * Read a public-key (certificate-based) encrypted PDF from a string.
 *
 * The certificate and private key are forwarded unchanged to the decryptor;
 * no password is used on this path.
 *
 * @param string $content     Raw PDF bytes.
 * @param string $certificate Recipient certificate (format as expected by PdfDecryptor).
 * @param string $privateKey  Matching private key (format as expected by PdfDecryptor).
 * @param bool   $strict      When false, recoverable parse problems become warnings.
 */
public static function fromStringPublicKey(
    string $content,
    string $certificate,
    string $privateKey,
    bool $strict = true,
): self {
    $source = new StringSource($content);

    return self::build($source, '', $strict, $certificate, $privateKey);
}
109
/**
 * Read a PDF from an already opened stream.
 *
 * The remaining stream contents are read into memory in one go and then
 * handled exactly like fromString().
 *
 * @param resource $stream
 *
 * @throws \InvalidArgumentException When $stream is not a resource.
 * @throws \RuntimeException         When the stream cannot be read.
 */
public static function fromStream($stream, string $password = '', bool $strict = true): self
{
    if (!is_resource($stream)) {
        throw new \InvalidArgumentException('Expected a stream resource');
    }

    $bytes = stream_get_contents($stream);
    if ($bytes !== false) {
        return self::fromString($bytes, $password, $strict);
    }

    throw new \RuntimeException('Failed to read stream');
}
122
123	// -----------------------------------------------------------------------
124	// Public API
125	// -----------------------------------------------------------------------
126
/**
 * Version string exactly as captured from the `%PDF-x.y` header, e.g. "1.7".
 */
public function getVersion(): string
{
    return $this->version;
}
132
/**
 * Header version as a PdfVersion enum case.
 *
 * Falls back to PdfVersion::V1_7 when the header string does not map to a
 * known case.
 */
public function getPdfVersion(): PdfVersion
{
    $known = PdfVersion::tryFrom($this->version);
    if ($known === null) {
        return PdfVersion::V1_7;
    }

    return $known;
}
138
/**
 * Effective PDF version — max(header, catalog /Version).
 *
 * Per ISO 32000 §7.2.2, the catalog /Version entry (PDF 1.4+) overrides
 * the header version if it is higher. A missing, non-name or unrecognised
 * /Version entry leaves the header version in effect.
 */
public function getEffectiveVersion(): PdfVersion
{
    $fromHeader = $this->getPdfVersion();
    $catalog = $this->getCatalog();

    if (!$catalog instanceof PdfDictionary || !$catalog->has('Version')) {
        return $fromHeader;
    }

    $entry = $catalog->get('Version');
    if (!$entry instanceof PdfName) {
        return $fromHeader;
    }

    $fromCatalog = PdfVersion::tryFrom($entry->value);

    return $fromCatalog === null ? $fromHeader : $fromHeader->max($fromCatalog);
}
162
/**
 * Scan the document for structural features that need a newer PDF version
 * than the document declares, and return one warning per finding.
 *
 * Only top-level indicators visible in raw dictionaries are inspected:
 * the trailer /Type, the /Encrypt dictionary's /V, a fixed set of catalog
 * keys, and the linearization /N value against the actual page count.
 *
 * @return list<string> Warning messages; empty when nothing inconsistent was found.
 *
 * @throws InvalidPdfException In strict mode when the catalog cannot be
 *                             resolved: getEffectiveVersion() reads the
 *                             catalog before any of the checks below run.
 */
public function validateVersion(): array
{
    $warnings = [];
    $version = $this->getEffectiveVersion();
    $declared = $version->value;

    // Xref stream → requires 1.5
    $trailerType = $this->trailer->get('Type');
    if (
        $trailerType instanceof PdfName
        && $trailerType->value === 'XRef'
        && !$version->isAtLeast(PdfVersion::V1_5)
    ) {
        $warnings[] = "Cross-reference stream requires PDF 1.5, but document declares {$declared}";
    }

    // Encryption version. /Encrypt may be an indirect reference or a direct
    // dictionary; build() accepts both, so both are checked here.
    $encDict = $this->trailer->get('Encrypt');
    if ($encDict instanceof PdfReference) {
        $encDict = $this->resolver->resolveReference($encDict);
    }
    if ($encDict instanceof PdfDictionary) {
        $v = $encDict->get('V');
        $vVal = $v instanceof PdfNumber ? (int) $v->toPdf() : 0;
        $required = match (true) {
            $vVal >= 5 => PdfVersion::V2_0,
            $vVal >= 4 => PdfVersion::V1_6,
            $vVal >= 2 => PdfVersion::V1_4,
            default => PdfVersion::V1_0,
        };
        if ($required->isGreaterThan($version)) {
            $warnings[] = "Encryption V={$vVal} requires PDF {$required->value}, but document declares {$declared}";
        }
    }

    // Catalog-level structural checks: [catalog key, minimum version, message prefix].
    $catalogFeatures = [
        ['OCProperties', PdfVersion::V1_5, 'Optional content (/OCProperties) requires PDF 1.5'],
        ['Collection', PdfVersion::V1_7, 'PDF Portfolio (/Collection) requires PDF 1.7'],
        ['DPartRoot', PdfVersion::V2_0, 'Document parts (/DPartRoot) requires PDF 2.0'],
        ['DSS', PdfVersion::V2_0, 'Document security store (/DSS) requires PDF 2.0'],
        ['AF', PdfVersion::V2_0, 'Associated files (/AF) requires PDF 2.0'],
        ['Requirements', PdfVersion::V1_7, 'Requirements (/Requirements) requires PDF 1.7'],
    ];
    try {
        $catalog = $this->getCatalog();
        foreach ($catalogFeatures as [$key, $minimum, $message]) {
            if ($catalog->has($key) && !$version->isAtLeast($minimum)) {
                $warnings[] = "{$message}, but document declares {$declared}";
            }
        }
    } catch (InvalidPdfException) {
        // Can't resolve catalog — skip structural checks
    }

    // Linearization integrity: /N must agree with the page tree's /Count.
    $linParams = $this->getLinearizationParameters();
    if ($linParams !== null && $linParams['pageCount'] > 0) {
        $actualPageCount = $this->getPageCount();
        if ($linParams['pageCount'] !== $actualPageCount) {
            $warnings[] = sprintf(
                'Linearization /N (%d) does not match actual page count (%d)',
                $linParams['pageCount'],
                $actualPageCount,
            );
        }
    }

    return $warnings;
}
244
/**
 * Check whether this PDF is linearized (web-optimized).
 *
 * A linearized PDF carries a linearization parameter dictionary, recognisable
 * by its /Linearized key. Its object number is not fixed, so object numbers
 * 1–3 are always probed, followed by 4 up to min(50, trailer /Size).
 * Objects that fail to resolve are skipped. The hint tables themselves are
 * not consulted.
 */
public function isLinearized(): bool
{
    $sizeEntry = $this->trailer->get('Size');
    $sizeLimit = min(50, (int) ($sizeEntry instanceof PdfNumber ? $sizeEntry->toPdf() : 50));

    // Object numbers 1..3 are checked unconditionally, the rest only up to the limit.
    $lastObject = max(3, $sizeLimit);

    for ($objNum = 1; $objNum <= $lastObject; $objNum++) {
        try {
            $candidate = $this->resolver->resolve($objNum);
        } catch (\Throwable) {
            continue;
        }

        if ($candidate instanceof PdfDictionary && $candidate->get('Linearized') !== null) {
            return true;
        }
    }

    return false;
}
288
/**
 * Get linearization parameters if the PDF is linearized.
 *
 * Probes object numbers 1 up to min(10, trailer /Size) and uses the first
 * dictionary that has a /Linearized key. Entries that are missing or not
 * numeric are reported as 0 (0.0 for 'linearized').
 *
 * @return array{linearized: float, fileLength: int, firstPageObj: int, firstPageEnd: int, pageCount: int, xrefOffset: int}|null
 */
public function getLinearizationParameters(): ?array
{
    $sizeEntry = $this->trailer->get('Size');
    $limit = min(10, (int) ($sizeEntry instanceof PdfNumber ? $sizeEntry->toPdf() : 10));

    for ($objNum = 1; $objNum <= $limit; $objNum++) {
        try {
            $candidate = $this->resolver->resolve($objNum);
        } catch (\Throwable) {
            continue;
        }

        if ($candidate instanceof PdfDictionary && $candidate->get('Linearized') !== null) {
            $intAt = static function (string $key) use ($candidate): int {
                $value = $candidate->get($key);

                return $value instanceof PdfNumber ? (int) $value->toPdf() : 0;
            };
            $marker = $candidate->get('Linearized');

            return [
                'linearized' => $marker instanceof PdfNumber ? (float) $marker->toPdf() : 0.0,
                'fileLength' => $intAt('L'),
                'firstPageObj' => $intAt('O'),
                'firstPageEnd' => $intAt('E'),
                'pageCount' => $intAt('N'),
                'xrefOffset' => $intAt('T'),
            ];
        }
    }

    return null;
}
326
/**
 * Parse the page offset hint table from a linearized PDF.
 *
 * NOTE: decoding of the hint stream is not implemented yet. The method
 * validates the linearization dictionary's /H entry and probes early
 * objects for a hint-stream candidate, but the raw stream bytes are never
 * read, so every path currently returns null.
 *
 * The /H lookup only inspects object numbers 1 and 2, which is narrower
 * than the range scanned by getLinearizationParameters().
 *
 * @return PageOffsetHintTable|null Currently always null.
 */
public function getPageOffsetHintTable(): ?PageOffsetHintTable
{
    $params = $this->getLinearizationParameters();
    if ($params === null) {
        return null;
    }

    // Find the /H array from the linearization dict
    foreach ([1, 2] as $objNum) {
        try {
            $obj = $this->resolver->resolve($objNum);
        } catch (\Throwable) {
            continue;
        }
        if (!$obj instanceof PdfDictionary || $obj->get('Linearized') === null) {
            continue;
        }

        $hArray = $obj->get('H');
        if (!$hArray instanceof PdfArray || count($hArray->items) < 2) {
            return null;
        }

        $hintOffset = $hArray->items[0] instanceof PdfNumber
            ? (int) $hArray->items[0]->toPdf() : 0;
        $hintLength = $hArray->items[1] instanceof PdfNumber
            ? (int) $hArray->items[1]->toPdf() : 0;

        if ($hintOffset <= 0 || $hintLength <= 0) {
            return null;
        }

        try {
            // Probe objects 1..min(20, pageCount + 10) and stop at the first
            // dictionary with a numeric /S entry. Nothing is done with the
            // match yet; the probe is kept so the resolver sees the same
            // lookups as before.
            $lastCandidate = min(20, $params['pageCount'] + 10);
            for ($n = 1; $n <= $lastCandidate; $n++) {
                try {
                    $candidate = $this->resolver->resolve($n);
                } catch (\Throwable) {
                    continue;
                }
                if (
                    $candidate instanceof PdfDictionary
                    && $candidate->has('S')
                    && ($candidate->get('S') instanceof PdfNumber)
                ) {
                    break;
                }
            }
        } catch (\Throwable) {
            return null;
        }

        // TODO: read $hintLength bytes at $hintOffset and decode the page
        // offset table; until then no hint table can be produced.
        return null;
    }

    return null;
}
417
/**
 * Calculate the byte range for a specific page in a linearized PDF.
 *
 * Delegates to the page offset hint table. Returns null when no hint table
 * is available or when the table rejects the index with an
 * \OutOfRangeException.
 *
 * @return array{offset: int, length: int}|null
 */
public function getPageByteRange(int $pageIndex): ?array
{
    $hints = $this->getPageOffsetHintTable();

    if ($hints !== null) {
        try {
            return $hints->getPageByteRange($pageIndex);
        } catch (\OutOfRangeException) {
            // Unknown page index: handled the same as "no hints".
        }
    }

    return null;
}
439
/**
 * The raw trailer dictionary this reader was built with.
 */
public function getTrailer(): PdfDictionary
{
    return $this->trailer;
}
445
/**
 * Resolve /Root from the trailer — returns the Catalog dictionary.
 *
 * Strict mode: a resolution failure is rethrown, and a missing or
 * non-dictionary /Root raises InvalidPdfException.
 *
 * Lenient mode: the failure is recorded as a parse warning, then
 * recoverCatalog() is tried; if that also fails, an empty dictionary is
 * returned (with a warning) so callers can keep going.
 *
 * @throws InvalidPdfException In strict mode when no catalog dictionary can be resolved.
 */
public function getCatalog(): PdfDictionary
{
    $rootRef = $this->trailer->get('Root');

    if ($rootRef instanceof PdfReference) {
        $resolved = null;
        try {
            $resolved = $this->resolver->resolveReference($rootRef);
        } catch (\Throwable $failure) {
            if ($this->strict) {
                throw $failure;
            }
            $this->parseWarnings[] = 'Failed to resolve /Root: ' . $failure->getMessage();
        }

        if ($resolved instanceof PdfDictionary) {
            return $resolved;
        }
    }

    if ($this->strict) {
        throw new InvalidPdfException('Unable to resolve /Root catalog');
    }

    $recovered = $this->recoverCatalog();
    if ($recovered !== null) {
        return $recovered;
    }

    // Last resort: hand back an empty dictionary and leave a warning behind.
    $this->parseWarnings[] = 'No usable /Type /Catalog found; returning empty catalog';

    return new PdfDictionary();
}
480
/**
 * Lenient-mode catalog recovery based on the resolver's object scan.
 *
 * Pass 1 returns the first object whose leading bytes (512 requested from
 * its offset) match `/Type /Catalog` and that resolves to a dictionary.
 * Pass 2 looks for `/Type /Pages` the same way and, on a hit, returns a
 * synthetic catalog whose /Pages points at that object (generation 0).
 * Each success adds a parse warning. Returns null when the scan is empty
 * or neither pass finds anything.
 */
private function recoverCatalog(): ?PdfDictionary
{
    $objectOffsets = $this->resolver->scanObjectMap();
    if ($objectOffsets === []) {
        return null;
    }

    // Pass 1: a genuine catalog object.
    foreach ($objectOffsets as $number => $byteOffset) {
        $head = $this->resolver->readRaw($byteOffset, 512);
        if (!preg_match('#/Type\s*/Catalog\b#', $head)) {
            continue;
        }

        try {
            $candidate = $this->resolver->resolve($number);
        } catch (\Throwable) {
            continue;
        }

        if ($candidate instanceof PdfDictionary) {
            $this->parseWarnings[] = "Recovered /Root catalog by scanning (object $number)";

            return $candidate;
        }
    }

    // Pass 2: a page-tree node, wrapped in a minimal stand-in catalog.
    foreach ($objectOffsets as $number => $byteOffset) {
        $head = $this->resolver->readRaw($byteOffset, 512);
        if (!preg_match('#/Type\s*/Pages\b#', $head)) {
            continue;
        }

        try {
            $candidate = $this->resolver->resolve($number);
        } catch (\Throwable) {
            continue;
        }

        if ($candidate instanceof PdfDictionary) {
            $standIn = new PdfDictionary();
            $standIn->set('Type', new PdfName('Catalog'));
            $standIn->set('Pages', new PdfReference($number, 0));
            $this->parseWarnings[] = "Synthesised catalog from /Pages object $number";

            return $standIn;
        }
    }

    return null;
}
531
/**
 * Resolve /Info from the trailer.
 *
 * Returns null when /Info is absent, is not an indirect reference, or does
 * not resolve to a dictionary.
 */
public function getInfo(): ?PdfDictionary
{
    $infoRef = $this->trailer->get('Info');
    if (!$infoRef instanceof PdfReference) {
        return null;
    }

    $target = $this->resolver->resolveReference($infoRef);

    return $target instanceof PdfDictionary ? $target : null;
}
544
/**
 * Page count as declared by the page tree root (/Pages -> /Count).
 *
 * Returns 0 when /Pages is not an indirect reference, does not resolve to a
 * dictionary, or has no numeric /Count. The tree itself is not walked.
 */
public function getPageCount(): int
{
    $pagesRef = $this->getCatalog()->get('Pages');
    if (!$pagesRef instanceof PdfReference) {
        return 0;
    }

    $treeRoot = $this->resolver->resolveReference($pagesRef);
    if (!$treeRoot instanceof PdfDictionary) {
        return 0;
    }

    $declared = $treeRoot->get('Count');

    return $declared instanceof PdfNumber ? (int) $declared->toPdf() : 0;
}
561
/**
 * Get all Page dictionaries by traversing the page tree.
 *
 * Returns an empty list when the catalog's /Pages entry is not an indirect
 * reference or does not resolve to a dictionary.
 *
 * @return list<PdfDictionary>
 */
public function getPages(): array
{
    $pagesRef = $this->getCatalog()->get('Pages');

    $treeRoot = $pagesRef instanceof PdfReference
        ? $this->resolver->resolveReference($pagesRef)
        : null;

    if (!$treeRoot instanceof PdfDictionary) {
        return [];
    }

    $collected = [];
    $this->collectPages($treeRoot, $collected);

    return $collected;
}
582
/**
 * Get a specific page by zero-based index.
 *
 * The whole page tree is traversed on every call.
 *
 * @throws \OutOfRangeException When no page exists at $index.
 */
public function getPage(int $index): PdfDictionary
{
    $all = $this->getPages();

    if (isset($all[$index])) {
        return $all[$index];
    }

    $lastIndex = count($all) - 1;

    throw new \OutOfRangeException("Page index $index out of range (0.." . $lastIndex . ')');
}
592
/**
 * Resolve any object by its object number via the underlying resolver.
 */
public function getObject(int $objNum): Serializable
{
    return $this->resolver->resolve($objNum);
}
598
/**
 * Resolve an indirect reference to its target via the underlying resolver.
 */
public function resolveReference(PdfReference $ref): Serializable
{
    return $this->resolver->resolveReference($ref);
}
604
/**
 * The object resolver this reader delegates all lookups to.
 */
public function getResolver(): ObjectResolver
{
    return $this->resolver;
}
610
611	// -----------------------------------------------------------------------
612	// Text extraction
613	// -----------------------------------------------------------------------
614
/**
 * Extract text from a page by index (zero-based).
 *
 * The page is looked up with getPage() and handed to a fresh TextExtractor,
 * which interprets the content stream and resolves font encodings.
 *
 * @throws \OutOfRangeException When no page exists at $pageIndex.
 */
public function extractText(int $pageIndex): string
{
    $page = $this->getPage($pageIndex);

    return (new TextExtractor($this->resolver))->extractFromPage($page);
}
628
/**
 * Extract text from all pages and join the per-page results.
 *
 * A single TextExtractor instance is reused for every page.
 *
 * @param string $separator Separator between pages (default: newline)
 */
public function extractAllText(string $separator = "\n"): string
{
    $pages = $this->getPages();
    $extractor = new TextExtractor($this->resolver);

    $perPage = array_map(
        static fn ($page) => $extractor->extractFromPage($page),
        $pages,
    );

    return implode($separator, $perPage);
}
644
/**
 * Extract positioned text from a page by index (zero-based).
 *
 * The page is looked up with getPage() and handed to a fresh
 * PositionedTextExtractor, which produces the TextSpan list.
 *
 * @return list<TextSpan>
 *
 * @throws \OutOfRangeException When no page exists at $pageIndex.
 */
public function extractTextWithPositions(int $pageIndex): array
{
    $page = $this->getPage($pageIndex);

    return (new PositionedTextExtractor($this->resolver))->extractFromPage($page);
}
660
/**
 * Extract positioned text from every page.
 *
 * A single PositionedTextExtractor instance is reused for every page; the
 * result keeps the keys of getPages().
 *
 * @return array<int, list<TextSpan>> Zero-based page index => spans
 */
public function extractAllTextWithPositions(): array
{
    $pages = $this->getPages();
    $extractor = new PositionedTextExtractor($this->resolver);

    // With a single input array, array_map() preserves the original keys.
    return array_map(
        static fn ($page) => $extractor->extractFromPage($page),
        $pages,
    );
}
676
677	// -----------------------------------------------------------------------
678	// Hydration — typed object access
679	// -----------------------------------------------------------------------
680
/**
 * Return the document catalog as a typed Catalog object.
 *
 * The object number passed to the hydrator is taken from the trailer's
 * /Root reference, or 0 when /Root is not an indirect reference.
 *
 * @throws InvalidPdfException When hydration does not yield a Catalog.
 */
public function getTypedCatalog(): Catalog
{
    PdfHydrator::registerDefaults();

    $catalogDict = $this->getCatalog();
    $rootRef = $this->trailer->get('Root');

    $objectNumber = 0;
    if ($rootRef instanceof PdfReference) {
        $objectNumber = $rootRef->objectNumber;
    }

    $hydrated = PdfHydrator::hydrate($catalogDict, $objectNumber);
    if (!$hydrated instanceof Catalog) {
        throw new InvalidPdfException('Failed to hydrate /Root as Catalog');
    }

    return $hydrated;
}
698
/**
 * Return a specific page (zero-based index) as a typed Page object.
 *
 * @throws \OutOfRangeException When no page exists at $index.
 * @throws InvalidPdfException  When hydration does not yield a Page.
 */
public function getTypedPage(int $index): Page
{
    PdfHydrator::registerDefaults();

    $hydrated = PdfHydrator::hydrate($this->getPage($index));
    if (!$hydrated instanceof Page) {
        throw new InvalidPdfException("Failed to hydrate page $index as Page");
    }

    return $hydrated;
}
714
/**
 * Return all pages as typed Page objects.
 *
 * Page dictionaries that do not hydrate into a Page are silently left out,
 * so the result can be shorter than getPages().
 *
 * @return list<Page>
 */
public function getTypedPages(): array
{
    PdfHydrator::registerDefaults();

    $typed = [];
    foreach ($this->getPages() as $pageDict) {
        $hydrated = PdfHydrator::hydrate($pageDict);
        if (!$hydrated instanceof Page) {
            continue;
        }
        $typed[] = $hydrated;
    }

    return $typed;
}
732
/**
 * Hydrate any resolved object by object number.
 *
 * Dictionaries go through the hydrator; other PdfObject instances are
 * returned as they are; anything else yields an empty PdfDictionary.
 */
public function getTypedObject(int $objNum): PdfObject|PdfDictionary
{
    PdfHydrator::registerDefaults();

    $resolved = $this->resolver->resolve($objNum);

    return match (true) {
        $resolved instanceof PdfDictionary => PdfHydrator::hydrate($resolved, $objNum),
        $resolved instanceof PdfObject => $resolved,
        default => new PdfDictionary(),
    };
}
748
749	// -----------------------------------------------------------------------
750	// Internal
751	// -----------------------------------------------------------------------
752
/**
 * Shared construction path behind all factory methods.
 *
 * Steps: validate the `%PDF-` header, assemble the parser chain, locate and
 * parse the cross-reference data (or rebuild it from an object scan in
 * lenient mode), set up decryption when the trailer has /Encrypt, create
 * the resolver, then merge older xref sections by following /Prev.
 *
 * In strict mode any failure surfaces as InvalidPdfException; in lenient
 * mode recoverable problems are collected as parse warnings instead.
 *
 * @param Source      $source      Byte source of the document.
 * @param string      $password    Password for non-public-key encryption.
 * @param bool        $strict      Whether to throw on recoverable problems.
 * @param string|null $certificate Certificate for Adobe.PubSec encryption.
 * @param string|null $privateKey  Private key for Adobe.PubSec encryption.
 *
 * @throws InvalidPdfException
 */
private static function build(
    Source $source,
    string $password = '',
    bool $strict = true,
    ?string $certificate = null,
    ?string $privateKey = null,
): self {
    $warnings = [];

    // Any non-domain failure is surfaced as InvalidPdfException, keeping the cause.
    $asInvalidPdf = static fn (\Throwable $cause): InvalidPdfException => $cause instanceof InvalidPdfException
        ? $cause
        : new InvalidPdfException($cause->getMessage(), 0, $cause);

    // 1. Header: expected at byte 0; lenient mode also accepts it within the first 1024 bytes.
    $leadingBytes = $source->read(20);
    if (preg_match('/^%PDF-(\d+\.\d+)/', $leadingBytes, $headerMatch)) {
        $version = $headerMatch[1];
    } else {
        $source->seek(0);
        $searchWindow = $source->read(min(1024, $source->size()));
        if ($strict || !preg_match('/%PDF-(\d+\.\d+)/', $searchWindow, $headerMatch)) {
            throw new InvalidPdfException('Not a PDF file (missing %PDF- header)');
        }
        $version = $headerMatch[1];
        $warnings[] = 'PDF header not at byte 0; found at offset ' . strpos($searchWindow, '%PDF-');
    }

    // 2. Parser chain
    $tokenizer = new Tokenizer($source);
    $objectParser = new ObjectParser($tokenizer, $source);
    $streamParser = new StreamParser();
    $xrefParser = new XrefParser($tokenizer, $source, $objectParser);
    $xrefStreamParser = new XrefStreamParser($tokenizer, $source, $objectParser, $streamParser);

    // 3. startxref → xref + trailer (classic table or stream, auto-detected)
    $entries = null;
    $trailer = null;
    $reconstructed = false;

    try {
        $startxrefOffset = self::findStartxref($source, $strict);
        if ($startxrefOffset !== null) {
            [$entries, $trailer] = self::parseXrefAt(
                $source,
                $startxrefOffset,
                $xrefParser,
                $xrefStreamParser,
                $strict,
                $warnings,
            );
        }
    } catch (\Throwable $failure) {
        if ($strict) {
            throw $asInvalidPdf($failure);
        }
        // Lenient: remember the problem and fall through to reconstruction.
        $warnings[] = 'xref parsing failed: ' . $failure->getMessage();
    }

    // 4. Reconstruction fallback (lenient mode only)
    if ($entries === null || $trailer === null) {
        if ($strict) {
            throw new InvalidPdfException('Cannot parse xref table or trailer');
        }
        [$entries, $trailer] = self::reconstructXref($source);
        $warnings[] = 'xref table reconstructed from object scan';
        $reconstructed = true;
    }

    // 5. Decryptor, when the trailer carries /Encrypt
    $decryptor = null;
    $encrypt = $trailer->get('Encrypt');
    if ($encrypt instanceof PdfReference) {
        // An indirect /Encrypt has to be resolved before a decryptor exists,
        // hence the throw-away resolver without one.
        $bootstrapResolver = new ObjectResolver($entries, $tokenizer, $source, $objectParser, $streamParser);
        $encryptTarget = $bootstrapResolver->resolveReference($encrypt);
        if ($encryptTarget instanceof PdfDictionary) {
            $encrypt = $encryptTarget;
        }
    }
    if ($encrypt instanceof PdfDictionary) {
        $fileId = self::extractFileId($trailer);
        $filter = $encrypt->get('Filter');
        $isPublicKey = $filter instanceof PdfName && $filter->value === 'Adobe.PubSec';

        if (!$isPublicKey) {
            $decryptor = PdfDecryptor::fromEncryptDict($encrypt, $password, $fileId);
        } elseif ($certificate !== null && $privateKey !== null) {
            $decryptor = PdfDecryptor::fromEncryptDictPublicKey(
                $encrypt,
                $certificate,
                $privateKey,
                $fileId,
            );
        } else {
            throw new InvalidPdfException(
                'PDF uses public-key encryption; use fromFilePublicKey() or fromStringPublicKey() with certificate and private key',
            );
        }
    }

    // 6. Resolver (with optional decryptor)
    $resolver = new ObjectResolver(
        $entries,
        $tokenizer,
        $source,
        $objectParser,
        $streamParser,
        $decryptor,
    );
    $resolver->setStrict($strict);

    // The stream parser needs the resolver for indirect /DecodeParms.
    $streamParser->setResolver($resolver);

    // 7. Incremental updates: follow /Prev (pointless after reconstruction)
    if (!$reconstructed) {
        $visitedOffsets = [];
        $prev = $trailer->get('Prev');

        while ($prev instanceof PdfNumber) {
            $prevOffset = (int) $prev->toPdf();

            // A repeated offset means the chain loops back on itself.
            if (isset($visitedOffsets[$prevOffset])) {
                $warnings[] = "Circular /Prev chain detected at offset $prevOffset";
                break;
            }
            $visitedOffsets[$prevOffset] = true;

            try {
                [$olderEntries, $olderTrailer] = self::parseXrefAt(
                    $source,
                    $prevOffset,
                    $xrefParser,
                    $xrefStreamParser,
                    $strict,
                    $warnings,
                );
                $resolver->mergeOlderEntries($olderEntries);
                $prev = $olderTrailer->get('Prev');
            } catch (\Throwable $failure) {
                if ($strict) {
                    throw $asInvalidPdf($failure);
                }
                $warnings[] = "/Prev chain parsing failed at offset $prevOffset: " . $failure->getMessage();
                break;
            }
        }
    }

    $reader = new self($version, $trailer, $resolver);
    $reader->parseWarnings = $warnings;
    $reader->strict = $strict;

    return $reader;
}
910
/**
 * Return the string value of the first /ID entry in the trailer.
 *
 * Yields an empty string when /ID is not an array, when the array has no
 * element at index 0, or when that element is not a PdfString.
 */
private static function extractFileId(PdfDictionary $trailer): string
{
    $idArray = $trailer->get('ID');
    if (!$idArray instanceof PdfArray) {
        return '';
    }

    $head = $idArray->items[0] ?? null;

    return $head instanceof PdfString ? $head->value : '';
}
925
/**
 * Parse the cross-reference section located at the given byte offset.
 *
 * The first four bytes at the offset select the parser: a literal "xref"
 * keyword is handed to the classic table parser (together with $strict
 * and $warnings); anything else is handed to the cross-reference stream
 * parser, which receives only the offset.
 *
 * @param list<string> $warnings
 * @return array{0: array<int, XrefEntry>, 1: PdfDictionary}
 */
private static function parseXrefAt(
    Source $source,
    int $offset,
    XrefParser $classicParser,
    XrefStreamParser $streamParser,
    bool $strict = true,
    array &$warnings = [],
): array {
    $source->seek($offset);
    $isClassicTable = str_starts_with($source->peek(4), 'xref');

    if (!$isClassicTable) {
        // No "xref" keyword here: assume an "N M obj" cross-reference stream.
        return $streamParser->parseXrefStream($offset);
    }

    return $classicParser->parseClassicXref($offset, $strict, $warnings);
}
953
/**
 * Scan backward from EOF to find the `startxref` byte offset.
 *
 * The tail of the source is searched in progressively larger windows
 * (1024, 8192, then 65536 bytes, each capped at the source size) for the
 * last `startxref` keyword followed by whitespace and a decimal number.
 * The search stops early once a wider window cannot change the outcome:
 * either the keyword was found but no number follows it, or the window
 * already covers the whole source.
 *
 * @return int|null the offset, or null when it is missing or corrupted
 *                  and $strict is false (allows lenient fallback to xref
 *                  reconstruction)
 * @throws InvalidPdfException when $strict is true and no offset is found
 */
private static function findStartxref(Source $source, bool $strict = true): ?int
{
    $size = $source->size();

    // Try progressively larger tail sizes: 1024, 8192, 65536
    foreach ([1024, 8192, 65536] as $tryLength) {
        $tailLength = min($tryLength, $size);
        $source->seek($size - $tailLength);
        $tail = $source->read($tailLength);

        $pos = strrpos($tail, 'startxref');
        if ($pos !== false) {
            $after = substr($tail, $pos + strlen('startxref'));
            if (preg_match('/\s+(\d+)/', $after, $m)) {
                return (int) $m[1];
            }
            // Every window ends at EOF, so a wider one finds the same last
            // keyword followed by the same bytes; retrying cannot succeed.
            break;
        }

        if ($tailLength >= $size) {
            // The whole source has been searched; larger windows would
            // only re-read identical bytes.
            break;
        }
    }

    if ($strict) {
        throw new InvalidPdfException('Cannot find startxref');
    }

    return null;
}
984
/**
 * Rebuild xref entries and a minimal trailer from a scan of the whole
 * source for object definitions.
 *
 * Used as a fallback when the normal xref/trailer parsing fails in
 * lenient mode. Each scanned object becomes an in-use entry at its
 * scanned offset. The trailer receives /Root only when a catalog
 * candidate is identified, followed by /Size set to the highest scanned
 * object number plus one.
 *
 * @return array{0: array<int, XrefEntry>, 1: PdfDictionary}
 */
private static function reconstructXref(Source $source): array
{
    $source->seek(0);
    $rawPdf = $source->read($source->size());
    $offsetsByObject = ObjectScanner::scan($rawPdf);

    if ($offsetsByObject === []) {
        // Nothing that looks like an object header was found. Hand back
        // an empty xref and a trailer carrying only /Size so the caller
        // still receives a well-formed pair; higher layers should treat
        // such a file as irrecoverable.
        $emptyTrailer = new PdfDictionary();
        $emptyTrailer->set('Size', new PdfNumber(1));

        return [[], $emptyTrailer];
    }

    // array_map over a single array keeps the object-number keys.
    $entries = array_map(
        static fn ($byteOffset) => new XrefEntry(XrefEntry::TYPE_IN_USE, $byteOffset, 0),
        $offsetsByObject,
    );

    $rootObject = self::findCatalogInScan($offsetsByObject, $rawPdf);
    $highestObject = max(array_keys($offsetsByObject));

    $trailer = new PdfDictionary();
    if ($rootObject !== null) {
        $trailer->set('Root', new PdfReference($rootObject, 0));
    }
    $trailer->set('Size', new PdfNumber($highestObject + 1));

    return [$entries, $trailer];
}
1026
/**
 * Identify which scanned object is the catalog using progressively
 * looser heuristics. Returns the object number or null if no
 * reasonable candidate is found.
 *
 * Each candidate is inspected through a window of at most 1024 bytes
 * starting at its offset. The window is cut at the first `endobj`
 * keyword so that text belonging to the following object can neither
 * satisfy nor veto a heuristic for this one. When the window contains no
 * `endobj` (e.g. a damaged file) the full window is used.
 *
 * @param array<int, int> $objectMap object number => byte offset into $allBytes
 */
private static function findCatalogInScan(array $objectMap, string $allBytes): ?int
{
    $bytesLen = strlen($allBytes);
    $peek = static function (int $offset) use ($allBytes, $bytesLen): string {
        $peekLength = min(1024, $bytesLen - $offset);
        if ($peekLength <= 0) {
            return '';
        }
        $window = substr($allBytes, $offset, $peekLength);

        // Keep only this object's own text: stop at its terminator.
        $end = strpos($window, 'endobj');

        return $end === false ? $window : substr($window, 0, $end);
    };

    // Pass 1: explicit /Type /Catalog
    foreach ($objectMap as $objNum => $offset) {
        if (preg_match('#/Type\s*/Catalog\b#', $peek($offset))) {
            return $objNum;
        }
    }

    // Pass 2: a dict that has /Pages but no /Parent (heuristic for
    // catalog-with-missing-/Type, e.g. qpdf's bad8/bad11 stripped
    // catalogs).
    foreach ($objectMap as $objNum => $offset) {
        $body = $peek($offset);
        if (preg_match('#/Pages\s+\d+\s+\d+\s+R#', $body) && !preg_match('#/Parent\b#', $body)) {
            return $objNum;
        }
    }

    // Pass 3: object body literally contains the word "Catalog"
    // somewhere (covers PDFs like bug_454695 where /Type /Catalog is
    // formatted oddly or appears inside a hex string comment).
    foreach ($objectMap as $objNum => $offset) {
        if (str_contains($peek($offset), 'Catalog')) {
            return $objNum;
        }
    }

    return null;
}
1070
/**
 * Walk a Pages tree node depth-first and append every leaf dictionary
 * to $result.
 *
 * /Kids entries that are not indirect references, or that do not
 * resolve to a dictionary, are skipped. A kid whose /Type is the name
 * "Pages" is descended into; any other dictionary is appended as a page.
 * A node without a /Kids array contributes nothing.
 *
 * NOTE(review): nothing here tracks visited nodes, so a /Kids cycle in
 * a malformed file would recurse without bound — confirm whether the
 * resolver or callers guard against that.
 *
 * @param list<PdfDictionary> $result
 */
private function collectPages(PdfDictionary $node, array &$result): void
{
    $children = $node->get('Kids');
    if ($children instanceof PdfArray) {
        foreach ($children->items as $childRef) {
            $child = $childRef instanceof PdfReference
                ? $this->resolver->resolveReference($childRef)
                : null;
            if (!$child instanceof PdfDictionary) {
                continue;
            }

            $typeName = $child->get('Type');
            $isIntermediateNode = $typeName instanceof PdfName && $typeName->value === 'Pages';
            if ($isIntermediateNode) {
                $this->collectPages($child, $result);
                continue;
            }

            $result[] = $child;
        }
    }
}
1098	}