Code Coverage for /home/runner/work/phpdftk/phpdftk/packages/pdf/reader/src/TextExtractor.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	74.83% covered (warning)	74.83%	336 / 449	25.00% covered (danger)	25.00%	6 / 24	CRAP	0.00% covered (danger)	0.00%	0 / 1
TextExtractor	74.83% covered (warning)	74.83%	336 / 449	25.00% covered (danger)	25.00%	6 / 24	936.20	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
extractFromPage	87.50% covered (warning)	87.50%	7 / 8	0.00% covered (danger)	0.00%	0 / 1	3.02
processOps	69.03% covered (warning)	69.03%	78 / 113	0.00% covered (danger)	0.00%	0 / 1	116.46
extractFromXObject	87.50% covered (warning)	87.50%	35 / 40	0.00% covered (danger)	0.00%	0 / 1	10.20
inferSpacing	40.00% covered (danger)	40.00%	2 / 5	0.00% covered (danger)	0.00%	0 / 1	7.46
extractActualText	60.00% covered (warning)	60.00%	9 / 15	0.00% covered (danger)	0.00%	0 / 1	8.30
decodeStringOperand	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
decodeTJArray	93.33% covered (success)	93.33%	28 / 30	0.00% covered (danger)	0.00%	0 / 1	23.16
parseStringOperand	88.89% covered (warning)	88.89%	8 / 9	0.00% covered (danger)	0.00%	0 / 1	6.05
unescapeLiteralString	76.19% covered (warning)	76.19%	16 / 21	0.00% covered (danger)	0.00%	0 / 1	15.28
readOctalOrLiteral	81.82% covered (warning)	81.82%	9 / 11	0.00% covered (danger)	0.00%	0 / 1	7.29
mapBytesToUnicode	91.30% covered (success)	91.30%	21 / 23	0.00% covered (danger)	0.00%	0 / 1	10.07
containsMultibyte	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	3
winAnsiFallback	92.86% covered (success)	92.86%	13 / 14	0.00% covered (danger)	0.00%	0 / 1	5.01
decodeName	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	1
loadFontMaps	85.71% covered (warning)	85.71%	6 / 7	0.00% covered (danger)	0.00%	0 / 1	2.01
loadFontMapsFromResources	67.57% covered (warning)	67.57%	25 / 37	0.00% covered (danger)	0.00%	0 / 1	29.05
extractSpaceWidth	92.31% covered (success)	92.31%	12 / 13	0.00% covered (danger)	0.00%	0 / 1	7.02
buildEncodingMap	30.00% covered (danger)	30.00%	9 / 30	0.00% covered (danger)	0.00%	0 / 1	92.17
getNamedEncodingTable	50.00% covered (danger)	50.00%	3 / 6	0.00% covered (danger)	0.00%	0 / 1	10.50
resolveValue	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2
getContentStreamData	73.68% covered (warning)	73.68%	14 / 19	0.00% covered (danger)	0.00%	0 / 1	10.48
extractLiteralString	66.67% covered (warning)	66.67%	14 / 21	0.00% covered (danger)	0.00%	0 / 1	10.37
extractHexString	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	5

1	<?php
2
3	declare(strict_types=1);
4
5	namespace Phpdftk\Pdf\Reader;
6
7	use Phpdftk\Encoding\CMapParser;
8	use Phpdftk\Encoding\GlyphList;
9	use Phpdftk\Encoding\MacExpertEncodingTable;
10	use Phpdftk\Encoding\MacRomanTable;
11	use Phpdftk\Encoding\StandardEncodingTable;
12	use Phpdftk\Encoding\WinAnsiTable;
13	use Phpdftk\Pdf\Core\PdfArray;
14	use Phpdftk\Pdf\Core\PdfDictionary;
15	use Phpdftk\Pdf\Core\PdfName;
16	use Phpdftk\Pdf\Core\PdfNumber;
17	use Phpdftk\Pdf\Core\PdfReference;
18	use Phpdftk\Pdf\Core\PdfStream;
19	use Phpdftk\Pdf\Core\Serializable;
20	use Phpdftk\Pdf\Reader\Parser\ContentStreamOp;
21	use Phpdftk\Pdf\Reader\Parser\ContentStreamParser;
22
23	/**
24	* Extracts text content from a PDF page by interpreting content
25	* stream operators.
26	*
27	* Tracks text state (current font, position, spacing) and converts
28	* character codes to Unicode using:
29	* 1. /ToUnicode CMap (if present on the font)
30	* 2. /Encoding + /Differences (if present)
31	* 3. WinAnsi → GlyphList fallback (for standard fonts)
32	*
33	* Text positioning is used to insert spaces and newlines where the
34	* PDF moves the text cursor by significant amounts.
35	*/
36	final class TextExtractor
37	{
38	private readonly ContentStreamParser $parser;
39	private readonly ObjectResolver $resolver;
40
41	/** @var array<string, array<int, int>> Font name → char code → Unicode codepoint */
42	private array $fontMaps = [];
43
44	/** @var array<string, bool> Font names that use 2-byte CID encoding */
45	private array $cidFonts = [];
46
47	/** @var array<string, float> Font name → space character width in 1/1000 units */
48	private array $fontSpaceWidths = [];
49
50	/** Current font name (e.g., "F1") */
51	private string $currentFont = '';
52
53	/** Current font size */
54	private float $fontSize = 12.0;
55
56	/** Average character width for space detection (rough estimate) */
57	private float $spaceWidth = 0.0;
58
59	/** Current page/XObject resources dictionary for resolving XObjects */
60	private ?PdfDictionary $currentResources = null;
61
62	/** Recursion depth guard for nested Form XObjects */
63	private int $xObjectDepth = 0;
64
65	private const MAX_XOBJECT_DEPTH = 10;
66
/**
 * Wire the extractor to the object resolver used for indirect references.
 *
 * A fresh ContentStreamParser is created per extractor instance.
 */
public function __construct(ObjectResolver $resolver)
{
    $this->resolver = $resolver;
    $this->parser = new ContentStreamParser();
}
72
/**
 * Run text extraction over a single page dictionary.
 *
 * Font maps are rebuilt from the page's /Resources, the resources are
 * remembered for later XObject lookups, and the page's content stream(s)
 * are parsed and interpreted.
 *
 * @param PdfDictionary $page The page dictionary (read for /Contents and /Resources)
 *
 * @return string The extracted text, or '' when the page has no content data
 */
public function extractFromPage(PdfDictionary $page): string
{
    $this->loadFontMaps($page);

    // Remember the page resources so `Do` operators can find their XObjects.
    $pageResources = $this->resolveValue($page->get('Resources'));
    $this->currentResources = null;
    if ($pageResources instanceof PdfDictionary) {
        $this->currentResources = $pageResources;
    }

    $stream = $this->getContentStreamData($page);

    return $stream === ''
        ? ''
        : $this->processOps($this->parser->parse($stream));
}
99
/**
 * Walk parsed content-stream operations and assemble the visible text.
 *
 * Operands are read as raw PDF-syntax strings (they are cast with
 * `(float)` / trimmed with `ltrim` below). Positioning operators only
 * contribute whitespace: a large vertical move yields a newline, a
 * significant horizontal move yields a space.
 *
 * Marked content: a BDC carrying /ActualText suppresses the text shown
 * inside the sequence and emits the replacement text at the matching EMC.
 * When such a sequence is nested inside another /ActualText sequence, only
 * the outermost replacement is emitted, so the inner one is not duplicated.
 * Form XObjects invoked while text is suppressed are skipped as well, the
 * same way Tj/TJ/'/" are.
 *
 * @param list<ContentStreamOp> $ops
 *
 * @return string Extracted text with surrounding whitespace trimmed
 */
private function processOps(array $ops): string
{
    $text = '';
    $currentX = 0.0;
    $currentY = 0.0;

    /** @var list<string|null> $markedContentStack ActualText per open sequence (null = none) */
    $markedContentStack = [];
    $suppressText = false;

    foreach ($ops as $op) {
        $operands = $op->operands;
        $operandCount = count($operands);

        switch ($op->operator) {
            case 'BT':
                // A new text object starts with a fresh text position.
                $currentX = 0.0;
                $currentY = 0.0;
                break;

            case 'Tf':
                // Set font: /FontName fontSize Tf
                if ($operandCount >= 2) {
                    $this->currentFont = $this->decodeName(ltrim($operands[0], '/'));
                    $this->fontSize = (float) $operands[1];
                    // Use font-specific space width if available, else estimate
                    $fontSpaceW = $this->fontSpaceWidths[$this->currentFont] ?? 0;
                    $this->spaceWidth = $fontSpaceW > 0
                        ? $fontSpaceW * $this->fontSize / 1000
                        : $this->fontSize * 0.25;
                }
                break;

            case 'Td':
            case 'TD':
                // Move text position (TD also sets leading, which is not tracked): tx ty Td
                if ($operandCount >= 2) {
                    $tx = (float) $operands[0];
                    $ty = (float) $operands[1];
                    $previousY = $currentY;
                    $currentX += $tx;
                    $currentY += $ty;
                    $text .= $this->inferSpacing($tx, $ty, $previousY, $currentY);
                }
                break;

            case 'Tm':
                // Set text matrix: a b c d e f Tm — only the translation (e, f) is used
                if ($operandCount >= 6) {
                    $newX = (float) $operands[4];
                    $newY = (float) $operands[5];
                    if ($text !== '' && abs($newY - $currentY) > $this->fontSize * 0.5) {
                        $text .= "\n";
                    } elseif ($text !== '' && abs($newX - $currentX) > $this->spaceWidth) {
                        $text .= ' ';
                    }
                    $currentX = $newX;
                    $currentY = $newY;
                }
                break;

            case 'T*':
                // Move to start of next line
                if ($text !== '') {
                    $text .= "\n";
                }
                break;

            case 'BMC':
                // Begin marked content without properties — nothing to replace
                $markedContentStack[] = null;
                break;

            case 'BDC':
                // Begin marked content with properties: /Tag properties BDC
                $actualText = $operandCount >= 2
                    ? $this->extractActualText($operands[1])
                    : null;
                $markedContentStack[] = $actualText;
                if ($actualText !== null) {
                    $suppressText = true;
                }
                break;

            case 'EMC':
                // End marked content
                if ($markedContentStack !== []) {
                    $closedActualText = array_pop($markedContentStack);
                    if ($closedActualText !== null) {
                        // Stay suppressed while any enclosing sequence has ActualText.
                        $suppressText = false;
                        foreach ($markedContentStack as $enclosing) {
                            if ($enclosing !== null) {
                                $suppressText = true;
                                break;
                            }
                        }
                        // The enclosing replacement text already stands in for
                        // this nested sequence; emitting both would duplicate it.
                        if (!$suppressText) {
                            $text .= $closedActualText;
                        }
                    }
                }
                break;

            case 'Tj':
                // Show text string: (string) Tj
                if (!$suppressText && $operandCount >= 1) {
                    $text .= $this->decodeStringOperand($operands[0]);
                }
                break;

            case 'TJ':
                // Show text with individual glyph positioning: [...] TJ
                if (!$suppressText && $operandCount >= 1) {
                    $text .= $this->decodeTJArray($operands[0]);
                }
                break;

            case "'":
                // Move to next line and show text: (string) '
                if ($text !== '') {
                    $text .= "\n";
                }
                if (!$suppressText && $operandCount >= 1) {
                    $text .= $this->decodeStringOperand($operands[0]);
                }
                break;

            case '"':
                // Set word/char spacing, move to next line, show text: aw ac (string) "
                if ($text !== '') {
                    $text .= "\n";
                }
                if (!$suppressText && $operandCount >= 3) {
                    $text .= $this->decodeStringOperand($operands[2]);
                }
                break;

            case 'Do':
                // Invoke XObject: /Name Do — recurse into Form XObjects
                if (!$suppressText && $operandCount >= 1) {
                    $xobjText = $this->extractFromXObject(ltrim($operands[0], '/'));
                    if ($xobjText !== '') {
                        // Keep the XObject text from running into the preceding word.
                        if ($text !== '' && !str_ends_with($text, "\n") && !str_ends_with($text, ' ')) {
                            $text .= ' ';
                        }
                        $text .= $xobjText;
                    }
                }
                break;
        }
    }

    return trim($text);
}
277
/**
 * Extract text from a Form XObject's content stream.
 *
 * The XObject is looked up in the current resources by its raw name first
 * and, if that misses, by the name with `#XX` escapes decoded (the same
 * decoding applied to font names in `Tf`). Non-Form XObjects (images, etc.)
 * yield no text.
 *
 * Font state and resources are saved before and restored after processing,
 * including when parsing or processing throws, so the parent context and
 * the recursion-depth guard are never left in a corrupted state. Recursion
 * is limited to MAX_XOBJECT_DEPTH nested Form XObjects.
 *
 * @param string $name XObject resource name without the leading slash
 *
 * @return string Text found in the XObject, or '' when it cannot be processed
 */
private function extractFromXObject(string $name): string
{
    if ($this->currentResources === null || $this->xObjectDepth >= self::MAX_XOBJECT_DEPTH) {
        return '';
    }

    $xobjects = $this->resolveValue($this->currentResources->get('XObject'));
    if (!$xobjects instanceof PdfDictionary) {
        return '';
    }

    // Raw lookup first (previous behaviour), then the #XX-decoded spelling.
    $xobjRef = $xobjects->get($name) ?? $xobjects->get($this->decodeName($name));
    if ($xobjRef === null) {
        return '';
    }

    $xobj = $this->resolveValue($xobjRef);
    if (!$xobj instanceof PdfStream) {
        return '';
    }

    // Only Form XObjects carry a content stream worth interpreting.
    $subtype = $this->resolveValue($xobj->dictionary->get('Subtype'));
    if (!$subtype instanceof PdfName || $subtype->value !== 'Form') {
        return '';
    }

    // Snapshot the parent state.
    $savedFontMaps = $this->fontMaps;
    $savedCidFonts = $this->cidFonts;
    $savedSpaceWidths = $this->fontSpaceWidths;
    $savedFont = $this->currentFont;
    $savedFontSize = $this->fontSize;
    $savedSpaceWidth = $this->spaceWidth;
    $savedResources = $this->currentResources;

    $this->xObjectDepth++;
    try {
        // Load the Form XObject's own resources (fonts, nested XObjects).
        $xobjResources = $this->resolveValue($xobj->dictionary->get('Resources'));
        if ($xobjResources instanceof PdfDictionary) {
            $this->currentResources = $xobjResources;
            $this->loadFontMapsFromResources($xobjResources);
        }

        $data = $xobj->data;
        if ($data === '') {
            return '';
        }

        return $this->processOps($this->parser->parse($data));
    } finally {
        // Runs on both normal return and exceptions.
        $this->xObjectDepth--;
        $this->fontMaps = $savedFontMaps;
        $this->cidFonts = $savedCidFonts;
        $this->fontSpaceWidths = $savedSpaceWidths;
        $this->currentFont = $savedFont;
        $this->fontSize = $savedFontSize;
        $this->spaceWidth = $savedSpaceWidth;
        $this->currentResources = $savedResources;
    }
}
349
/**
 * Translate a text-position move into the whitespace it implies.
 *
 * A vertical offset larger than half the current font size counts as a
 * line break; otherwise a rightward offset larger than the current space
 * width counts as a word gap. $lastY and $currentY are accepted but not
 * consulted.
 *
 * @return string "\n", ' ' or ''
 */
private function inferSpacing(float $tx, float $ty, float $lastY, float $currentY): string
{
    $lineBreakThreshold = $this->fontSize * 0.5;

    return match (true) {
        abs($ty) > $lineBreakThreshold => "\n",
        $tx > 0 && abs($tx) > $this->spaceWidth => ' ',
        default => '',
    };
}
365
/**
 * Extract the /ActualText value from a BDC properties operand.
 *
 * The operand is either an inline dictionary such as
 * "<< /ActualText (text) /MCID 0 >>" or a name referring to a property
 * list; only inline dictionaries are handled here.
 *
 * Both the literal form `/ActualText (…)` and the hex form
 * `/ActualText <…>` are accepted, with or without whitespace after the
 * key (PDF delimiters do not require it). Hex strings with an odd number
 * of digits are padded with a trailing 0.
 *
 * The decoded bytes are interpreted as a PDF text string: a UTF-16BE
 * byte-order mark (FE FF) triggers conversion to UTF-8 and a UTF-8 BOM is
 * stripped. Anything else is returned byte-for-byte (PDFDocEncoding is not
 * translated).
 *
 * @return string|null Replacement text, or null when the operand has no usable /ActualText
 */
private function extractActualText(string $operand): ?string
{
    $operand = trim($operand);

    // Only parse inline dictionaries
    if (!str_starts_with($operand, '<<')) {
        return null;
    }

    $bytes = null;

    if (preg_match('/\/ActualText\s*\(/', $operand, $matches, PREG_OFFSET_CAPTURE) === 1) {
        // The match ends on the opening parenthesis of the literal string.
        $pos = (int) $matches[0][1] + strlen($matches[0][0]) - 1;
        $bytes = $this->unescapeLiteralString($this->extractLiteralString($operand, $pos));
    } elseif (preg_match('/\/ActualText\s*<([0-9A-Fa-f\s]*)>/', $operand, $matches) === 1) {
        $hex = preg_replace('/\s+/', '', $matches[1]) ?? $matches[1];
        if (strlen($hex) % 2 === 1) {
            // A missing final digit is assumed to be 0.
            $hex .= '0';
        }
        if ($hex === '') {
            $bytes = '';
        } elseif (ctype_xdigit($hex)) {
            $decoded = hex2bin($hex);
            $bytes = $decoded === false ? null : $decoded;
        }
    }

    if ($bytes === null) {
        return null;
    }

    if (str_starts_with($bytes, "\xFE\xFF")) {
        // UTF-16BE text string: convert to the UTF-8 used by the rest of the output.
        $utf8 = mb_convert_encoding(substr($bytes, 2), 'UTF-8', 'UTF-16BE');
        return is_string($utf8) ? $utf8 : $bytes;
    }

    if (str_starts_with($bytes, "\xEF\xBB\xBF")) {
        return substr($bytes, 3);
    }

    return $bytes;
}
402
/**
 * Turn the string operand of Tj, ' or " into Unicode text.
 *
 * The operand arrives in raw PDF syntax, i.e. "(escaped text)" or "<hex>";
 * it is first reduced to raw bytes and then mapped through the current
 * font's encoding.
 */
private function decodeStringOperand(string $operand): string
{
    return $this->mapBytesToUnicode($this->parseStringOperand($operand));
}
413
/**
 * Decode a TJ array operand: [<hex> -80 (text) 40 ...]
 *
 * String elements produce text. Numeric elements adjust positioning;
 * large negative values (< -100) are treated as word spaces.
 *
 * Hex strings with an odd number of digits are padded with a trailing 0,
 * and a hex string containing non-hex characters contributes no text.
 * The decoded bytes are compared against false explicitly so that a
 * string decoding to "0" (e.g. <30>) is not discarded as falsy.
 *
 * @param string $arrayStr The array in raw PDF syntax, brackets included or not
 */
private function decodeTJArray(string $arrayStr): string
{
    $result = '';
    $arrayStr = trim($arrayStr, '[] ');
    $len = strlen($arrayStr);
    $pos = 0;

    while ($pos < $len) {
        // Skip whitespace between elements
        $pos += strspn($arrayStr, " \n\r\t", $pos);
        if ($pos >= $len) {
            break;
        }

        $ch = $arrayStr[$pos];

        if ($ch === '(') {
            // Literal string
            $str = $this->extractLiteralString($arrayStr, $pos);
            $result .= $this->mapBytesToUnicode($this->unescapeLiteralString($str));
        } elseif ($ch === '<') {
            // Hex string
            $hex = $this->extractHexString($arrayStr, $pos);
            if (strlen($hex) % 2 === 1) {
                // A missing final digit is assumed to be 0.
                $hex .= '0';
            }
            $bytes = ctype_xdigit($hex) ? hex2bin($hex) : false;
            if ($bytes !== false) {
                $result .= $this->mapBytesToUnicode($bytes);
            }
        } elseif ($ch === '-' || $ch === '+' || $ch === '.' || ($ch >= '0' && $ch <= '9')) {
            // Number — large negative = space
            $numLen = strspn($arrayStr, '+-.0123456789', $pos);
            $num = (float) substr($arrayStr, $pos, $numLen);
            $pos += $numLen;
            if ($num < -100) {
                $result .= ' ';
            }
        } else {
            // Unknown token byte: step over it
            $pos++;
        }
    }

    return $result;
}
468
/**
 * Parse a PDF string operand into raw bytes.
 *
 * - "<hex>": whitespace is ignored, an odd number of digits is padded
 *   with a trailing 0, and invalid hex yields ''.
 * - "(literal)": escape sequences are resolved.
 * - anything else is returned unchanged (after trimming).
 *
 * The hex result is compared against false explicitly: the decoded
 * string "0" (from <30>) is falsy in PHP and must not be replaced by ''.
 */
private function parseStringOperand(string $operand): string
{
    $operand = trim($operand);

    if (str_starts_with($operand, '<') && str_ends_with($operand, '>')) {
        // Hex string
        $hex = substr($operand, 1, -1);
        $hex = preg_replace('/\s+/', '', $hex) ?? $hex;
        if (strlen($hex) % 2 === 1) {
            // A missing final digit is assumed to be 0.
            $hex .= '0';
        }
        if (!ctype_xdigit($hex)) {
            // Empty or malformed hex string
            return '';
        }
        $bytes = hex2bin($hex);

        return $bytes === false ? '' : $bytes;
    }

    if (str_starts_with($operand, '(') && str_ends_with($operand, ')')) {
        return $this->unescapeLiteralString(substr($operand, 1, -1));
    }

    return $operand;
}
490
/**
 * Unescape a PDF literal string (content between outer parens).
 *
 * Handles the named escapes (\n \r \t \b \f \( \) \\), octal escapes of
 * up to three digits, and line continuations: a backslash immediately
 * followed by an end-of-line marker (CR, LF or CRLF) means the string
 * continues on the next line, so neither the backslash nor the marker is
 * part of the result. A backslash before any other character is dropped
 * and the character kept. A lone trailing backslash is kept as-is.
 */
private function unescapeLiteralString(string $str): string
{
    $result = '';
    $len = strlen($str);
    $i = 0;

    while ($i < $len) {
        $ch = $str[$i];

        if ($ch !== '\\' || $i + 1 >= $len) {
            $result .= $ch;
            $i++;
            continue;
        }

        $i++;
        $next = $str[$i];

        if ($next === "\r" || $next === "\n") {
            // Line continuation: swallow the EOL marker (CRLF counts as one).
            if ($next === "\r" && $i + 1 < $len && $str[$i + 1] === "\n") {
                $i++;
            }
            $i++;
            continue;
        }

        $result .= match ($next) {
            'n' => "\n",
            'r' => "\r",
            't' => "\t",
            'b' => "\x08",
            'f' => "\x0C",
            '(' => '(',
            ')' => ')',
            '\\' => '\\',
            // May advance $i past additional octal digits.
            default => $this->readOctalOrLiteral($str, $i, $next),
        };
        $i++;
    }

    return $result;
}
524
/**
 * Resolve the character following a backslash that is not a named escape.
 *
 * If $ch is an octal digit, up to two further octal digits are consumed
 * from $str (advancing $i to the last digit used) and the resulting byte
 * is returned. Otherwise $ch itself is returned and $i is left untouched.
 *
 * @param string $str The string being unescaped
 * @param int    $i   Index of $ch within $str; advanced past extra octal digits
 * @param string $ch  The character right after the backslash
 */
private function readOctalOrLiteral(string $str, int &$i, string $ch): string
{
    $isOctalDigit = static fn (string $c): bool => $c >= '0' && $c <= '7';

    if (!$isOctalDigit($ch)) {
        return $ch;
    }

    $digits = $ch;
    $end = strlen($str);
    $remaining = 2;
    while ($remaining > 0 && $i + 1 < $end && $isOctalDigit($str[$i + 1])) {
        $i++;
        $digits .= $str[$i];
        $remaining--;
    }

    return chr((int) octdec($digits));
}
543
/**
 * Convert raw string bytes to Unicode text for the current font.
 *
 * Order of precedence:
 * 1. Bytes that contain at least one high byte and form valid UTF-8 are
 *    returned untouched (covers producers that write UTF-8 straight into
 *    the content stream, which is non-conforming but widespread).
 * 2. With a map for the current font flagged as CID, bytes are read as
 *    big-endian 2-byte codes; unmapped codes become U+FFFD and a trailing
 *    odd byte is ignored.
 * 3. With a map for a simple font, each byte is looked up individually
 *    and unmapped bytes are emitted as the code point of the same value.
 * 4. Without any map, the WinAnsi fallback is used.
 */
private function mapBytesToUnicode(string $bytes): string
{
    if ($this->containsMultibyte($bytes) && mb_check_encoding($bytes, 'UTF-8')) {
        return $bytes;
    }

    $map = $this->fontMaps[$this->currentFont] ?? null;
    if ($map === null) {
        return $this->winAnsiFallback($bytes);
    }

    $byteCount = strlen($bytes);
    $decoded = '';

    if ($this->cidFonts[$this->currentFont] ?? false) {
        // CID font: consume two bytes per character code.
        for ($offset = 0; $offset + 1 < $byteCount; $offset += 2) {
            $cid = (ord($bytes[$offset]) << 8) | ord($bytes[$offset + 1]);
            $decoded .= isset($map[$cid])
                ? mb_chr($map[$cid], 'UTF-8')
                : "\u{FFFD}";
        }

        return $decoded;
    }

    // Simple font: one byte per character code, Latin-1 style fallback.
    for ($offset = 0; $offset < $byteCount; $offset++) {
        $byte = ord($bytes[$offset]);
        $decoded .= mb_chr($map[$byte] ?? $byte, 'UTF-8');
    }

    return $decoded;
}
596
/**
 * Report whether the string holds any byte above 0x7F.
 *
 * This is a cheap pre-check for possible multi-byte UTF-8 content; it
 * does not validate the encoding.
 */
private function containsMultibyte(string $bytes): bool
{
    // Byte-wise match (no /u modifier) for anything outside 7-bit ASCII.
    return preg_match('/[\x80-\xFF]/', $bytes) === 1;
}
610
/**
 * Last-resort decoding: byte → WinAnsi glyph name → GlyphList code point.
 *
 * Bytes without a WinAnsi glyph name, or whose glyph name is missing from
 * the glyph list, are emitted as the code point equal to the byte value.
 * Both lookup tables are fetched once and kept in static locals.
 */
private function winAnsiFallback(string $bytes): string
{
    static $winAnsi = null;
    static $glyphList = null;
    if ($winAnsi === null) {
        $winAnsi = WinAnsiTable::getTable();
        $glyphList = GlyphList::getList();
    }

    $decoded = '';
    $byteCount = strlen($bytes);
    for ($offset = 0; $offset < $byteCount; $offset++) {
        $byte = ord($bytes[$offset]);

        // Default to the byte value; upgrade when a glyph mapping exists.
        $codepoint = $byte;
        $glyphName = $winAnsi[$byte] ?? null;
        if ($glyphName !== null && isset($glyphList[$glyphName])) {
            $codepoint = $glyphList[$glyphName];
        }

        $decoded .= mb_chr($codepoint, 'UTF-8');
    }

    return $decoded;
}
637
/**
 * Expand `#XX` hex escapes in a PDF name (PDF 1.2+).
 *
 * A content-stream name such as `Courier#20New` becomes `Courier New`,
 * so it can match a resource key stored with the literal space. A `#`
 * not followed by two hex digits is left unchanged.
 */
private function decodeName(string $name): string
{
    $expandEscape = static fn (array $m): string => chr((int) hexdec($m[1]));

    return preg_replace_callback('/#([0-9A-Fa-f]{2})/', $expandEscape, $name);
}
651
/**
 * Rebuild all per-font lookup tables from a page's /Resources.
 *
 * Existing font maps, CID flags and space widths are discarded first;
 * nothing is loaded when the page has no resolvable resources dictionary.
 */
private function loadFontMaps(PdfDictionary $page): void
{
    $this->fontMaps = [];
    $this->cidFonts = [];
    $this->fontSpaceWidths = [];

    $pageResources = $this->resolveValue($page->get('Resources'));
    if ($pageResources instanceof PdfDictionary) {
        $this->loadFontMapsFromResources($pageResources);
    }
}
668
/**
 * Load font-to-Unicode mappings from a /Resources dictionary.
 *
 * Fonts are merged into the existing tables so Form XObject fonts
 * supplement page fonts. A font name that is defined again here fully
 * replaces the earlier definition: its previous map, CID flag and space
 * width are dropped before the new font dictionary is examined, so
 * nothing stale from the parent scope leaks into the new font.
 *
 * Per font, the first source that yields a non-empty map wins:
 * 1. /ToUnicode CMap
 * 2. /Encoding (named, or dictionary with /Differences)
 * 3. For Type1/MMType1/TrueType: StandardEncoding (Type1) or
 *    WinAnsiEncoding (TrueType) via the glyph list
 */
private function loadFontMapsFromResources(PdfDictionary $resources): void
{
    $fonts = $this->resolveValue($resources->get('Font'));
    if (!$fonts instanceof PdfDictionary) {
        return;
    }

    $cmapParser = new CMapParser();

    foreach ($fonts->entries as $key => $fontRef) {
        // PHP converts numeric-string array keys to int; the helpers below
        // declare a string parameter under strict_types.
        $fontName = (string) $key;

        $fontDict = $this->resolveValue($fontRef);
        if (!$fontDict instanceof PdfDictionary) {
            continue;
        }

        // This definition shadows any earlier font of the same name.
        unset(
            $this->fontMaps[$fontName],
            $this->cidFonts[$fontName],
            $this->fontSpaceWidths[$fontName],
        );

        // Detect CID/Type0 fonts (use 2-byte character codes)
        $subtype = $this->resolveValue($fontDict->get('Subtype'));
        if ($subtype instanceof PdfName && $subtype->value === 'Type0') {
            $this->cidFonts[$fontName] = true;
        }

        // Extract space width from font metrics
        $this->extractSpaceWidth($fontName, $fontDict);

        // Try ToUnicode CMap first
        $toUnicode = $fontDict->get('ToUnicode');
        if ($toUnicode !== null) {
            $toUnicodeStream = $this->resolveValue($toUnicode);
            if ($toUnicodeStream instanceof PdfStream && $toUnicodeStream->data !== '') {
                $map = $cmapParser->parse($toUnicodeStream->data);
                if (!empty($map)) {
                    $this->fontMaps[$fontName] = $map;
                    continue;
                }
            }
        }

        // Try /Encoding with /Differences
        $encoding = $fontDict->get('Encoding');
        if ($encoding !== null) {
            $map = $this->buildEncodingMap($encoding);
            if (!empty($map)) {
                $this->fontMaps[$fontName] = $map;
                continue;
            }
        }

        // Fallback: for Type1/TrueType fonts without ToUnicode or Encoding,
        // use StandardEncoding (Type1 default) or WinAnsiEncoding (TrueType default)
        if ($subtype instanceof PdfName && in_array($subtype->value, ['Type1', 'MMType1', 'TrueType'], true)) {
            $fallbackTable = ($subtype->value === 'TrueType')
                ? WinAnsiTable::getTable()
                : StandardEncodingTable::getTable();
            $glyphList = GlyphList::getList();
            $map = [];
            foreach ($fallbackTable as $code => $glyphName) {
                if (isset($glyphList[$glyphName])) {
                    $map[$code] = $glyphList[$glyphName];
                }
            }
            if (!empty($map)) {
                $this->fontMaps[$fontName] = $map;
            }
        }
    }
}
740
/**
 * Record the space character width for a font, if the font declares one.
 *
 * For simple fonts the width of character code 32 is taken from /Widths
 * (indexed relative to /FirstChar). If that is unavailable, /DW on the
 * given dictionary is used as a fallback. /Widths, /FirstChar, /DW and
 * the individual width entry are resolved first, since any of them may be
 * stored as an indirect object. Nothing is recorded when no width is found.
 *
 * @param string        $fontName Resource name the width is stored under
 * @param PdfDictionary $fontDict The font dictionary to inspect
 */
private function extractSpaceWidth(string $fontName, PdfDictionary $fontDict): void
{
    // Try /Widths array (simple fonts)
    $widths = $this->resolveValue($fontDict->get('Widths'));
    $firstChar = $this->resolveValue($fontDict->get('FirstChar'));
    if ($widths instanceof PdfArray && $firstChar instanceof PdfNumber) {
        $spaceIndex = 32 - (int) $firstChar->toPdf(); // space = char code 32
        if ($spaceIndex >= 0 && isset($widths->items[$spaceIndex])) {
            $w = $this->resolveValue($widths->items[$spaceIndex]);
            if ($w instanceof PdfNumber) {
                $this->fontSpaceWidths[$fontName] = (float) $w->toPdf();
                return;
            }
        }
    }

    // Try /DW (default width for CID fonts) as fallback
    $dw = $this->resolveValue($fontDict->get('DW'));
    if ($dw instanceof PdfNumber) {
        $this->fontSpaceWidths[$fontName] = (float) $dw->toPdf();
    }
}
767
/**
 * Build a character map from an /Encoding entry.
 *
 * The entry may be a name (e.g. WinAnsiEncoding), an encoding dictionary
 * with optional /BaseEncoding and /Differences, or an indirect reference
 * to either. The entry itself, /BaseEncoding, /Differences and the
 * individual /Differences items are all resolved before use. Glyph names
 * that are missing from the glyph list are skipped.
 *
 * @param mixed $encoding The raw /Encoding value from the font dictionary
 *
 * @return array<int, int> char code → Unicode codepoint (empty when nothing could be mapped)
 */
private function buildEncodingMap(mixed $encoding): array
{
    $encoding = $this->resolveValue($encoding);

    $baseName = null;
    $diffs = null;

    if ($encoding instanceof PdfName) {
        $baseName = $encoding->value;
    } elseif ($encoding instanceof PdfDictionary) {
        $baseEnc = $this->resolveValue($encoding->get('BaseEncoding'));
        if ($baseEnc instanceof PdfName) {
            $baseName = $baseEnc->value;
        }
        $diffs = $this->resolveValue($encoding->get('Differences'));
    } else {
        return [];
    }

    $glyphList = GlyphList::getList();
    $map = [];

    // Start with the named (base) encoding, if it is one we know.
    if ($baseName !== null) {
        foreach ($this->getNamedEncodingTable($baseName) ?? [] as $code => $glyphName) {
            if (isset($glyphList[$glyphName])) {
                $map[$code] = $glyphList[$glyphName];
            }
        }
    }

    // Apply /Differences: a number sets the next code, each name takes
    // the current code and advances it by one.
    if ($diffs instanceof PdfArray) {
        $code = 0;
        foreach ($diffs->items as $item) {
            $item = $this->resolveValue($item);
            if ($item instanceof PdfNumber) {
                $code = (int) $item->toPdf();
            } elseif ($item instanceof PdfName) {
                if (isset($glyphList[$item->value])) {
                    $map[$code] = $glyphList[$item->value];
                }
                $code++;
            }
        }
    }

    return $map;
}
826
/**
 * Look up the glyph-name table of one of the predefined PDF encodings.
 *
 * @return array<int, string>|null byte → glyph name table, or null if the name is not recognised
 */
private function getNamedEncodingTable(string $name): ?array
{
    $tableClasses = [
        'WinAnsiEncoding' => WinAnsiTable::class,
        'MacRomanEncoding' => MacRomanTable::class,
        'StandardEncoding' => StandardEncodingTable::class,
        'MacExpertEncoding' => MacExpertEncodingTable::class,
    ];

    $tableClass = $tableClasses[$name] ?? null;
    if ($tableClass === null) {
        return null;
    }

    return $tableClass::getTable();
}
842
/**
 * Dereference a value if it is an indirect reference.
 *
 * PdfReference instances are handed to the object resolver; every other
 * value (including null) is returned unchanged.
 */
private function resolveValue(mixed $value): mixed
{
    return $value instanceof PdfReference
        ? $this->resolver->resolveReference($value)
        : $value;
}
853
/**
 * Collect the page's content stream bytes as one string.
 *
 * /Contents may be a reference to a single stream, a reference to an
 * array, or a direct array. For arrays, every element that is a reference
 * resolving to a stream is appended followed by a newline; other elements
 * are skipped. Any other shape of /Contents yields ''.
 */
private function getContentStreamData(PdfDictionary $page): string
{
    $contents = $page->get('Contents');

    if ($contents instanceof PdfReference) {
        $target = $this->resolver->resolveReference($contents);
        if ($target instanceof PdfStream) {
            return $target->data;
        }
        // May be an array of content stream references; checked below.
        $contents = $target;
    }

    if (!$contents instanceof PdfArray) {
        return '';
    }

    $combined = '';
    foreach ($contents->items as $item) {
        $stream = $item instanceof PdfReference
            ? $this->resolver->resolveReference($item)
            : null;
        if ($stream instanceof PdfStream) {
            $combined .= $stream->data . "\n";
        }
    }

    return $combined;
}
892
/**
 * Read a parenthesised literal string starting at $pos.
 *
 * $pos must point at the opening '('. Balanced nested parentheses are
 * kept in the result, and a backslash together with the byte after it is
 * copied verbatim (escapes are NOT resolved here). On return $pos is just
 * past the closing ')' or at the end of $data if the string is unterminated.
 *
 * @param string $data Raw PDF syntax containing the string
 * @param int    $pos  Offset of the opening parenthesis; advanced past the string
 *
 * @return string The still-escaped content between the outer parentheses
 */
private function extractLiteralString(string $data, int &$pos): string
{
    $content = '';
    $nesting = 1;
    $end = strlen($data);

    // The initial increment steps over the opening parenthesis.
    for ($pos++; $pos < $end && $nesting > 0; $pos++) {
        $byte = $data[$pos];

        if ($byte === '\\') {
            // Copy the escape pair untouched so later unescaping still sees it.
            $content .= $byte;
            $pos++;
            if ($pos < $end) {
                $content .= $data[$pos];
            }
        } elseif ($byte === ')') {
            $nesting--;
            if ($nesting > 0) {
                $content .= $byte;
            }
        } else {
            if ($byte === '(') {
                $nesting++;
            }
            $content .= $byte;
        }
    }

    return $content;
}
924
/**
 * Read a hex string starting at $pos and return its digits.
 *
 * $pos must point at the opening '<'. Everything up to the next '>' is
 * collected with whitespace removed; no validation of the digits is done.
 * On return $pos is just past the closing '>' or at the end of $data if
 * the string is unterminated.
 *
 * @param string $data Raw PDF syntax containing the hex string
 * @param int    $pos  Offset of the opening '<'; advanced past the string
 */
private function extractHexString(string $data, int &$pos): string
{
    $pos++; // step over '<'
    $end = strlen($data);
    if ($pos >= $end) {
        return '';
    }

    $close = strpos($data, '>', $pos);
    $stop = $close === false ? $end : $close;
    $segment = substr($data, $pos, $stop - $pos);

    // Land after '>' when present, otherwise at the end of the data.
    $pos = $close === false ? $end : $close + 1;

    $digits = '';
    foreach (str_split($segment) as $byte) {
        if (!ctype_space($byte)) {
            $digits .= $byte;
        }
    }

    return $digits;
}
942	}