Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
85.67% |
580 / 677 |
|
42.42% |
14 / 33 |
CRAP | |
0.00% |
0 / 1 |
| PositionedTextExtractor | |
85.67% |
580 / 677 |
|
42.42% |
14 / 33 |
487.02 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| extractFromPage | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
3 | |||
| processOps | |
89.21% |
124 / 139 |
|
0.00% |
0 / 1 |
58.80 | |||
| showString | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
2 | |||
| showTJArray | |
95.77% |
68 / 71 |
|
0.00% |
0 / 1 |
30 | |||
| buildSpanForText | |
94.12% |
16 / 17 |
|
0.00% |
0 / 1 |
2.00 | |||
| computeStringDisplacement | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
6 | |||
| advanceTextMatrix | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| textToUserSpace | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
| getEffectiveFontSize | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| multiplyMatrices | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
| extractFromXObject | |
81.63% |
40 / 49 |
|
0.00% |
0 / 1 |
15.21 | |||
| decodeName | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
| loadFontData | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
2 | |||
| loadFontDataFromResources | |
94.12% |
16 / 17 |
|
0.00% |
0 / 1 |
7.01 | |||
| loadEncodingMap | |
96.00% |
24 / 25 |
|
0.00% |
0 / 1 |
13 | |||
| loadGlyphWidths | |
96.88% |
31 / 32 |
|
0.00% |
0 / 1 |
14 | |||
| loadCidWidths | |
60.00% |
18 / 30 |
|
0.00% |
0 / 1 |
18.74 | |||
| tryLoadStandardFontWidths | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
5 | |||
| loadDefaultWidth | |
86.67% |
13 / 15 |
|
0.00% |
0 / 1 |
6.09 | |||
| parseStringOperand | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
6.05 | |||
| mapBytesToUnicode | |
91.30% |
21 / 23 |
|
0.00% |
0 / 1 |
10.07 | |||
| containsMultibyte | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
| winAnsiFallback | |
92.86% |
13 / 14 |
|
0.00% |
0 / 1 |
5.01 | |||
| buildEncodingMap | |
30.00% |
9 / 30 |
|
0.00% |
0 / 1 |
92.17 | |||
| getNamedEncodingTable | |
50.00% |
3 / 6 |
|
0.00% |
0 / 1 |
10.50 | |||
| extractActualText | |
86.67% |
13 / 15 |
|
0.00% |
0 / 1 |
6.09 | |||
| resolveValue | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
| getContentStreamData | |
36.84% |
7 / 19 |
|
0.00% |
0 / 1 |
29.41 | |||
| unescapeLiteralString | |
76.19% |
16 / 21 |
|
0.00% |
0 / 1 |
15.28 | |||
| readOctalOrLiteral | |
81.82% |
9 / 11 |
|
0.00% |
0 / 1 |
7.29 | |||
| extractLiteralString | |
85.71% |
18 / 21 |
|
0.00% |
0 / 1 |
8.19 | |||
| extractHexString | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
5 | |||
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\Pdf\Reader; |
| 6 | |
| 7 | use Phpdftk\Encoding\CMapParser; |
| 8 | use Phpdftk\Encoding\GlyphList; |
| 9 | use Phpdftk\Encoding\MacExpertEncodingTable; |
| 10 | use Phpdftk\Encoding\MacRomanTable; |
| 11 | use Phpdftk\Encoding\StandardEncodingTable; |
| 12 | use Phpdftk\Encoding\WinAnsiTable; |
| 13 | use Phpdftk\FontMetrics\StandardFontMetrics; |
| 14 | use Phpdftk\Pdf\Core\PdfArray; |
| 15 | use Phpdftk\Pdf\Core\PdfDictionary; |
| 16 | use Phpdftk\Pdf\Core\PdfName; |
| 17 | use Phpdftk\Pdf\Core\PdfNumber; |
| 18 | use Phpdftk\Pdf\Core\PdfReference; |
| 19 | use Phpdftk\Pdf\Core\PdfStream; |
| 20 | use Phpdftk\Pdf\Reader\Parser\ContentStreamOp; |
| 21 | use Phpdftk\Pdf\Reader\Parser\ContentStreamParser; |
| 22 | |
| 23 | /** |
| 24 | * Extracts text with precise positioning from a PDF page. |
| 25 | * |
| 26 | * Implements a full text state machine per ISO 32000-2 §9: |
| 27 | * - Tracks the current transformation matrix (CTM) via `cm` operator |
| 28 | * - Tracks the text matrix (Tm) and text line matrix |
| 29 | * - Applies character spacing (Tc), word spacing (Tw), horizontal scaling (Tz), |
| 30 | * text leading (TL), and text rise (Ts) |
| 31 | * - Resolves glyph widths from font /Widths arrays, /W arrays (CID fonts), |
| 32 | * embedded font data, and standard font metrics (14 built-in fonts) |
| 33 | * - Computes per-span bounding boxes in user space coordinates |
| 34 | * |
| 35 | * Each text-showing operator (Tj, TJ, ', ") produces one or more TextSpan |
| 36 | * objects with the computed position and dimensions. |
| 37 | */ |
| 38 | final class PositionedTextExtractor |
| 39 | { |
| 40 | private readonly ContentStreamParser $parser; |
| 41 | private readonly ObjectResolver $resolver; |
| 42 | |
| 43 | // --- Font state --- |
| 44 | |
| 45 | /** @var array<string, array<int, int>> Font name → char code → Unicode codepoint */ |
| 46 | private array $fontMaps = []; |
| 47 | |
| 48 | /** @var array<string, bool> Font names using 2-byte CID encoding */ |
| 49 | private array $cidFonts = []; |
| 50 | |
| 51 | /** @var array<string, array<int, float>> Font name → char code → width in 1/1000 units */ |
| 52 | private array $fontWidths = []; |
| 53 | |
| 54 | /** @var array<string, float> Font name → default/missing width in 1/1000 units */ |
| 55 | private array $fontDefaultWidths = []; |
| 56 | |
| 57 | /** @var array<string, string|null> Font name → base font (PostScript) name */ |
| 58 | private array $fontBaseNames = []; |
| 59 | |
| 60 | // --- Graphics state --- |
| 61 | |
| 62 | /** Current transformation matrix [a, b, c, d, e, f] */ |
| 63 | private array $ctm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]; |
| 64 | |
| 65 | /** @var list<array{ctm: array<float>}> */ |
| 66 | private array $graphicsStateStack = []; |
| 67 | |
| 68 | // --- Text state (persists across BT/ET) --- |
| 69 | |
| 70 | private float $charSpacing = 0.0; // Tc |
| 71 | private float $wordSpacing = 0.0; // Tw |
| 72 | private float $horizontalScaling = 100.0; // Tz (percentage) |
| 73 | private float $textLeading = 0.0; // TL |
| 74 | private float $textRise = 0.0; // Ts |
| 75 | private string $currentFont = ''; |
| 76 | private float $fontSize = 12.0; |
| 77 | |
| 78 | // --- Text object state (reset at BT) --- |
| 79 | |
| 80 | /** Text matrix [a, b, c, d, e, f] */ |
| 81 | private array $textMatrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]; |
| 82 | |
| 83 | /** Text line matrix [a, b, c, d, e, f] */ |
| 84 | private array $textLineMatrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]; |
| 85 | |
| 86 | // --- XObject state --- |
| 87 | |
| 88 | private ?PdfDictionary $currentResources = null; |
| 89 | private int $xObjectDepth = 0; |
| 90 | private const MAX_XOBJECT_DEPTH = 10; |
| 91 | |
| 92 | // --- Marked content --- |
| 93 | |
| 94 | /** @var list<array{actualText: string|null}> */ |
| 95 | private array $markedContentStack = []; |
| 96 | private bool $suppressText = false; |
| 97 | |
/**
 * Create an extractor that resolves indirect objects through $resolver.
 *
 * A private ContentStreamParser instance is created alongside it and is
 * reused for every page and Form XObject parsed by this extractor.
 */
public function __construct(ObjectResolver $resolver)
{
    $this->resolver = $resolver;
    $this->parser = new ContentStreamParser();
}
| 103 | |
/**
 * Extract positioned text spans from a page dictionary.
 *
 * The interpreter state (CTM, saved graphics states, text state, text
 * matrices, marked-content tracking and XObject nesting depth) is reset to
 * its initial values first, so that one extractor instance can be reused
 * for several pages without state from a previous page (for example an
 * unbalanced `q` or a `cm` outside `q`/`Q`) affecting the next one.
 *
 * @param PdfDictionary $page The page dictionary to read /Resources and content from.
 * @return list<TextSpan> Spans in content-stream order; empty when the page has no content data.
 */
public function extractFromPage(PdfDictionary $page): array
{
    // Every page starts from the default graphics and text state.
    $identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
    $this->ctm = $identity;
    $this->graphicsStateStack = [];
    $this->charSpacing = 0.0;
    $this->wordSpacing = 0.0;
    $this->horizontalScaling = 100.0;
    $this->textLeading = 0.0;
    $this->textRise = 0.0;
    $this->currentFont = '';
    $this->fontSize = 12.0;
    $this->textMatrix = $identity;
    $this->textLineMatrix = $identity;
    $this->markedContentStack = [];
    $this->suppressText = false;
    $this->xObjectDepth = 0;

    $this->loadFontData($page);

    $resources = $this->resolveValue($page->get('Resources'));
    $this->currentResources = $resources instanceof PdfDictionary ? $resources : null;

    $data = $this->getContentStreamData($page);
    if ($data === '') {
        return [];
    }

    return $this->processOps($this->parser->parse($data));
}
| 124 | |
/**
 * Interpret parsed content stream operations and collect positioned text spans.
 *
 * Handles graphics state (q, Q, cm), text objects (BT, ET), text state
 * (Tc, Tw, Tz, TL, Ts, Tf), text positioning (Td, TD, Tm, T*), marked
 * content (BMC, BDC, EMC), text showing (Tj, TJ, ', ") and XObject
 * invocation (Do). Operators not listed are ignored, as are operators
 * that arrive with fewer operands than they need.
 *
 * @param list<ContentStreamOp> $ops
 * @return list<TextSpan>
 */
private function processOps(array $ops): array
{
    $spans = [];

    foreach ($ops as $op) {
        $operands = $op->operands;
        $argc = count($operands);

        switch ($op->operator) {
            // --- Graphics state ---
            case 'q':
                // NOTE(review): only the CTM is saved; text state parameters
                // (Tc, Tw, Tz, TL, Ts, Tf) are not restored by Q here.
                $this->graphicsStateStack[] = ['ctm' => $this->ctm];
                break;

            case 'Q':
                // An unbalanced Q (empty stack) is ignored.
                if (!empty($this->graphicsStateStack)) {
                    $saved = array_pop($this->graphicsStateStack);
                    $this->ctm = $saved['ctm'];
                }
                break;

            case 'cm':
                if ($argc >= 6) {
                    $m = array_map(floatval(...), array_slice($operands, 0, 6));
                    $this->ctm = $this->multiplyMatrices($m, $this->ctm);
                }
                break;

            // --- Text object ---
            case 'BT':
                $this->textMatrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
                $this->textLineMatrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
                // NOTE(review): this also discards marked-content sequences
                // opened before BT, so an /ActualText BDC that encloses the
                // whole text object is not applied — confirm this is intended.
                $this->markedContentStack = [];
                $this->suppressText = false;
                break;

            case 'ET':
                break;

            // --- Text state operators ---
            case 'Tc':
                if ($argc >= 1) {
                    $this->charSpacing = (float) $operands[0];
                }
                break;

            case 'Tw':
                if ($argc >= 1) {
                    $this->wordSpacing = (float) $operands[0];
                }
                break;

            case 'Tz':
                if ($argc >= 1) {
                    $this->horizontalScaling = (float) $operands[0];
                }
                break;

            case 'TL':
                if ($argc >= 1) {
                    $this->textLeading = (float) $operands[0];
                }
                break;

            case 'Ts':
                if ($argc >= 1) {
                    $this->textRise = (float) $operands[0];
                }
                break;

            case 'Tf':
                if ($argc >= 2) {
                    $this->currentFont = $this->decodeName(ltrim($operands[0], '/'));
                    $this->fontSize = (float) $operands[1];
                }
                break;

            // --- Text positioning ---
            case 'Td':
                if ($argc >= 2) {
                    $this->translateTextLine((float) $operands[0], (float) $operands[1]);
                }
                break;

            case 'TD':
                if ($argc >= 2) {
                    $tx = (float) $operands[0];
                    $ty = (float) $operands[1];
                    // TD additionally sets the leading to -ty.
                    $this->textLeading = -$ty;
                    $this->translateTextLine($tx, $ty);
                }
                break;

            case 'Tm':
                if ($argc >= 6) {
                    $tm = array_map(floatval(...), array_slice($operands, 0, 6));
                    $this->textMatrix = $tm;
                    $this->textLineMatrix = $tm;
                }
                break;

            case 'T*':
                $this->translateTextLine(0.0, -$this->textLeading);
                break;

            // --- Marked content ---
            case 'BMC':
                $this->markedContentStack[] = ['actualText' => null];
                break;

            case 'BDC':
                $actualText = $argc >= 2 ? $this->extractActualText($operands[1]) : null;
                $this->markedContentStack[] = ['actualText' => $actualText];
                if ($actualText !== null) {
                    // Glyphs inside the sequence are replaced by the ActualText.
                    $this->suppressText = true;
                }
                break;

            case 'EMC':
                if (!empty($this->markedContentStack)) {
                    $entry = array_pop($this->markedContentStack);
                    if ($entry['actualText'] !== null) {
                        // Emit the ActualText as a span at the current position.
                        $span = $this->buildSpanForText($entry['actualText']);
                        if ($span !== null) {
                            $spans[] = $span;
                        }
                        // Stay suppressed while an enclosing sequence still
                        // carries its own ActualText.
                        $this->suppressText = false;
                        foreach ($this->markedContentStack as $stackEntry) {
                            if ($stackEntry['actualText'] !== null) {
                                $this->suppressText = true;
                                break;
                            }
                        }
                    }
                }
                break;

            // --- Text showing operators ---
            case 'Tj':
                if (!$this->suppressText && $argc >= 1) {
                    $span = $this->showString($operands[0]);
                    if ($span !== null) {
                        $spans[] = $span;
                    }
                }
                break;

            case 'TJ':
                if (!$this->suppressText && $argc >= 1) {
                    array_push($spans, ...$this->showTJArray($operands[0]));
                }
                break;

            case "'":
                // T* then Tj; the line move happens even without an operand.
                $this->translateTextLine(0.0, -$this->textLeading);
                if (!$this->suppressText && $argc >= 1) {
                    $span = $this->showString($operands[0]);
                    if ($span !== null) {
                        $spans[] = $span;
                    }
                }
                break;

            case '"':
                // Set Tw, Tc, then T* then Tj.
                if ($argc >= 3) {
                    $this->wordSpacing = (float) $operands[0];
                    $this->charSpacing = (float) $operands[1];
                    $this->translateTextLine(0.0, -$this->textLeading);
                    if (!$this->suppressText) {
                        $span = $this->showString($operands[2]);
                        if ($span !== null) {
                            $spans[] = $span;
                        }
                    }
                }
                break;

            // --- XObject invocation ---
            case 'Do':
                if ($argc >= 1) {
                    // Decode #XX escapes exactly as Tf does, so an escaped
                    // name in the content stream matches its resource key.
                    $xobjectName = $this->decodeName(ltrim($operands[0], '/'));
                    array_push($spans, ...$this->extractFromXObject($xobjectName));
                }
                break;
        }
    }

    return $spans;
}

/**
 * Translate the text line matrix by (tx, ty) and start the new line there.
 *
 * Shared by Td, TD, T*, ' and ": Tlm = [1 0 0 1 tx ty] × Tlm, then Tm = Tlm.
 */
private function translateTextLine(float $tx, float $ty): void
{
    $this->textLineMatrix = $this->multiplyMatrices(
        [1.0, 0.0, 0.0, 1.0, $tx, $ty],
        $this->textLineMatrix,
    );
    $this->textMatrix = $this->textLineMatrix;
}
| 336 | |
/**
 * Paint one string operand (Tj, ', ") and move the text matrix past it.
 *
 * The span origin is taken from the text matrix before it is advanced.
 * The span width is the absolute value of the string displacement.
 *
 * @return TextSpan|null Null when the operand decodes to an empty string,
 *                       in which case the text matrix is left untouched.
 */
private function showString(string $operand): ?TextSpan
{
    $rawBytes = $this->parseStringOperand($operand);
    $decoded = $this->mapBytesToUnicode($rawBytes);
    if ($decoded === '') {
        return null;
    }

    // Origin and size are sampled before the matrix moves.
    [$originX, $originY] = $this->textToUserSpace($this->textMatrix);
    $advance = $this->computeStringDisplacement($rawBytes);

    $result = new TextSpan(
        text: $decoded,
        x: $originX,
        y: $originY,
        width: abs($advance),
        height: $this->getEffectiveFontSize(),
        fontSize: $this->fontSize,
        fontName: $this->currentFont,
    );

    $this->advanceTextMatrix($advance);

    return $result;
}
| 377 | |
/**
 * Process a TJ array: [(string) num (string) num ...]
 *
 * String elements (literal or hex) contribute text to the current run and
 * advance the text matrix. Numeric elements adjust the position: they are
 * in thousandths of a unit of text space and are subtracted from the
 * current position, so a negative number moves right (opens a gap) and a
 * positive number moves left (tightens).
 *
 * Adjacent strings separated only by small adjustments are merged into one
 * span. An adjustment below -100 (a gap wider than a tenth of the font
 * size) is treated as a word break: the current run is flushed and the
 * next string starts a new span.
 *
 * @param string $arrayStr Raw array operand, with or without the enclosing brackets.
 * @return list<TextSpan>
 */
private function showTJArray(string $arrayStr): array
{
    $spans = [];
    $arrayStr = trim($arrayStr, '[] ');
    $len = strlen($arrayStr);
    $pos = 0;

    // Current run being merged.
    $runText = '';
    $runStartX = 0.0;
    $runStartY = 0.0;
    $runWidth = 0.0;
    $runStarted = false;

    while ($pos < $len) {
        $ch = $arrayStr[$pos];

        if ($ch === ' ' || $ch === "\n" || $ch === "\r" || $ch === "\t") {
            $pos++;
            continue;
        }

        if ($ch === '(' || $ch === '<') {
            // String element. The extract* helpers move $pos past the string.
            if ($ch === '(') {
                $bytes = $this->unescapeLiteralString($this->extractLiteralString($arrayStr, $pos));
            } else {
                $bytes = $this->hexDigitsToBytes($this->extractHexString($arrayStr, $pos));
            }

            $text = $this->mapBytesToUnicode($bytes);
            if ($text === '') {
                continue;
            }

            if (!$runStarted) {
                [$runStartX, $runStartY] = $this->textToUserSpace($this->textMatrix);
                $runStarted = true;
            }

            $displacement = $this->computeStringDisplacement($bytes);
            $runText .= $text;
            $runWidth += $displacement;
            $this->advanceTextMatrix($displacement);
            continue;
        }

        if ($ch === '-' || $ch === '+' || $ch === '.' || ($ch >= '0' && $ch <= '9')) {
            // Numeric adjustment.
            $numStr = '';
            while ($pos < $len && ($arrayStr[$pos] === '-' || $arrayStr[$pos] === '+'
                || $arrayStr[$pos] === '.' || ($arrayStr[$pos] >= '0' && $arrayStr[$pos] <= '9'))) {
                $numStr .= $arrayStr[$pos];
                $pos++;
            }
            $num = (float) $numStr;

            // Thousandths of text space, subtracted from the position.
            $displacement = -$num / 1000.0 * $this->fontSize * ($this->horizontalScaling / 100.0);

            if ($runStarted) {
                if ($num < -100.0) {
                    // Large negative adjustment = word gap: close the run so
                    // the next string starts a new span after the gap.
                    if ($runText !== '') {
                        $spans[] = $this->buildRunSpan($runText, $runStartX, $runStartY, $runWidth);
                    }
                    $runText = '';
                    $runWidth = 0.0;
                    $runStarted = false;
                } else {
                    // Kerning-sized adjustment stays inside the run.
                    $runWidth += $displacement;
                }
            }

            $this->advanceTextMatrix($displacement);
            continue;
        }

        // Anything else is skipped one byte at a time.
        $pos++;
    }

    // Flush remaining run.
    if ($runStarted && $runText !== '') {
        $spans[] = $this->buildRunSpan($runText, $runStartX, $runStartY, $runWidth);
    }

    return $spans;
}

/**
 * Decode the digits of a PDF hex string into raw bytes.
 *
 * Whitespace is ignored and an odd number of digits is completed with a
 * trailing 0. Anything that is not purely hexadecimal yields ''.
 * A strict check is used instead of `?:` because the decoded string "0"
 * (hex 30) is falsy in PHP and must not be dropped.
 */
private function hexDigitsToBytes(string $hex): string
{
    $digits = preg_replace('/\s+/', '', $hex) ?? $hex;
    if ($digits === '' || !ctype_xdigit($digits)) {
        return '';
    }
    if (strlen($digits) % 2 !== 0) {
        $digits .= '0';
    }

    $decoded = hex2bin($digits);

    return $decoded === false ? '' : $decoded;
}

/**
 * Build the span for a finished TJ run using the current font state.
 */
private function buildRunSpan(string $text, float $x, float $y, float $width): TextSpan
{
    return new TextSpan(
        text: $text,
        x: $x,
        y: $y,
        width: abs($width),
        height: $this->getEffectiveFontSize(),
        fontSize: $this->fontSize,
        fontName: $this->currentFont,
    );
}
| 500 | |
/**
 * Create a span for already-decoded text at the current text matrix position.
 *
 * Used for /ActualText, where no raw character codes are available to
 * measure. The width is an estimate: character count × the current font's
 * default width (500 when none is recorded) × font size × horizontal scaling.
 *
 * @return TextSpan|null Null when $text is empty.
 */
private function buildSpanForText(string $text): ?TextSpan
{
    if ($text === '') {
        return null;
    }

    [$originX, $originY] = $this->textToUserSpace($this->textMatrix);
    $height = $this->getEffectiveFontSize();

    $glyphCount = mb_strlen($text, 'UTF-8');
    $perGlyph = $this->fontDefaultWidths[$this->currentFont] ?? 500.0;
    $horizontalFactor = $this->horizontalScaling / 100.0;
    $approxWidth = $glyphCount * $perGlyph / 1000.0 * $this->fontSize * $horizontalFactor;

    return new TextSpan(
        text: $text,
        x: $originX,
        y: $originY,
        width: abs($approxWidth),
        height: $height,
        fontSize: $this->fontSize,
        fontName: $this->currentFont,
    );
}
| 532 | |
| 533 | // ----------------------------------------------------------------------- |
| 534 | // Text displacement computation |
| 535 | // ----------------------------------------------------------------------- |
| 536 | |
/**
 * Sum the horizontal displacement of every character code in a string.
 *
 * For each code: tx = (w0 × Tfs + Tc) × Th, where w0 is the glyph width in
 * thousandths (falling back to the font's default width, or 1000 for CID
 * fonts / 500 otherwise when none is recorded). Single-byte code 32
 * additionally receives Tw × Th. CID fonts are read as big-endian 2-byte
 * codes; a trailing odd byte is ignored.
 *
 * The result is in text-space units; the caller applies it to the text matrix.
 */
private function computeStringDisplacement(string $bytes): float
{
    $font = $this->currentFont;
    $twoByte = $this->cidFonts[$font] ?? false;
    $widthTable = $this->fontWidths[$font] ?? [];
    $fallbackWidth = $this->fontDefaultWidths[$font] ?? ($twoByte ? 1000.0 : 500.0);
    $scale = $this->horizontalScaling / 100.0;

    $stride = $twoByte ? 2 : 1;
    $byteCount = strlen($bytes);
    $sum = 0.0;

    for ($offset = 0; $offset + $stride <= $byteCount; $offset += $stride) {
        $code = $twoByte
            ? (ord($bytes[$offset]) << 8) | ord($bytes[$offset + 1])
            : ord($bytes[$offset]);

        $glyphWidth = ($widthTable[$code] ?? $fallbackWidth) / 1000.0;
        $advance = ($glyphWidth * $this->fontSize + $this->charSpacing) * $scale;

        // Word spacing only applies to the single-byte space code.
        if (!$twoByte && $code === 32) {
            $advance += $this->wordSpacing * $scale;
        }

        $sum += $advance;
    }

    return $sum;
}
| 577 | |
/**
 * Shift the text matrix along its own x axis by $tx.
 *
 * Equivalent to Tm = [1 0 0 1 tx 0] × Tm: only the translation components
 * change, by tx times the first row (a, b) of the matrix.
 */
private function advanceTextMatrix(float $tx): void
{
    [$a, $b] = $this->textMatrix;

    $this->textMatrix[4] += $tx * $a;
    $this->textMatrix[5] += $tx * $b;
}
| 587 | |
/**
 * Map the origin of a text matrix to user space.
 *
 * The text rise is first added along the matrix's y axis (its c, d row),
 * then the result is multiplied by the CTM; the translation components of
 * that product are the returned point.
 *
 * @param array<float> $tm Text matrix [a, b, c, d, e, f]
 * @return array{float, float} [x, y]
 */
private function textToUserSpace(array $tm): array
{
    $rise = $this->textRise;

    $shifted = $tm;
    $shifted[4] = $tm[4] + $rise * $tm[2];
    $shifted[5] = $tm[5] + $rise * $tm[3];

    $rendering = $this->multiplyMatrices($shifted, $this->ctm);

    return [$rendering[4], $rendering[5]];
}
| 608 | |
/**
 * Approximate the font size as it appears in user space.
 *
 * Multiplies the nominal font size by the length of the y-axis row (c, d)
 * of the text matrix and by the length of the same row of the CTM.
 */
private function getEffectiveFontSize(): float
{
    [, , $tmC, $tmD] = $this->textMatrix;
    [, , $ctmC, $ctmD] = $this->ctm;

    $textScale = sqrt($tmC * $tmC + $tmD * $tmD);
    $pageScale = sqrt($ctmC * $ctmC + $ctmD * $ctmD);

    return abs($this->fontSize * $textScale * $pageScale);
}
| 621 | |
| 622 | // ----------------------------------------------------------------------- |
| 623 | // Matrix math |
| 624 | // ----------------------------------------------------------------------- |
| 625 | |
/**
 * Compute the product m1 × m2 of two affine matrices in [a, b, c, d, e, f] form.
 *
 * Each array stands for the 3×3 matrix
 *
 *   [a b 0]
 *   [c d 0]
 *   [e f 1]
 *
 * @param array<float> $m1 Left operand
 * @param array<float> $m2 Right operand
 * @return array<float> The product, again as [a, b, c, d, e, f]
 */
private function multiplyMatrices(array $m1, array $m2): array
{
    [$a1, $b1, $c1, $d1, $e1, $f1] = $m1;
    [$a2, $b2, $c2, $d2, $e2, $f2] = $m2;

    return [
        $a1 * $a2 + $b1 * $c2,
        $a1 * $b2 + $b1 * $d2,
        $c1 * $a2 + $d1 * $c2,
        $c1 * $b2 + $d1 * $d2,
        $e1 * $a2 + $f1 * $c2 + $e2,
        $e1 * $b2 + $f1 * $d2 + $f2,
    ];
}
| 648 | |
| 649 | // ----------------------------------------------------------------------- |
| 650 | // Form XObject handling |
| 651 | // ----------------------------------------------------------------------- |
| 652 | |
/**
 * Extract positioned text from a Form XObject named in the current resources.
 *
 * Returns an empty list when there are no current resources, the nesting
 * limit (MAX_XOBJECT_DEPTH) is reached, the name is unknown, or the target
 * is not a stream with /Subtype /Form.
 *
 * While the form is processed its /Matrix is concatenated onto the CTM and
 * its own /Resources (when present) replace the current ones. Font tables,
 * text state, resources and CTM are restored afterwards, including when
 * parsing or processing throws, so the form behaves as if it were wrapped
 * in q ... Q.
 *
 * @param string $name XObject resource name without the leading slash.
 * @return list<TextSpan>
 */
private function extractFromXObject(string $name): array
{
    if ($this->currentResources === null || $this->xObjectDepth >= self::MAX_XOBJECT_DEPTH) {
        return [];
    }

    $xobjects = $this->resolveValue($this->currentResources->get('XObject'));
    if (!$xobjects instanceof PdfDictionary) {
        return [];
    }

    $xobjRef = $xobjects->get($name);
    if ($xobjRef === null) {
        return [];
    }

    $xobj = $this->resolveValue($xobjRef);
    if (!$xobj instanceof PdfStream) {
        return [];
    }

    // /Subtype may be stored as an indirect reference.
    $subtype = $this->resolveValue($xobj->dictionary->get('Subtype'));
    if (!$subtype instanceof PdfName || $subtype->value !== 'Form') {
        return [];
    }

    // Save state
    $savedFontMaps = $this->fontMaps;
    $savedCidFonts = $this->cidFonts;
    $savedFontWidths = $this->fontWidths;
    $savedDefaultWidths = $this->fontDefaultWidths;
    $savedBaseNames = $this->fontBaseNames;
    $savedFont = $this->currentFont;
    $savedFontSize = $this->fontSize;
    $savedCharSpacing = $this->charSpacing;
    $savedWordSpacing = $this->wordSpacing;
    $savedHorizontalScaling = $this->horizontalScaling;
    $savedTextLeading = $this->textLeading;
    $savedTextRise = $this->textRise;
    $savedResources = $this->currentResources;
    $savedCtm = $this->ctm;

    $this->xObjectDepth++;

    try {
        // Apply the XObject's matrix if present
        $matrix = $this->resolveValue($xobj->dictionary->get('Matrix'));
        if ($matrix instanceof PdfArray && count($matrix->items) >= 6) {
            $m = [];
            foreach ($matrix->items as $item) {
                $m[] = $item instanceof PdfNumber ? (float) $item->toPdf() : 0.0;
            }
            $this->ctm = $this->multiplyMatrices(array_slice($m, 0, 6), $this->ctm);
        }

        // Load XObject's resources; without them the caller's stay in effect.
        $xobjResources = $this->resolveValue($xobj->dictionary->get('Resources'));
        if ($xobjResources instanceof PdfDictionary) {
            $this->currentResources = $xobjResources;
            $this->loadFontDataFromResources($xobjResources);
        }

        if ($xobj->data === '') {
            return [];
        }

        return $this->processOps($this->parser->parse($xobj->data));
    } finally {
        // Restore state even if parsing or processing failed.
        $this->xObjectDepth--;
        $this->fontMaps = $savedFontMaps;
        $this->cidFonts = $savedCidFonts;
        $this->fontWidths = $savedFontWidths;
        $this->fontDefaultWidths = $savedDefaultWidths;
        $this->fontBaseNames = $savedBaseNames;
        $this->currentFont = $savedFont;
        $this->fontSize = $savedFontSize;
        $this->charSpacing = $savedCharSpacing;
        $this->wordSpacing = $savedWordSpacing;
        $this->horizontalScaling = $savedHorizontalScaling;
        $this->textLeading = $savedTextLeading;
        $this->textRise = $savedTextRise;
        $this->currentResources = $savedResources;
        $this->ctm = $savedCtm;
    }
}
| 734 | |
| 735 | // ----------------------------------------------------------------------- |
| 736 | // Font data loading |
| 737 | // ----------------------------------------------------------------------- |
| 738 | |
/**
 * Decode PDF name `#XX` hex escapes (PDF 1.2+) so a content-stream name
 * like `/*Courier#20New` matches the literal-space resource key.
 *
 * A `#` that is not followed by two hex digits is left as it is. If the
 * regex engine reports an error the name is returned unchanged rather
 * than letting a null escape the `string` return type.
 *
 * @param string $name Name without the leading slash.
 * @return string The name with every `#XX` replaced by the byte 0xXX.
 */
private function decodeName(string $name): string
{
    // Fast path: nothing to decode.
    if (!str_contains($name, '#')) {
        return $name;
    }

    $decoded = preg_replace_callback(
        '/#([0-9A-Fa-f]{2})/',
        static fn(array $m): string => chr((int) hexdec($m[1])),
        $name,
    );

    return $decoded ?? $name;
}
| 751 | |
/**
 * Rebuild all per-font tables from the page's /Resources.
 *
 * Encoding maps, CID flags, glyph widths, default widths and base font
 * names are cleared first, then refilled when the page has a resource
 * dictionary. A page without one leaves every table empty.
 */
private function loadFontData(PdfDictionary $page): void
{
    $this->fontMaps = $this->cidFonts = $this->fontWidths = [];
    $this->fontDefaultWidths = $this->fontBaseNames = [];

    $pageResources = $this->resolveValue($page->get('Resources'));
    if (!$pageResources instanceof PdfDictionary) {
        return;
    }

    $this->loadFontDataFromResources($pageResources);
}
| 768 | |
/**
 * Load encoding maps and glyph widths for every font in a resource dictionary.
 *
 * For each entry of /Font that resolves to a dictionary this records
 * whether it is a Type0 (2-byte CID) font, its /BaseFont name, its
 * character-code → Unicode map and its glyph widths. Entries that do not
 * resolve to a dictionary are skipped. Existing tables are added to, not
 * cleared.
 */
private function loadFontDataFromResources(PdfDictionary $resources): void
{
    $fonts = $this->resolveValue($resources->get('Font'));
    if (!$fonts instanceof PdfDictionary) {
        return;
    }

    $cmapParser = new CMapParser();

    foreach ($fonts->entries as $resourceKey => $fontRef) {
        // PHP turns numeric-string array keys such as "1" into integers;
        // the loaders below require a string under strict_types.
        $fontName = (string) $resourceKey;

        $fontDict = $this->resolveValue($fontRef);
        if (!$fontDict instanceof PdfDictionary) {
            continue;
        }

        // /Subtype may be an indirect reference; anything that is not a
        // name is treated as absent so the ?PdfName parameter is honoured.
        $subtype = $this->resolveValue($fontDict->get('Subtype'));
        if (!$subtype instanceof PdfName) {
            $subtype = null;
        }

        $isType0 = $subtype !== null && $subtype->value === 'Type0';
        if ($isType0) {
            $this->cidFonts[$fontName] = true;
        }

        // Base font name (for standard font metric lookup)
        $baseFont = $this->resolveValue($fontDict->get('BaseFont'));
        $baseFontName = $baseFont instanceof PdfName ? $baseFont->value : null;
        $this->fontBaseNames[$fontName] = $baseFontName;

        // --- Load encoding map (same as TextExtractor) ---
        $this->loadEncodingMap($fontName, $fontDict, $cmapParser, $subtype);

        // --- Load glyph widths ---
        $this->loadGlyphWidths($fontName, $fontDict, $isType0, $baseFontName);
    }
}
| 802 | |
/**
 * Store the character-code → Unicode map for one font.
 *
 * The first source that yields a non-empty map wins:
 *   1. the /ToUnicode CMap stream,
 *   2. the /Encoding entry (via buildEncodingMap),
 *   3. for Type1, MMType1 and TrueType fonts, a built-in table
 *      (WinAnsi for TrueType, StandardEncoding otherwise) translated
 *      through the glyph list.
 * When none applies, no map is stored for the font.
 */
private function loadEncodingMap(
    string $fontName,
    PdfDictionary $fontDict,
    CMapParser $cmapParser,
    ?PdfName $subtype,
): void {
    // 1. ToUnicode CMap
    $toUnicodeEntry = $fontDict->get('ToUnicode');
    $cmapStream = $toUnicodeEntry !== null ? $this->resolveValue($toUnicodeEntry) : null;
    if ($cmapStream instanceof PdfStream && $cmapStream->data !== '') {
        $fromCMap = $cmapParser->parse($cmapStream->data);
        if (!empty($fromCMap)) {
            $this->fontMaps[$fontName] = $fromCMap;
            return;
        }
    }

    // 2. /Encoding (named encoding and/or /Differences)
    $encodingEntry = $fontDict->get('Encoding');
    if ($encodingEntry !== null) {
        $fromEncoding = $this->buildEncodingMap($encodingEntry);
        if (!empty($fromEncoding)) {
            $this->fontMaps[$fontName] = $fromEncoding;
            return;
        }
    }

    // 3. Built-in fallback, simple fonts only
    $isSimpleFont = $subtype instanceof PdfName
        && in_array($subtype->value, ['Type1', 'MMType1', 'TrueType'], true);
    if (!$isSimpleFont) {
        return;
    }

    $codeToGlyph = $subtype->value === 'TrueType'
        ? WinAnsiTable::getTable()
        : StandardEncodingTable::getTable();
    $glyphToUnicode = GlyphList::getList();

    $fallback = [];
    foreach ($codeToGlyph as $code => $glyphName) {
        $codepoint = $glyphToUnicode[$glyphName] ?? null;
        if ($codepoint !== null) {
            $fallback[$code] = $codepoint;
        }
    }

    if (!empty($fallback)) {
        $this->fontMaps[$fontName] = $fallback;
    }
}
| 849 | |
/**
 * Load glyph widths for one font into $this->fontWidths and
 * $this->fontDefaultWidths.
 *
 * The first source that applies wins:
 *   1. /Widths + /FirstChar (simple fonts)
 *   2. /DescendantFonts[0] → /W and /DW (Type0 fonts)
 *   3. Standard 14 font metrics
 *   4. FontDescriptor /MissingWidth (see loadDefaultWidth())
 *
 * Every dictionary value is passed through resolveValue() before its type
 * is checked, because any of them may be stored as an indirect reference.
 */
private function loadGlyphWidths(
    string $fontName,
    PdfDictionary $fontDict,
    bool $isType0,
    ?string $baseFontName,
): void {
    // 1. /Widths array (simple fonts: Type1, TrueType)
    $widthsArr = $this->resolveValue($fontDict->get('Widths'));
    $firstChar = $this->resolveValue($fontDict->get('FirstChar'));
    if ($widthsArr instanceof PdfArray && $firstChar instanceof PdfNumber) {
        $fc = (int) $firstChar->toPdf();
        $widths = [];
        foreach ($widthsArr->items as $i => $w) {
            $w = $this->resolveValue($w);
            if ($w instanceof PdfNumber) {
                // Array index is relative to /FirstChar.
                $widths[$fc + $i] = (float) $w->toPdf();
            }
        }
        if (!empty($widths)) {
            $this->fontWidths[$fontName] = $widths;
            $this->loadDefaultWidth($fontName, $fontDict, $baseFontName);
            return;
        }
    }

    // 2. CID font /W and /DW from /DescendantFonts
    if ($isType0) {
        $descendants = $this->resolveValue($fontDict->get('DescendantFonts'));
        if ($descendants instanceof PdfArray && !empty($descendants->items)) {
            $cidFontDict = $this->resolveValue($descendants->items[0]);
            if ($cidFontDict instanceof PdfDictionary) {
                $this->loadCidWidths($fontName, $cidFontDict);

                // The descendant's /BaseFont replaces the recorded base name.
                $cidBaseFont = $this->resolveValue($cidFontDict->get('BaseFont'));
                if ($cidBaseFont instanceof PdfName) {
                    $baseFontName = $cidBaseFont->value;
                    $this->fontBaseNames[$fontName] = $baseFontName;
                }

                // /DW default width; 1000 when the entry is absent.
                $dw = $this->resolveValue($cidFontDict->get('DW'));
                $this->fontDefaultWidths[$fontName] = $dw instanceof PdfNumber
                    ? (float) $dw->toPdf()
                    : 1000.0;
                return;
            }
        }
    }

    // 3. Standard 14 font metrics
    if ($baseFontName !== null) {
        $this->tryLoadStandardFontWidths($fontName, $baseFontName);
        if (isset($this->fontWidths[$fontName])) {
            return;
        }
    }

    // 4. FontDescriptor /MissingWidth fallback
    $this->loadDefaultWidth($fontName, $fontDict, $baseFontName);
}
| 922 | |
/**
 * Load a CID font's /W array into $this->fontWidths[$fontName].
 *
 * /W is a flat sequence of groups in one of two forms:
 *   - cid [w1 w2 ...]        consecutive widths starting at cid
 *   - cidFirst cidLast w     one width for an inclusive CID range
 *
 * The array and its elements are passed through resolveValue() because
 * they may be indirect references. Elements that fit neither form are
 * skipped one at a time. Nothing is stored when no width was read.
 */
private function loadCidWidths(string $fontName, PdfDictionary $cidFontDict): void
{
    $wArray = $this->resolveValue($cidFontDict->get('W'));
    if (!$wArray instanceof PdfArray) {
        return;
    }

    // This class decodes Type0 strings as two-byte codes (see
    // mapBytesToUnicode()), so range entries outside 0..0xFFFF are presumed
    // unreachable. Clamping also stops a malformed range such as
    // "0 2147483647 500" from allocating an enormous array.
    $maxCid = 0xFFFF;

    $widths = [];
    $items = $wArray->items;
    $count = count($items);
    $i = 0;

    while ($i < $count) {
        $first = $this->resolveValue($items[$i] ?? null);
        if (!$first instanceof PdfNumber) {
            $i++;
            continue;
        }
        $firstCid = (int) $first->toPdf();

        $second = $this->resolveValue($items[$i + 1] ?? null);
        if ($second instanceof PdfArray) {
            // cid [w1 w2 ...]
            foreach ($second->items as $j => $w) {
                $w = $this->resolveValue($w);
                if ($w instanceof PdfNumber) {
                    $widths[$firstCid + $j] = (float) $w->toPdf();
                }
            }
            $i += 2;
        } elseif ($second instanceof PdfNumber) {
            // cidFirst cidLast w
            $width = $this->resolveValue($items[$i + 2] ?? null);
            if ($width instanceof PdfNumber) {
                $w = (float) $width->toPdf();
                $from = max(0, $firstCid);
                $to = min($maxCid, (int) $second->toPdf());
                for ($c = $from; $c <= $to; $c++) {
                    $widths[$c] = $w;
                }
            }
            $i += 3;
        } else {
            $i++;
        }
    }

    if (!empty($widths)) {
        $this->fontWidths[$fontName] = $widths;
    }
}
| 976 | |
/**
 * Populate widths for a font from the built-in standard font metrics.
 *
 * A six-uppercase-letter subset prefix ("ABCDEF+") is removed from the
 * base font name before the lookup. If StandardFontMetrics does not know
 * the name, or no WinAnsi code maps to a glyph with a known width, nothing
 * is written. Otherwise both the per-code widths and the default width
 * are stored.
 */
private function tryLoadStandardFontWidths(string $fontName, string $baseFontName): void
{
    $lookupName = preg_replace('/^[A-Z]{6}\+/', '', $baseFontName) ?? $baseFontName;

    try {
        $metrics = StandardFontMetrics::get($lookupName);
    } catch (\InvalidArgumentException) {
        return;
    }

    // The metrics are keyed by glyph name; re-key them by WinAnsi character code.
    $byCode = [];
    foreach (WinAnsiTable::getTable() as $code => $glyphName) {
        if (isset($metrics->widths[$glyphName])) {
            $byCode[$code] = (float) $metrics->widths[$glyphName];
        }
    }

    if ($byCode === []) {
        return;
    }

    $this->fontWidths[$fontName] = $byCode;
    $this->fontDefaultWidths[$fontName] = $metrics->missingWidth;
}
| 1006 | |
/**
 * Ensure $this->fontDefaultWidths[$fontName] is set.
 *
 * An existing value is left untouched. Otherwise the value comes from,
 * in order: the FontDescriptor's /MissingWidth, the standard font metrics
 * for the (subset-prefix-stripped) base font name, or the constant 500.0.
 */
private function loadDefaultWidth(string $fontName, PdfDictionary $fontDict, ?string $baseFontName): void
{
    if (isset($this->fontDefaultWidths[$fontName])) {
        return;
    }

    // FontDescriptor /MissingWidth
    $descriptor = $this->resolveValue($fontDict->get('FontDescriptor'));
    $missingWidth = $descriptor instanceof PdfDictionary
        ? $descriptor->get('MissingWidth')
        : null;
    if ($missingWidth instanceof PdfNumber) {
        $this->fontDefaultWidths[$fontName] = (float) $missingWidth->toPdf();
        return;
    }

    // Standard font metrics, else the hard-coded last resort.
    $default = 500.0;
    if ($baseFontName !== null) {
        $lookupName = preg_replace('/^[A-Z]{6}\+/', '', $baseFontName) ?? $baseFontName;
        try {
            $default = StandardFontMetrics::get($lookupName)->missingWidth;
        } catch (\InvalidArgumentException) {
            // Not a standard font: keep the last-resort value.
        }
    }

    $this->fontDefaultWidths[$fontName] = $default;
}
| 1041 | |
| 1042 | // ----------------------------------------------------------------------- |
| 1043 | // String / encoding helpers (mirrored from TextExtractor) |
| 1044 | // ----------------------------------------------------------------------- |
| 1045 | |
/**
 * Decode a content-stream string operand into its raw bytes.
 *
 * - "<...>" is a hex string. Whitespace is ignored and an odd number of
 *   digits is padded with a trailing "0", as the PDF specification
 *   requires. Content that is not purely hexadecimal yields ''.
 * - "(...)" is a literal string, decoded by unescapeLiteralString().
 * - Anything else is returned trimmed and otherwise unchanged.
 */
private function parseStringOperand(string $operand): string
{
    $operand = trim($operand);

    if (str_starts_with($operand, '<') && str_ends_with($operand, '>')) {
        $hex = substr($operand, 1, -1);
        $hex = preg_replace('/\s+/', '', $hex) ?? $hex;

        // Checked up front so hex2bin() never sees input it would warn about.
        if ($hex === '' || !ctype_xdigit($hex)) {
            return '';
        }
        if (strlen($hex) % 2 === 1) {
            $hex .= '0';
        }

        // Compare against false explicitly: the decoded string "0" is falsy.
        $bytes = hex2bin($hex);
        return $bytes === false ? '' : $bytes;
    }

    if (str_starts_with($operand, '(') && str_ends_with($operand, ')')) {
        $inner = substr($operand, 1, -1);
        return $this->unescapeLiteralString($inner);
    }

    return $operand;
}
| 1063 | |
/**
 * Convert the bytes of a shown string to UTF-8 text using the current font.
 *
 * - Bytes that contain a value above 127 and already form valid UTF-8 are
 *   returned unchanged.
 * - With a font map on a CID font, bytes are read as big-endian two-byte
 *   codes (a trailing odd byte is ignored); unmapped codes become U+FFFD.
 * - With a font map on any other font, each byte is one code; unmapped
 *   codes are emitted as the code point equal to the byte value.
 * - Without a font map, winAnsiFallback() is used.
 */
private function mapBytesToUnicode(string $bytes): string
{
    if ($this->containsMultibyte($bytes) && mb_check_encoding($bytes, 'UTF-8')) {
        return $bytes;
    }

    $map = $this->fontMaps[$this->currentFont] ?? null;
    if ($map === null) {
        return $this->winAnsiFallback($bytes);
    }

    $length = strlen($bytes);
    $text = '';

    if ($this->cidFonts[$this->currentFont] ?? false) {
        // $offset points at the low byte of each pair.
        for ($offset = 1; $offset < $length; $offset += 2) {
            $code = (ord($bytes[$offset - 1]) << 8) | ord($bytes[$offset]);
            $text .= isset($map[$code]) ? mb_chr($map[$code], 'UTF-8') : "\u{FFFD}";
        }
        return $text;
    }

    for ($offset = 0; $offset < $length; $offset++) {
        $code = ord($bytes[$offset]);
        $text .= mb_chr($map[$code] ?? $code, 'UTF-8');
    }

    return $text;
}
| 1103 | |
/**
 * Report whether the string contains at least one byte above 127.
 */
private function containsMultibyte(string $bytes): bool
{
    // No "u" modifier: the pattern is matched byte by byte.
    return preg_match('/[\x80-\xFF]/', $bytes) === 1;
}
| 1114 | |
/**
 * Decode bytes as WinAnsi text when the current font has no map.
 *
 * Each byte is looked up in the WinAnsi table to get a glyph name, and the
 * glyph name in the glyph list to get a code point. A byte that cannot be
 * resolved that way is emitted as the code point equal to its value.
 * Both tables are cached in static locals after the first call.
 */
private function winAnsiFallback(string $bytes): string
{
    static $winAnsi = null;
    static $glyphList = null;
    $winAnsi ??= WinAnsiTable::getTable();
    $glyphList ??= GlyphList::getList();

    $text = '';
    $length = strlen($bytes);
    for ($offset = 0; $offset < $length; $offset++) {
        $code = ord($bytes[$offset]);
        $glyphName = $winAnsi[$code] ?? null;
        $codePoint = $glyphName === null ? null : ($glyphList[$glyphName] ?? null);
        $text .= mb_chr($codePoint ?? $code, 'UTF-8');
    }

    return $text;
}
| 1137 | |
/**
 * Build a character-code → Unicode code point map from a font's /Encoding.
 *
 * The value (and, for a dictionary, its /BaseEncoding, /Differences and the
 * /Differences elements) is passed through resolveValue() first, so
 * indirect references are followed.
 *
 * - A name selects one of the tables known to getNamedEncodingTable().
 * - A dictionary starts from its /BaseEncoding table, if any, and then
 *   applies /Differences: a number sets the current code, and each
 *   following name is assigned to the current code, which then advances.
 * - Any other value yields an empty map.
 *
 * Glyph names missing from the glyph list are left out of the map.
 *
 * @return array<int, int>
 */
private function buildEncodingMap(mixed $encoding): array
{
    $glyphList = GlyphList::getList();

    // Shared by the named-encoding and the /BaseEncoding paths.
    $namedTableToMap = function (string $name) use ($glyphList): array {
        $map = [];
        foreach ($this->getNamedEncodingTable($name) ?? [] as $code => $glyphName) {
            if (isset($glyphList[$glyphName])) {
                $map[$code] = $glyphList[$glyphName];
            }
        }
        return $map;
    };

    $encoding = $this->resolveValue($encoding);
    if ($encoding instanceof PdfName) {
        return $namedTableToMap($encoding->value);
    }
    if (!$encoding instanceof PdfDictionary) {
        return [];
    }

    $baseEnc = $this->resolveValue($encoding->get('BaseEncoding'));
    $map = $baseEnc instanceof PdfName ? $namedTableToMap($baseEnc->value) : [];

    $diffs = $this->resolveValue($encoding->get('Differences'));
    if ($diffs instanceof PdfArray) {
        $code = 0;
        foreach ($diffs->items as $item) {
            $item = $this->resolveValue($item);
            if ($item instanceof PdfNumber) {
                $code = (int) $item->toPdf();
            } elseif ($item instanceof PdfName) {
                if (isset($glyphList[$item->value])) {
                    $map[$code] = $glyphList[$item->value];
                }
                // The code advances even when the glyph name is unknown.
                $code++;
            }
        }
    }

    return $map;
}
| 1192 | |
/**
 * Look up the code → glyph-name table for a predefined encoding name.
 *
 * @return array<int, string>|null null when the name is not one of the
 *                                 four encodings handled here
 */
private function getNamedEncodingTable(string $name): ?array
{
    $tableClasses = [
        'WinAnsiEncoding' => WinAnsiTable::class,
        'MacRomanEncoding' => MacRomanTable::class,
        'StandardEncoding' => StandardEncodingTable::class,
        'MacExpertEncoding' => MacExpertEncodingTable::class,
    ];

    $tableClass = $tableClasses[$name] ?? null;
    if ($tableClass === null) {
        return null;
    }

    return $tableClass::getTable();
}
| 1204 | |
/**
 * Pull the /ActualText value out of an inline property dictionary.
 *
 * Returns null unless the operand starts with "<<" and carries an
 * /ActualText entry written as a literal "(...)" or hex "<...>" string.
 * Whitespace between the key and the string is optional, because "(" and
 * "<" are delimiters and may follow the name directly. A literal string is
 * unescaped; a hex string is decoded to bytes, with an odd digit count
 * padded by a trailing "0". The bytes are returned without any
 * character-set conversion.
 */
private function extractActualText(string $operand): ?string
{
    $operand = trim($operand);
    if (!str_starts_with($operand, '<<')) {
        return null;
    }

    if (preg_match('/\/ActualText\s*\(/', $operand, $matches, PREG_OFFSET_CAPTURE) === 1) {
        // The match ends on the opening parenthesis.
        $pos = (int) $matches[0][1] + strlen($matches[0][0]) - 1;
        $str = $this->extractLiteralString($operand, $pos);
        return $this->unescapeLiteralString($str);
    }

    if (preg_match('/\/ActualText\s*<([0-9A-Fa-f\s]+)>/', $operand, $matches) === 1) {
        $hex = preg_replace('/\s+/', '', $matches[1]) ?? $matches[1];
        if ($hex === '' || !ctype_xdigit($hex)) {
            return null;
        }
        if (strlen($hex) % 2 === 1) {
            $hex .= '0';
        }
        $bytes = hex2bin($hex);
        return $bytes !== false ? $bytes : null;
    }

    return null;
}
| 1230 | |
/**
 * Follow one indirect reference through the resolver; any value that is
 * not a PdfReference is returned as given.
 */
private function resolveValue(mixed $value): mixed
{
    return $value instanceof PdfReference
        ? $this->resolver->resolveReference($value)
        : $value;
}
| 1238 | |
/**
 * Return the page's content stream data as one string.
 *
 * /Contents may be a reference to a stream, a reference to an array, or a
 * direct array. For an array, every element that is a reference resolving
 * to a stream contributes its data followed by "\n"; other elements are
 * ignored. Any other shape of /Contents, including a missing entry,
 * yields ''.
 */
private function getContentStreamData(PdfDictionary $page): string
{
    $contents = $page->get('Contents');

    if ($contents instanceof PdfReference) {
        $contents = $this->resolver->resolveReference($contents);
        if ($contents instanceof PdfStream) {
            return $contents->data;
        }
    }

    if (!$contents instanceof PdfArray) {
        return '';
    }

    $chunks = [];
    foreach ($contents->items as $item) {
        if (!$item instanceof PdfReference) {
            continue;
        }
        $stream = $this->resolver->resolveReference($item);
        if ($stream instanceof PdfStream) {
            $chunks[] = $stream->data . "\n";
        }
    }

    return implode('', $chunks);
}
| 1273 | |
/**
 * Decode the backslash escapes of a PDF literal string body (the text
 * between the outer parentheses).
 *
 * Handles \n \r \t \b \f \( \) \\ and octal escapes of one to three digits
 * (via readOctalOrLiteral()). A backslash followed by an end-of-line
 * marker (LF, CR or CRLF) is a line continuation and produces no output.
 * A backslash before any other character is dropped and the character is
 * kept. A lone backslash at the very end is kept as is.
 */
private function unescapeLiteralString(string $str): string
{
    $result = '';
    $len = strlen($str);
    $i = 0;

    while ($i < $len) {
        $ch = $str[$i];
        if ($ch !== '\\' || $i + 1 >= $len) {
            $result .= $ch;
            $i++;
            continue;
        }

        $i++;
        $next = $str[$i];

        // Line continuation: swallow the end-of-line, treating CRLF as one.
        if ($next === "\r" || $next === "\n") {
            if ($next === "\r" && ($str[$i + 1] ?? '') === "\n") {
                $i++;
            }
            $i++;
            continue;
        }

        $result .= match ($next) {
            'n' => "\n",
            'r' => "\r",
            't' => "\t",
            'b' => "\x08",
            'f' => "\x0C",
            '(' => '(',
            ')' => ')',
            '\\' => '\\',
            // May advance $i past additional octal digits.
            default => $this->readOctalOrLiteral($str, $i, $next),
        };
        $i++;
    }

    return $result;
}
| 1304 | |
/**
 * Finish decoding an escape whose first character after the backslash is $ch.
 *
 * If $ch is an octal digit, up to two further octal digits are consumed
 * from $str starting at $i + 1 ($i is advanced past each one) and the byte
 * with that octal value is returned. Otherwise $ch is returned unchanged
 * and $i is not moved.
 */
private function readOctalOrLiteral(string $str, int &$i, string $ch): string
{
    if ($ch < '0' || $ch > '7') {
        return $ch;
    }

    $digits = $ch;
    $limit = strlen($str);
    $extra = 0;

    while ($extra < 2 && $i + 1 < $limit) {
        $candidate = $str[$i + 1];
        if ($candidate < '0' || $candidate > '7') {
            break;
        }
        $digits .= $candidate;
        $i++;
        $extra++;
    }

    return chr((int) octdec($digits));
}
| 1323 | |
/**
 * Read a parenthesised literal string from $data, starting at the opening
 * "(" located at $pos.
 *
 * Nested, balanced parentheses are kept in the result. A backslash and the
 * character after it are copied through untouched, so an escaped
 * parenthesis does not affect nesting; escapes are NOT decoded here. On
 * return $pos is just past the closing ")" or at the end of $data if the
 * string is unterminated.
 */
private function extractLiteralString(string $data, int &$pos): string
{
    $length = strlen($data);
    $buffer = '';
    $nesting = 1;

    // The init expression steps over the opening parenthesis.
    for ($pos++; $pos < $length && $nesting > 0; $pos++) {
        $char = $data[$pos];

        switch ($char) {
            case '(':
                $nesting++;
                $buffer .= $char;
                break;

            case ')':
                // The outermost closing parenthesis is not part of the result.
                if (--$nesting > 0) {
                    $buffer .= $char;
                }
                break;

            case '\\':
                $buffer .= $char;
                if (++$pos < $length) {
                    $buffer .= $data[$pos];
                }
                break;

            default:
                $buffer .= $char;
        }
    }

    return $buffer;
}
| 1355 | |
/**
 * Read a hex string from $data, starting at the opening "<" located at $pos.
 *
 * Returns the characters up to the next ">" with whitespace removed; the
 * digits are not validated or decoded. On return $pos is just past the
 * ">" or at the end of $data if there is none.
 */
private function extractHexString(string $data, int &$pos): string
{
    $length = strlen($data);
    $digits = '';

    // The init expression steps over the opening "<".
    for ($pos++; $pos < $length; $pos++) {
        $char = $data[$pos];
        if ($char === '>') {
            $pos++; // skip >
            break;
        }
        if (!ctype_space($char)) {
            $digits .= $char;
        }
    }

    return $digits;
}
| 1373 | } |