Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
CRAP | |
100.00% |
1 / 1 |
| ObjectScanner | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
3 | |
100.00% |
1 / 1 |
| scan | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
3 | |||
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\Pdf\Reader\Parser; |
| 6 | |
/**
 * Scans raw PDF bytes for indirect object definitions to reconstruct
 * a cross-reference table when the normal xref is corrupted.
 */
final class ObjectScanner
{
    /**
     * Regex for an `N M obj` object header.
     *
     * - `(?<![0-9])` rejects a match that would start in the middle of a
     *   longer number; the beginning of the input also satisfies it.
     * - The separator class lists the six whitespace bytes of the PDF
     *   syntax: NUL, TAB, LF, FF, CR and SPACE.
     * - Nothing is required after `obj`, so headers glued to the next
     *   token (e.g. `0 0 objParams`) are still recognised.
     */
    private const OBJECT_HEADER_PATTERN = '/(?<![0-9])(\d+)[\x00\t\n\f\r ]+(\d+)[\x00\t\n\f\r ]+obj/';

    /**
     * Scan the PDF bytes for all `N M obj` patterns.
     *
     * Tolerates malformed object headers (no trailing whitespace after
     * `obj`, e.g. `0 0 objParams`). Validates that the digits are
     * preceded by a non-digit byte (or BOF) so we don't match the
     * tail of a longer number. Any run of PDF whitespace bytes
     * (NUL, TAB, LF, FF, CR, SPACE) is accepted between the tokens.
     *
     * When an object number occurs more than once, the offset of the
     * last occurrence is kept. The generation number is matched but
     * not returned.
     *
     * @param string $data Raw bytes to scan.
     *
     * @return array<int, int> objectNumber => byte offset of the first
     *                         digit of the header, sorted by object number
     */
    public static function scan(string $data): array
    {
        $map = [];

        // preg_match_all() returns 0 (no match) or false (PCRE error); in
        // both cases the map simply stays empty.
        if (preg_match_all(self::OBJECT_HEADER_PATTERN, $data, $matches, PREG_OFFSET_CAPTURE)) {
            foreach ($matches[0] as $i => $header) {
                $objNum = (int) $matches[1][$i][0];

                // Matches arrive in ascending offset order, so overwriting
                // keeps the LAST occurrence (latest revision wins).
                // With PREG_OFFSET_CAPTURE, $header[1] is already an int offset.
                $map[$objNum] = $header[1];
            }
        }

        ksort($map);

        return $map;
    }
}