Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
93.65% |
59 / 63 |
|
80.00% |
4 / 5 |
CRAP | |
0.00% |
0 / 1 |
| Jbig2Parser | |
93.65% |
59 / 63 |
|
80.00% |
4 / 5 |
22.12 | |
0.00% |
0 / 1 |
| parseFile | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| parse | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
| parseFileFormat | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
3 | |||
| parseSegments | |
90.70% |
39 / 43 |
|
0.00% |
0 / 1 |
14.16 | |||
| readUint32 | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\ImageMetadata; |
| 6 | |
| 7 | use Phpdftk\Filesystem\LocalFilesystem; |
| 8 | |
/**
 * Parse JBIG2 image headers.
 *
 * JBIG2 is a bi-level (1-bit) image compression format used for
 * scanned documents. A JBIG2 bitstream can be organised in three ways:
 * - Sequential: file header, then each segment header immediately
 *   followed by its data
 * - Random-access: file header, then all segment headers (ending with
 *   the end-of-file segment), then all segment data parts in the same order
 * - Embedded: segments only (used inside PDF streams — no file header)
 *
 * Input starting with the standard 8-byte file header
 * (0x974A4232 0D0A1A0A) is parsed as a standalone file; anything else
 * is treated as an embedded segment stream.
 *
 * Page dimensions come from the Page Information segment (type 48).
 * Only the first 4096 bytes of a file and at most 100 segment headers
 * are inspected.
 */
final class Jbig2Parser
{
    /** JBIG2 file header signature */
    private const SIGNATURE = "\x97\x4A\x42\x32\x0D\x0A\x1A\x0A";

    /** Number of leading bytes of a file that are inspected */
    private const MAX_READ_BYTES = 4096;

    /** Safety limit on the number of segment headers walked */
    private const MAX_SEGMENTS = 100;

    /** Segment type carrying the page width and height */
    private const SEGMENT_PAGE_INFORMATION = 48;

    /** Segment type that terminates the header list of a random-access file */
    private const SEGMENT_END_OF_FILE = 51;

    /** Data length value meaning "length unknown" */
    private const UNKNOWN_DATA_LENGTH = 0xFFFFFFFF;

    /**
     * Read the start of a JBIG2 file and extract its page dimensions.
     *
     * @throws \RuntimeException if the file cannot be read or no page
     *                           dimensions are found in the inspected bytes
     */
    public static function parseFile(string $path): ImageInfo
    {
        $fh = LocalFilesystem::openReadable($path, "image file");
        try {
            // Read enough for header + first few segments. A bounded
            // stream read copes with empty files (fread() rejects a zero
            // length) and does not depend on filesize() succeeding.
            $data = stream_get_contents($fh, self::MAX_READ_BYTES);
        } finally {
            fclose($fh);
        }

        if ($data === false) {
            throw new \RuntimeException('JBIG2: unable to read image file');
        }

        return self::parse($data);
    }

    /**
     * Extract page dimensions from raw JBIG2 bytes.
     *
     * @param string $data standalone file contents or an embedded segment stream
     *
     * @throws \RuntimeException if no usable Page Information segment is found
     */
    public static function parse(string $data): ImageInfo
    {
        $len = strlen($data);

        if (str_starts_with($data, self::SIGNATURE)) {
            return self::parseFileFormat($data, $len);
        }

        // Try parsing as embedded format (segment stream without file header).
        // The first bytes should be a segment header.
        return self::parseSegments($data, 0, $len);
    }

    /**
     * Parse JBIG2 file format with standard header.
     *
     * Header layout (ISO 14492):
     *   bytes 0-7:  signature (97 4A 42 32 0D 0A 1A 0A)
     *   byte 8:     flags (bit 0 = sequential org, bit 1 = unknown page count)
     *   bytes 9-12: number of pages (only if bit 1 of flags is 0)
     *
     * @throws \RuntimeException if the flags byte is missing or no page
     *                           dimensions are found
     */
    private static function parseFileFormat(string $data, int $len): ImageInfo
    {
        if ($len < 9) {
            throw new \RuntimeException('JBIG2: file too short');
        }

        $flags = ord($data[8]);
        $randomAccess = ($flags & 0x01) === 0;
        $knownPageCount = ($flags & 0x02) === 0;

        // Segments start after the flags byte, or after the 4-byte page
        // count when one is present.
        $pos = $knownPageCount ? 13 : 9;

        return self::parseSegments($data, $pos, $len, $randomAccess);
    }

    /**
     * Walk segments looking for a Page Information segment (type 48)
     * which contains the page width and height.
     *
     * In sequential/embedded layout every header is directly followed by
     * its data. In random-access layout all headers come first; the data
     * parts follow the end-of-file segment header in the same order, so
     * the page information data is located by summing the data lengths
     * of the segments that precede it.
     *
     * @param int  $pos          offset of the first segment header
     * @param int  $len          length of $data in bytes
     * @param bool $randomAccess true when the file header declares
     *                           random-access organisation
     *
     * @throws \RuntimeException if no usable Page Information segment is found
     */
    private static function parseSegments(string $data, int $pos, int $len, bool $randomAccess = false): ImageInfo
    {
        // Random-access only: where the page information data sits
        // relative to the start of the data section, once known.
        $pageInfoDataOffset = null;
        $precedingDataBytes = 0;

        for ($i = 0; $i < self::MAX_SEGMENTS; $i++) {
            $header = self::readSegmentHeader($data, $pos, $len);
            if ($header === null) {
                break;
            }
            [$segType, $dataLen, $pos] = $header;

            if (!$randomAccess) {
                if ($segType === self::SEGMENT_PAGE_INFORMATION && $pos + 8 <= $len) {
                    return self::readPageInformation($data, $pos);
                }

                // Unknown length, or data running past the buffer: there
                // is no way to reach the next header.
                if ($dataLen === self::UNKNOWN_DATA_LENGTH || $dataLen > $len - $pos) {
                    break;
                }
                $pos += $dataLen;
                continue;
            }

            if ($pageInfoDataOffset === null) {
                if ($segType === self::SEGMENT_PAGE_INFORMATION) {
                    $pageInfoDataOffset = $precedingDataBytes;
                } elseif ($dataLen === self::UNKNOWN_DATA_LENGTH) {
                    break;
                } else {
                    $precedingDataBytes += $dataLen;
                }
            }

            if ($segType === self::SEGMENT_END_OF_FILE) {
                // $pos is now the start of the data section.
                if ($pageInfoDataOffset !== null && $pageInfoDataOffset + 8 <= $len - $pos) {
                    return self::readPageInformation($data, $pos + $pageInfoDataOffset);
                }
                break;
            }
        }

        throw new \RuntimeException('JBIG2: unable to find page dimensions');
    }

    /**
     * Decode one segment header.
     *
     * Segment header:
     *   4 bytes:       segment number
     *   1 byte:        flags (bits 0-5 = type, bit 6 = page association size, bit 7 = deferred)
     *   1 byte, or 4 + ceil((count + 1) / 8) bytes:
     *                  referred-to segment count and retention flags
     *   count * (1, 2 or 4) bytes: referred-to segment numbers
     *   1 or 4 bytes:  page association
     *   4 bytes:       data length
     *
     * @return array{int, int, int}|null segment type, data length and the
     *                                   offset just past the header, or null
     *                                   when the header is truncated
     */
    private static function readSegmentHeader(string $data, int $pos, int $len): ?array
    {
        // Segment number, flags and the first count/retention byte.
        if ($pos + 6 > $len) {
            return null;
        }

        $segNum = self::readUint32($data, $pos);
        $segFlags = ord($data[$pos + 4]);
        $countFieldPos = $pos + 5;

        // Short form: the count lives in bits 5-7 of a single byte.
        $refCount = (ord($data[$countFieldPos]) >> 5) & 0x07;
        $pos = $countFieldPos + 1;

        if ($refCount === 7) {
            // Long form: the 4-byte field starting at this same byte holds
            // the marker in its top 3 bits and the count in the low 29
            // bits, followed by one retention bit per referred-to segment
            // plus one for this segment.
            if ($countFieldPos + 4 > $len) {
                return null;
            }
            $refCount = self::readUint32($data, $countFieldPos) & 0x1FFFFFFF;
            $pos = $countFieldPos + 4 + intdiv($refCount + 8, 8);
        }

        // A header cannot list more references than there are bytes left;
        // rejecting early also keeps the offset arithmetic small.
        if ($refCount > $len - $pos) {
            return null;
        }

        // Skip referred-to segment numbers; their width depends on this
        // segment's own number.
        $refSize = ($segNum <= 256) ? 1 : (($segNum <= 65536) ? 2 : 4);
        $pos += $refCount * $refSize;

        // Page association
        $pageAssocSize = ($segFlags & 0x40) !== 0 ? 4 : 1;
        $pos += $pageAssocSize;

        // Data length
        if ($pos + 4 > $len) {
            return null;
        }
        $dataLen = self::readUint32($data, $pos);

        return [$segFlags & 0x3F, $dataLen, $pos + 4];
    }

    /**
     * Build the result from Page Information segment data, which starts
     * with the 4-byte width followed by the 4-byte height. Both values
     * are passed through as read (the format uses a height of 0xFFFFFFFF
     * for pages whose height is not known up front).
     *
     * The caller must ensure 8 bytes are available at $pos.
     */
    private static function readPageInformation(string $data, int $pos): ImageInfo
    {
        // JBIG2 is always 1-bit bi-level
        return new ImageInfo(
            width: self::readUint32($data, $pos),
            height: self::readUint32($data, $pos + 4),
            colorSpace: 'DeviceGray',
            bitsPerComponent: 1,
            format: 'jbig2',
            hasAlpha: false,
        );
    }

    /**
     * Read a big-endian unsigned 32-bit integer.
     *
     * The caller must ensure 4 bytes are available at $offset.
     */
    private static function readUint32(string $data, int $offset): int
    {
        return (ord($data[$offset]) << 24)
            | (ord($data[$offset + 1]) << 16)
            | (ord($data[$offset + 2]) << 8)
            | ord($data[$offset + 3]);
    }
}