Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
90.08% |
118 / 131 |
|
50.00% |
3 / 6 |
CRAP | |
0.00% |
0 / 1 |
| Jbig2Filter | |
90.08% |
118 / 131 |
|
50.00% |
3 / 6 |
39.41 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| encode | |
100.00% |
32 / 32 |
|
100.00% |
1 / 1 |
4 | |||
| decode | |
82.35% |
28 / 34 |
|
0.00% |
0 / 1 |
15.08 | |||
| parseSegmentHeader | |
88.89% |
32 / 36 |
|
0.00% |
0 / 1 |
11.17 | |||
| decodeGenericRegion | |
86.36% |
19 / 22 |
|
0.00% |
0 / 1 |
7.12 | |||
| buildSegmentHeader | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\Filters; |
| 6 | |
/**
 * JBIG2Decode filter — a partial ISO 14492 / ITU-T T.88 codec.
 *
 * JBIG2 is a multi-segment format for bitonal images supporting symbol
 * dictionaries, text regions, halftone regions, and generic regions with
 * arithmetic or MMR coding. This class implements only a small subset:
 *   1. Page information segments (type 48), read for the page dimensions
 *   2. Immediate (lossless) generic regions (types 38/39) coded with MMR,
 *      which is delegated to CCITTFaxFilter in Group 4 mode
 *
 * Encoding wraps Group 4 data in three segments: page information, one
 * MMR-coded immediate lossless generic region, and end of page.
 *
 * Decoding returns the bitmap of the last generic region that was seen.
 * Region offsets and combination operators are ignored. When no MMR-coded
 * generic region can be decoded (arithmetic coding, symbol dictionaries,
 * text or halftone regions, malformed input) the input is returned
 * unchanged. No external decoder is invoked.
 *
 * PDF-embedded JBIG2 streams do NOT include the file header — they
 * contain only segment data. Global segments (symbol dictionaries)
 * are provided separately via /JBIG2Globals in /DecodeParms. A file
 * header is skipped when present, but segments are always read in
 * sequential order (header immediately followed by its data).
 *
 * @see https://www.itu.int/rec/T-REC-T.88
 */
final class Jbig2Filter implements FilterInterface
{
    private const JBIG2_FILE_SIGNATURE = "\x97\x4A\x42\x32\x0D\x0A\x1A\x0A";

    /** File header flag, bit 1: page count is unknown and its 4-byte field is absent. */
    private const FILE_FLAG_UNKNOWN_PAGE_COUNT = 0x02;

    // Segment types used by the decoder
    private const SEG_IMMEDIATE_GENERIC = 38;
    private const SEG_IMMEDIATE_GENERIC_LOSSLESS = 39;
    private const SEG_PAGE_INFO = 48;
    private const SEG_END_OF_PAGE = 49;
    private const SEG_END_OF_FILE = 51;

    /** Data-length value meaning "length not known in advance". */
    private const UNKNOWN_DATA_LENGTH = 0xFFFFFFFF;

    /** Size of a page information segment: width, height, x/y resolution, flags, striping. */
    private const PAGE_INFO_LENGTH = 19;

    /** Size of the region segment information field: width, height, x, y (4 bytes each) + 1 flag byte. */
    private const REGION_INFO_LENGTH = 17;

    /** Generic region segment flag, bit 0: region data is MMR coded. */
    private const GENERIC_FLAG_MMR = 0x01;

    /**
     * @param string $globals Optional JBIG2 global segments data (from /JBIG2Globals)
     * @param int $width Image width in pixels (required for encoding)
     * @param int $height Image height in pixels (required for encoding)
     */
    public function __construct(
        private string $globals = '',
        private int $width = 0,
        private int $height = 0,
    ) {}

    /**
     * Wrap bitonal pixel data in a minimal JBIG2 segment sequence.
     *
     * @param string $data Raw pixel data, passed unchanged to CCITTFaxFilter::encode()
     *
     * @return string Page information + MMR generic region + end-of-page segments,
     *                or '' when $data is empty
     *
     * @throws \RuntimeException When width or height was not set to a positive value
     */
    public function encode(string $data): string
    {
        if ($data === '') {
            return '';
        }

        if ($this->width <= 0 || $this->height <= 0) {
            throw new \RuntimeException('JBIG2 encoding requires width and height');
        }

        // Encode pixel data using CCITTFax Group 4 (MMR)
        $ccitt = new CCITTFaxFilter(
            k: -1,
            columns: $this->width,
            rows: $this->height,
            endOfBlock: true,
            blackIs1: true, // JBIG2 convention: 1 = black
        );
        $mmrData = $ccitt->encode($data);

        // Segment 0: Page Information (type 48, 19 bytes of data):
        // width, height, x resolution, y resolution, flags, striping.
        $pageInfo = pack('NNNN', $this->width, $this->height, 0, 0)
            . chr(0)
            . pack('n', 0);

        // Segment 1: Immediate Lossless Generic Region (type 39):
        // region info (width, height, x, y, combination operator), then the
        // ONE-byte generic region flags field, then the MMR data. No GBAT
        // bytes are present when the MMR bit is set.
        $regionData = pack('NNNN', $this->width, $this->height, 0, 0)
            . chr(0)
            . chr(self::GENERIC_FLAG_MMR)
            . $mmrData;

        // Segment 2: End of Page (type 49) carries no data.
        return $this->buildSegmentHeader(0, self::SEG_PAGE_INFO, 1, strlen($pageInfo))
            . $pageInfo
            . $this->buildSegmentHeader(1, self::SEG_IMMEDIATE_GENERIC_LOSSLESS, 1, strlen($regionData))
            . $regionData
            . $this->buildSegmentHeader(2, self::SEG_END_OF_PAGE, 1, 0);
    }

    /**
     * Decode a JBIG2 stream to raw bitonal pixel data.
     *
     * Segments are walked in order (globals first, then $data) until an
     * end-of-page or end-of-file segment, or until a header cannot be parsed.
     *
     * @param string $data JBIG2 segment data, with or without a file header
     *
     * @return string The decoded bitmap of the last generic region, or $data
     *                unchanged when nothing could be decoded
     */
    public function decode(string $data): string
    {
        if ($data === '') {
            return '';
        }

        // Prepend globals if provided
        $fullData = $this->globals . $data;
        $totalLength = strlen($fullData);

        // Standalone JBIG2 files start with a header that PDF streams omit.
        $offset = $this->fileHeaderLength($fullData);

        $pageWidth = 0;
        $pageHeight = 0;
        $pageBitmap = null;

        while ($offset < $totalLength) {
            $segment = $this->parseSegmentHeader($fullData, $offset);
            if ($segment === null) {
                break;
            }

            $segData = substr($fullData, $segment['dataOffset'], $segment['dataLength']);

            switch ($segment['type']) {
                case self::SEG_PAGE_INFO:
                    if (strlen($segData) >= self::PAGE_INFO_LENGTH) {
                        $pageWidth = unpack('N', $segData, 0)[1];
                        $pageHeight = unpack('N', $segData, 4)[1];
                        // Remaining: xRes (4), yRes (4), flags (1), striping (2)
                    }
                    break;

                case self::SEG_IMMEDIATE_GENERIC:
                case self::SEG_IMMEDIATE_GENERIC_LOSSLESS:
                    $pageBitmap = $this->decodeGenericRegion($segData, $pageWidth, $pageHeight);
                    break;

                case self::SEG_END_OF_PAGE:
                case self::SEG_END_OF_FILE:
                    break 2;
            }

            $offset = $segment['dataOffset'] + $segment['dataLength'];
        }

        // Cannot decode — return raw data (pass-through)
        return $pageBitmap ?? $data;
    }

    /**
     * Number of bytes occupied by a JBIG2 file header at the start of $data.
     *
     * The header is the 8-byte signature, one flags byte, and a 4-byte page
     * count that is present only when flag bit 1 ("unknown number of pages")
     * is clear. Flag bit 0 selects sequential vs. random-access organisation
     * and does not affect the header size.
     *
     * @return int 0 when there is no file header; strlen($data) when the
     *             signature is present but the flags byte is missing, so that
     *             the caller parses nothing
     */
    private function fileHeaderLength(string $data): int
    {
        if (!str_starts_with($data, self::JBIG2_FILE_SIGNATURE)) {
            return 0;
        }

        $flagsOffset = strlen(self::JBIG2_FILE_SIGNATURE);
        if (strlen($data) <= $flagsOffset) {
            return strlen($data);
        }

        $headerLength = $flagsOffset + 1;
        if ((ord($data[$flagsOffset]) & self::FILE_FLAG_UNKNOWN_PAGE_COUNT) === 0) {
            $headerLength += 4; // known page count
        }

        return $headerLength;
    }

    /**
     * Parse a JBIG2 segment header.
     *
     * @param string $data Buffer holding the segment
     * @param int $offset Byte position of the first header byte in $data
     *
     * @return array{number: int, type: int, pageAssoc: int, dataOffset: int, dataLength: int}|null
     *         Null when the header is truncated. dataLength never extends
     *         past the end of $data.
     */
    private function parseSegmentHeader(string $data, int $offset): ?array
    {
        $length = strlen($data);

        // Segment number (4) + flags (1) + first referred-to count byte (1)
        if ($offset + 6 > $length) {
            return null;
        }

        $segNum = unpack('N', $data, $offset)[1];
        $offset += 4;

        // Flags: bits 0-5 type, bit 6 page association size, bit 7 deferred non-retain
        $flags = ord($data[$offset]);
        $segType = $flags & 0x3F;
        $pageAssocSize = ($flags & 0x40) !== 0 ? 4 : 1;
        $offset += 1;

        // Referred-to segment count and retention flags
        $refCount = (ord($data[$offset]) >> 5) & 0x07;
        if ($refCount === 7) {
            // Long form: the 4-byte field STARTS at this byte. Its low 29 bits
            // hold the count and it is followed by ceil((count + 1) / 8)
            // bytes of retain bits.
            if ($offset + 4 > $length) {
                return null;
            }
            $refCount = unpack('N', $data, $offset)[1] & 0x1FFFFFFF;
            $offset += 4 + intdiv($refCount + 8, 8);
        } else {
            // Short form: count and retain bits share this single byte.
            $offset += 1;
        }

        // Skip referred-to segment numbers; their width depends on this
        // segment's own number.
        $refNumSize = ($segNum <= 256) ? 1 : (($segNum <= 65536) ? 2 : 4);
        $offset += $refCount * $refNumSize;

        // Page association (1 or 4 bytes) followed by data length (4 bytes)
        if ($offset + $pageAssocSize + 4 > $length) {
            return null;
        }
        $pageAssoc = ($pageAssocSize === 4) ? unpack('N', $data, $offset)[1] : ord($data[$offset]);
        $offset += $pageAssocSize;

        $dataLength = unpack('N', $data, $offset)[1];
        $offset += 4;

        // An unknown length (0xFFFFFFFF) is treated as "everything that is
        // left"; a declared length is clamped to what is actually available.
        $remaining = $length - $offset;
        if ($dataLength === self::UNKNOWN_DATA_LENGTH || $dataLength > $remaining) {
            $dataLength = $remaining;
        }

        return [
            'number' => $segNum,
            'type' => $segType,
            'pageAssoc' => $pageAssoc,
            'dataOffset' => $offset,
            'dataLength' => $dataLength,
        ];
    }

    /**
     * Decode a generic region segment.
     *
     * Only MMR-coded regions are handled; the MMR payload is decoded as
     * CCITT Group 4 data by CCITTFaxFilter.
     *
     * @param string $data Segment data, starting at the region segment information field
     * @param int $pageWidth Page width used when the region declares width 0
     * @param int $pageHeight Page height used when the region declares height 0
     *
     * @return string|null Decoded bitmap, or null for truncated or arithmetic-coded regions
     */
    private function decodeGenericRegion(string $data, int $pageWidth, int $pageHeight): ?string
    {
        // Region segment information (17 bytes) + generic region flags (1 byte)
        if (strlen($data) < self::REGION_INFO_LENGTH + 1) {
            return null;
        }

        // Region segment information field (ISO 14492 §7.4.1):
        // width (4) + height (4) + x offset (4) + y offset (4) + flags (1)
        $regionWidth = unpack('N', $data, 0)[1];
        $regionHeight = unpack('N', $data, 4)[1];

        // Generic region segment flags: ONE byte, bit 0 = MMR coding.
        $flags = ord($data[self::REGION_INFO_LENGTH]);
        $payloadOffset = self::REGION_INFO_LENGTH + 1;

        if (($flags & self::GENERIC_FLAG_MMR) === 0) {
            // Earlier versions of encode() wrote this field as two bytes
            // (00 01). Accept that layout so previously encoded data still
            // decodes. NOTE(review): a conforming arithmetic-coded region
            // whose first adaptive-template byte is 0x01 would also match.
            $isLegacyLayout = $flags === 0x00 && ($data[$payloadOffset] ?? '') === "\x01";
            if (!$isLegacyLayout) {
                // Arithmetic coding is not implemented.
                return null;
            }
            $payloadOffset += 1;
        }

        // MMR-coded regions carry no adaptive template (GBAT) bytes, so the
        // Group 4 data starts right after the flags.
        $mmrData = substr($data, $payloadOffset);
        $width = $regionWidth > 0 ? $regionWidth : ($pageWidth > 0 ? $pageWidth : 1);
        $height = $regionHeight > 0 ? $regionHeight : ($pageHeight > 0 ? $pageHeight : 0);

        $ccitt = new CCITTFaxFilter(
            k: -1, // Group 4
            columns: $width,
            rows: $height,
            endOfBlock: true,
            blackIs1: true, // JBIG2 convention: 1 = black
        );

        return $ccitt->decode($mmrData);
    }

    /**
     * Build a JBIG2 segment header with no referred-to segments and a
     * one-byte page association.
     *
     * @param int $segNum Segment number (4 bytes, big-endian)
     * @param int $type Segment type, stored in the low 6 bits of the flags byte
     * @param int $pageAssoc Page the segment belongs to (must fit in one byte)
     * @param int $dataLength Length in bytes of the segment data that follows
     *
     * @return string The 11-byte header
     */
    private function buildSegmentHeader(int $segNum, int $type, int $pageAssoc, int $dataLength): string
    {
        // N = segment number, C = flags/type, C = referred-to count (0),
        // C = page association, N = data length.
        return pack('NCCCN', $segNum, $type, 0, $pageAssoc, $dataLength);
    }
}