Code Coverage

| | Lines | Lines (covered / total) | Functions and Methods | Functions and Methods (covered / total) | CRAP | Classes and Traits | Classes and Traits (covered / total) |
|---|---|---|---|---|---|---|---|
| Total | 100.00% | 80 / 80 | 100.00% | 3 / 3 | | 100.00% | 1 / 1 |
| PdfDocEncodingTable | 100.00% | 80 / 80 | 100.00% | 3 / 3 | 15 | 100.00% | 1 / 1 |
| getTable | 100.00% | 66 / 66 | 100.00% | 1 / 1 | 4 | | |
| decode | 100.00% | 9 / 9 | 100.00% | 1 / 1 | 3 | | |
| decodeTextString | 100.00% | 5 / 5 | 100.00% | 1 / 1 | 8 | | |
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\Encoding; |
| 6 | |
/**
 * PDFDocEncoding — the encoding for PDF text strings (Info dict, bookmarks, annotations)
 * when they don't start with the UTF-16BE BOM (U+FEFF / 0xFE 0xFF).
 * Per PDF spec ISO 32000-2:2020, Table D.2.
 *
 * Maps byte values 0-255 directly to Unicode code points (not glyph names).
 */
final class PdfDocEncodingTable
{
    /**
     * Return the PDFDocEncoding lookup table.
     *
     * The table is built on first use and cached for the rest of the request,
     * so repeated calls (decode() calls this once per string) do not rebuild
     * all 256 entries. Keys are always 0..255 in ascending order.
     *
     * @return array<int, int|null> byte value (0-255) to Unicode code point, null = undefined
     */
    public static function getTable(): array
    {
        /** @var array<int, int|null>|null $cached */
        static $cached = null;

        if ($cached !== null) {
            return $cached;
        }

        // Start with every byte undefined; defined ranges are filled in below.
        // This covers 0-7, 14-23, 127 and 159, which stay null.
        $map = array_fill(0, 256, null);

        // 8-13: defined control codes (BACKSPACE, HORIZONTAL TAB, LINE FEED,
        // VERTICAL TAB, FORM FEED, CARRIAGE RETURN) map to themselves.
        for ($byte = 0x08; $byte <= 0x0D; $byte++) {
            $map[$byte] = $byte;
        }

        // 32-126: same as Unicode (ASCII).
        for ($byte = 32; $byte <= 126; $byte++) {
            $map[$byte] = $byte;
        }

        // 161-255: same as Unicode/ISO 8859-1 (Latin-1).
        // NOTE(review): this keeps byte 173 (0xAD) mapped to U+00AD SOFT HYPHEN,
        // exactly as before. Some PDFDocEncoding references list 0xAD as
        // undefined — confirm against ISO 32000 Table D.2 before changing it,
        // since callers may rely on the current mapping.
        for ($byte = 161; $byte <= 255; $byte++) {
            $map[$byte] = $byte;
        }

        // Bytes whose code point differs from the byte value.
        $overrides = [
            // 24-31: spacing diacritics
            24 => 0x02D8, // BREVE
            25 => 0x02C7, // CARON
            26 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT
            27 => 0x02D9, // DOT ABOVE
            28 => 0x02DD, // DOUBLE ACUTE ACCENT
            29 => 0x02DB, // OGONEK
            30 => 0x02DA, // RING ABOVE
            31 => 0x02DC, // SMALL TILDE

            // 128-158: punctuation, ligatures and extended Latin letters
            128 => 0x2022, // BULLET
            129 => 0x2020, // DAGGER
            130 => 0x2021, // DOUBLE DAGGER
            131 => 0x2026, // HORIZONTAL ELLIPSIS
            132 => 0x2014, // EM DASH
            133 => 0x2013, // EN DASH
            134 => 0x0192, // LATIN SMALL LETTER F WITH HOOK
            135 => 0x2044, // FRACTION SLASH
            136 => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
            137 => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
            138 => 0x2212, // MINUS SIGN
            139 => 0x2030, // PER MILLE SIGN
            140 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK
            141 => 0x201C, // LEFT DOUBLE QUOTATION MARK
            142 => 0x201D, // RIGHT DOUBLE QUOTATION MARK
            143 => 0x2018, // LEFT SINGLE QUOTATION MARK
            144 => 0x2019, // RIGHT SINGLE QUOTATION MARK
            145 => 0x201A, // SINGLE LOW-9 QUOTATION MARK
            146 => 0x2122, // TRADE MARK SIGN
            147 => 0xFB01, // LATIN SMALL LIGATURE FI
            148 => 0xFB02, // LATIN SMALL LIGATURE FL
            149 => 0x0141, // LATIN CAPITAL LETTER L WITH STROKE
            150 => 0x0152, // LATIN CAPITAL LIGATURE OE
            151 => 0x0160, // LATIN CAPITAL LETTER S WITH CARON
            152 => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS
            153 => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON
            154 => 0x0131, // LATIN SMALL LETTER DOTLESS I
            155 => 0x0142, // LATIN SMALL LETTER L WITH STROKE
            156 => 0x0153, // LATIN SMALL LIGATURE OE
            157 => 0x0161, // LATIN SMALL LETTER S WITH CARON
            158 => 0x017E, // LATIN SMALL LETTER Z WITH CARON

            // 160: EURO SIGN (PDF 1.7+)
            160 => 0x20AC, // EURO SIGN
        ];

        // Every override key already exists in $map, so assigning here does
        // not disturb the ascending 0..255 key order.
        foreach ($overrides as $byte => $codePoint) {
            $map[$byte] = $codePoint;
        }

        // Publish only the fully built table.
        $cached = $map;

        return $cached;
    }

    /**
     * Decode a PDFDocEncoding byte string to a UTF-8 string.
     *
     * PDF text strings use either PDFDocEncoding (single-byte) or UTF-16BE
     * (indicated by a BOM prefix 0xFE 0xFF). This method handles only the
     * PDFDocEncoding case: every byte is looked up in getTable() and bytes
     * with no mapping (null) are silently dropped from the output.
     *
     * @param string $bytes raw PDFDocEncoding bytes
     *
     * @return string the decoded text as UTF-8
     */
    public static function decode(string $bytes): string
    {
        $table = self::getTable();
        $decoded = '';
        $length = strlen($bytes);

        for ($offset = 0; $offset < $length; $offset++) {
            $codePoint = $table[ord($bytes[$offset])];

            // Undefined code points are skipped.
            if ($codePoint === null) {
                continue;
            }

            $decoded .= mb_chr($codePoint, 'UTF-8');
        }

        return $decoded;
    }

    /**
     * Decode a PDF text string — auto-detects UTF-16BE (BOM) vs PDFDocEncoding.
     *
     * Detection order:
     *  1. UTF-16BE BOM (0xFE 0xFF): the remainder is converted to UTF-8.
     *  2. UTF-8 BOM (0xEF 0xBB 0xBF, PDF 2.0): the remainder is returned as-is.
     *  3. Otherwise the bytes are treated as PDFDocEncoding.
     *
     * @param string $bytes raw bytes of the PDF text string
     *
     * @return string the decoded text as UTF-8
     */
    public static function decodeTextString(string $bytes): string
    {
        // strncmp() only reports equality when $bytes actually has the full
        // BOM as its prefix, so strings shorter than the BOM fall through.
        if (strncmp($bytes, "\xFE\xFF", 2) === 0) {
            return mb_convert_encoding(substr($bytes, 2), 'UTF-8', 'UTF-16BE');
        }

        // UTF-8 BOM (PDF 2.0): strip the marker, the payload is already UTF-8.
        if (strncmp($bytes, "\xEF\xBB\xBF", 3) === 0) {
            return substr($bytes, 3);
        }

        return self::decode($bytes);
    }
}