Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
88.24% |
30 / 34 |
|
33.33% |
1 / 3 |
CRAP | |
0.00% |
0 / 1 |
| WinAnsiEncoder | |
88.24% |
30 / 34 |
|
33.33% |
1 / 3 |
14.32 | |
0.00% |
0 / 1 |
| encode | |
83.33% |
15 / 18 |
|
0.00% |
0 / 1 |
7.23 | |||
| getMissingCodepoints | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| map | |
93.33% |
14 / 15 |
|
0.00% |
0 / 1 |
6.01 | |||
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\Encoding; |
| 6 | |
/**
 * Encodes UTF-8 strings to WinAnsi (single-byte, ISO 8859-1 + Microsoft
 * additions in 0x80–0x9F) for use with Type1 standard fonts and any other
 * font whose /Encoding is WinAnsiEncoding.
 *
 * Input that cannot be represented is replaced by '?' and recorded, see
 * {@see WinAnsiEncoder::getMissingCodepoints()}.
 */
final class WinAnsiEncoder implements TextEncoder
{
    /** @var array<int, int>|null Codepoint → WinAnsi byte. Built lazily once per process. */
    private static ?array $codepointToByte = null;

    /**
     * Code points that encode() could not represent, in encounter order.
     * -1 marks a character whose code point could not be determined.
     * The list grows across encode() calls; nothing in this class resets it.
     *
     * @var list<int>
     */
    private array $missing = [];

    /**
     * Convert a UTF-8 string to its WinAnsi byte string.
     *
     * Every code point that has a WinAnsi byte is emitted as that byte.
     * C0 control characters (U+0000–U+001F) and DEL (U+007F) are passed
     * through unchanged. Anything else is replaced by '?' and appended to
     * the missing-codepoint list.
     *
     * @param string $utf8 Text to encode.
     *
     * @return string One output byte per input code point.
     */
    public function encode(string $utf8): string
    {
        $map = self::map();
        $out = '';
        // mb_str_split() with a length of 1 yields one Unicode code point per
        // element (not one grapheme cluster). WinAnsi is a single-byte
        // encoding, so a per-codepoint split is the right unit here.
        foreach (mb_str_split($utf8, 1, 'UTF-8') as $char) {
            $cp = mb_ord($char, 'UTF-8');
            if ($cp === false) {
                // No usable code point: record the -1 sentinel.
                $this->missing[] = -1;
                $out .= '?';
                continue;
            }

            $byte = $map[$cp] ?? null;
            if ($byte !== null) {
                $out .= chr($byte);
                continue;
            }

            // Pass C0 control characters and DEL through unchanged. PDF
            // content streams may contain literal whitespace such as \n or
            // \t; the reverse map has no entry for them, yet each is a
            // single byte in UTF-8 with the same value as its code point.
            //
            // C1 controls (U+0080–U+009F) are deliberately NOT passed
            // through: WinAnsi assigns printable Microsoft additions to
            // bytes 0x80–0x9F, so emitting chr($cp) would silently turn a
            // control character into an unrelated glyph.
            if ($cp < 0x20 || $cp === 0x7F) {
                $out .= chr($cp);
                continue;
            }

            $this->missing[] = $cp;
            $out .= '?';
        }

        return $out;
    }

    /**
     * Return the code points that encode() replaced with '?'.
     *
     * @return list<int> Code points in encounter order, duplicates included;
     *                   -1 stands for a character with no usable code point.
     */
    public function getMissingCodepoints(): array
    {
        return $this->missing;
    }

    /**
     * Build the reverse WinAnsi map (codepoint → byte) from the forward
     * byte → glyph-name table and the Adobe Glyph List.
     *
     * The result is cached in a static property, so the tables are walked
     * at most once per process.
     *
     * @return array<int, int>
     */
    private static function map(): array
    {
        if (self::$codepointToByte !== null) {
            return self::$codepointToByte;
        }

        $reverse = [];
        foreach (WinAnsiTable::getTable() as $byte => $glyph) {
            if ($glyph === '.notdef') {
                continue;
            }
            $cp = GlyphList::glyphToUnicode($glyph);
            if ($cp === null || isset($reverse[$cp])) {
                // Unknown glyph name, or the code point already has a byte.
                // First mapping wins — WinAnsi has 0xA0 and 0x20 both glyphed
                // as 'space', and we want 0x20 to be the canonical encoding
                // of U+0020.
                continue;
            }
            $reverse[$cp] = $byte;
        }

        // /Encoding /WinAnsiEncoding implies bytes 32-255 are mapped, with
        // /hyphen at both 0x2D and 0xAD per the spec. Keep the canonical
        // ASCII byte for U+002D.
        $reverse[0x2D] = 0x2D;
        // Soft hyphen U+00AD also maps to 0xAD.
        $reverse[0x00AD] = 0xAD;

        self::$codepointToByte = $reverse;

        return $reverse;
    }
}