Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
89.19% |
165 / 185 |
|
27.27% |
3 / 11 |
CRAP | |
0.00% |
0 / 1 |
| Type1Parser | |
89.19% |
165 / 185 |
|
27.27% |
3 / 11 |
92.92 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| fromBytes | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
2.03 | |||
| parse | |
97.50% |
39 / 40 |
|
0.00% |
0 / 1 |
7 | |||
| parsePfb | |
100.00% |
27 / 27 |
|
100.00% |
1 / 1 |
7 | |||
| parsePfa | |
78.12% |
25 / 32 |
|
0.00% |
0 / 1 |
20.03 | |||
| parseAsciiHeader | |
95.12% |
39 / 41 |
|
0.00% |
0 / 1 |
17 | |||
| parseEncoding | |
85.71% |
12 / 14 |
|
0.00% |
0 / 1 |
8.19 | |||
| parseCharStringNames | |
50.00% |
3 / 6 |
|
0.00% |
0 / 1 |
10.50 | |||
| parseGlyphWidths | |
50.00% |
3 / 6 |
|
0.00% |
0 / 1 |
6.00 | |||
| buildPfbBytes | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| buildFlags | |
91.67% |
11 / 12 |
|
0.00% |
0 / 1 |
14.11 | |||
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\FontParser; |
| 6 | |
| 7 | use Phpdftk\Filesystem\LocalFilesystem; |
| 8 | use Phpdftk\Encoding\GlyphList; |
| 9 | use Phpdftk\Encoding\StandardEncodingTable; |
| 10 | |
/**
 * Parses Type 1 font files (PFB binary and PFA ASCII formats).
 *
 * Extracts font metrics, encoding, glyph widths, and segment lengths
 * needed for PDF embedding via Type1FontFile.
 */
class Type1Parser
{
    /** Number of ASCII zeros in the standard PFA trailer that precedes "cleartomark". */
    private const TRAILER_ZERO_COUNT = 512;

    /** Characters allowed in the hex-encoded eexec section once whitespace is removed. */
    private const HEX_DIGITS = '0123456789abcdefABCDEF';

    /** Raw font program supplied through fromBytes(); null when the font is read from $path. */
    private ?string $bytes = null;

    /**
     * @param string $path Path of the PFB/PFA file that parse() reads.
     */
    public function __construct(private readonly string $path) {}

    /**
     * Create a parser from raw font bytes instead of a file path.
     *
     * The bytes are kept in memory and handed straight to parse(), so no
     * temporary file is created (and therefore none is left behind).
     */
    public static function fromBytes(string $fontBytes): self
    {
        $parser = new self('');
        $parser->bytes = $fontBytes;

        return $parser;
    }

    /**
     * Parse the font and collect everything needed to build a Type1Data.
     *
     * The input is treated as PFB when it is at least two bytes long and
     * starts with the 0x80 segment marker; anything else is parsed as PFA.
     *
     * @throws \RuntimeException When the PFB segments are truncated or the PFA has no eexec marker.
     */
    public function parse(): Type1Data
    {
        $raw = $this->bytes ?? LocalFilesystem::readFile($this->path, "font file");

        // Detect format and extract the three segments plus their lengths.
        $isPfb = strlen($raw) >= 2 && ord($raw[0]) === 0x80;
        [$asciiSegment, $binarySegment, $trailerSegment, $length1, $length2, $length3] = $isPfb
            ? $this->parsePfb($raw)
            : $this->parsePfa($raw);

        // Metrics and encoding both live in the clear-text header.
        $metrics = $this->parseAsciiHeader($asciiSegment);
        $encoding = $this->parseEncoding($asciiSegment);

        // Widths are normally inside the encrypted charstrings; this only
        // picks up what the header declares through a /Metrics dictionary.
        $glyphWidths = $this->parseGlyphWidths($asciiSegment);

        // Project glyph-name keyed data onto character codes via the encoding.
        $glyphList = GlyphList::getList();
        $charWidths = [];
        $unicodeMap = [];
        foreach ($encoding as $code => $glyphName) {
            if ($glyphName === '.notdef') {
                continue;
            }
            if (isset($glyphWidths[$glyphName])) {
                $charWidths[$code] = $glyphWidths[$glyphName];
            }
            if (isset($glyphList[$glyphName])) {
                $unicodeMap[$code] = $glyphList[$glyphName];
            }
        }

        return new Type1Data(
            postScriptName: $metrics['fontName'],
            familyName: $metrics['familyName'],
            ascent: $metrics['ascent'],
            descent: $metrics['descent'],
            capHeight: $metrics['capHeight'],
            xHeight: $metrics['xHeight'],
            italicAngle: $metrics['italicAngle'],
            stemV: $metrics['stemV'],
            flags: $this->buildFlags($metrics),
            fontBBox: $metrics['fontBBox'],
            charWidths: $charWidths,
            unicodeMap: $unicodeMap,
            fontBytes: $this->buildPfbBytes($asciiSegment, $binarySegment, $trailerSegment),
            length1: $length1,
            length2: $length2,
            length3: $length3,
            glyphWidths: $glyphWidths,
            encoding: $encoding,
        );
    }

    /**
     * Parse PFB (Printer Font Binary) format.
     *
     * PFB files consist of segments, each with a 6-byte header:
     *   byte 0:    0x80 (start marker)
     *   byte 1:    segment type (1=ASCII, 2=binary, 3=EOF)
     *   bytes 2-5: segment length (little-endian uint32)
     *
     * ASCII segments seen before any binary data form the header; ASCII
     * segments seen afterwards form the trailer. Segments of any other type
     * are skipped. Parsing stops at the EOF segment, at the end of the data,
     * or at the first byte that is not a segment marker.
     *
     * @return array{string, string, string, int, int, int} ascii, binary, trailer, length1, length2, length3
     *
     * @throws \RuntimeException When a segment header or its payload is cut short.
     */
    private function parsePfb(string $data): array
    {
        $ascii = '';
        $binary = '';
        $trailer = '';
        $total = strlen($data);
        $offset = 0;

        while ($offset < $total && $data[$offset] === "\x80") {
            if ($offset + 1 >= $total) {
                throw new \RuntimeException('Invalid PFB: truncated segment header');
            }

            $type = ord($data[$offset + 1]);
            if ($type === 3) {
                // EOF marker: only two bytes long, nothing follows.
                break;
            }

            if ($offset + 6 > $total) {
                throw new \RuntimeException('Invalid PFB: truncated segment header');
            }
            $header = unpack('Vlength', substr($data, $offset + 2, 4));
            if ($header === false) {
                throw new \RuntimeException('Invalid PFB: unreadable segment length');
            }
            $segLen = (int) $header['length'];

            // Refuse payloads that run past the end of the data; otherwise the
            // reported Length1/2/3 would not match the bytes actually extracted.
            if ($segLen < 0 || $segLen > $total - $offset - 6) {
                throw new \RuntimeException('Invalid PFB: segment length exceeds available data');
            }

            $segData = substr($data, $offset + 6, $segLen);
            $offset += 6 + $segLen;

            if ($type === 1) {
                if ($binary === '') {
                    $ascii .= $segData;
                } else {
                    $trailer .= $segData;
                }
            } elseif ($type === 2) {
                $binary .= $segData;
            }
        }

        return [$ascii, $binary, $trailer, strlen($ascii), strlen($binary), strlen($trailer)];
    }

    /**
     * Parse PFA (Printer Font ASCII) format.
     *
     * PFA files are plain text. The binary segment is hex-encoded between
     * "eexec" and the trailer (512 ASCII zeros followed by "cleartomark").
     * The ASCII segment runs up to and including "eexec" plus one following
     * line break or space. When the hex section cannot be decoded the binary
     * segment is returned empty.
     *
     * @return array{string, string, string, int, int, int} ascii, binary, trailer, length1, length2, length3
     *
     * @throws \RuntimeException When no "eexec" marker is present.
     */
    private function parsePfa(string $data): array
    {
        $eexecPos = strpos($data, 'eexec');
        if ($eexecPos === false) {
            throw new \RuntimeException('Invalid PFA: no eexec marker found');
        }

        // Skip a single whitespace character after "eexec" (CRLF counts as one).
        $dataLen = strlen($data);
        $cipherStart = $eexecPos + 5;
        if ($cipherStart < $dataLen && in_array($data[$cipherStart], ["\n", "\r", ' '], true)) {
            $cipherStart++;
            if ($cipherStart < $dataLen && $data[$cipherStart - 1] === "\r" && $data[$cipherStart] === "\n") {
                $cipherStart++;
            }
        }

        $asciiSegment = substr($data, 0, $cipherStart);
        $remaining = substr($data, $cipherStart);

        // Split the remainder into hex-encoded data and the clear-text trailer.
        $hexPart = $remaining;
        $trailerPart = '';
        $markPos = strrpos($remaining, 'cleartomark');
        if ($markPos !== false) {
            $split = $this->findTrailerStart($remaining, $markPos);
            $hexPart = substr($remaining, 0, $split);
            $trailerPart = substr($remaining, $split);
        } elseif (preg_match('/\n(0{512,})/', $remaining, $m, PREG_OFFSET_CAPTURE) === 1) {
            // No cleartomark: fall back to an unbroken run of at least 512 zeros.
            $hexPart = substr($remaining, 0, $m[0][1]);
            $trailerPart = substr($remaining, $m[0][1]);
        }

        $hexClean = preg_replace('/\s+/', '', $hexPart) ?? '';
        $binarySegment = $this->decodeHex($hexClean);

        return [
            $asciiSegment,
            $binarySegment,
            $trailerPart,
            strlen($asciiSegment),
            strlen($binarySegment),
            strlen($trailerPart),
        ];
    }

    /**
     * Locate where the PFA trailer begins, scanning backwards from "cleartomark".
     *
     * Whitespace and at most TRAILER_ZERO_COUNT '0' characters are assigned to
     * the trailer. Capping the zero count keeps hex digits of the encrypted
     * data that happen to be '0' out of the trailer.
     *
     * @param string $text    Everything that follows the ASCII segment.
     * @param int    $markPos Offset of "cleartomark" inside $text.
     *
     * @return int Offset in $text at which the trailer starts.
     */
    private function findTrailerStart(string $text, int $markPos): int
    {
        $pos = $markPos;
        $zeros = 0;

        while ($pos > 0) {
            $ch = $text[$pos - 1];
            if ($ch === '0' && $zeros < self::TRAILER_ZERO_COUNT) {
                $zeros++;
            } elseif ($ch !== "\n" && $ch !== "\r" && $ch !== ' ') {
                break;
            }
            $pos--;
        }

        return $pos;
    }

    /**
     * Decode a whitespace-free hex string.
     *
     * Returns an empty string for input that is empty, has an odd number of
     * digits, or contains a non-hex character, without raising a PHP warning.
     */
    private function decodeHex(string $hex): string
    {
        $length = strlen($hex);
        if ($length === 0 || $length % 2 !== 0 || strspn($hex, self::HEX_DIGITS) !== $length) {
            return '';
        }

        $decoded = hex2bin($hex);

        return $decoded === false ? '' : $decoded;
    }

    /**
     * Parse font metrics from the ASCII header section.
     *
     * Values that the header does not declare keep their defaults. Ascent and
     * descent come from the FontBBox (falling back to 800 / -200), cap height
     * is estimated as 70% of the ascent, and stemV is guessed from weight
     * keywords in the font name. xHeight is never derived and stays 0.
     *
     * @return array<string, mixed>
     */
    private function parseAsciiHeader(string $ascii): array
    {
        $metrics = [
            'fontName' => 'Unknown',
            'familyName' => 'Unknown',
            'italicAngle' => 0.0,
            'isFixedPitch' => false,
            'fontBBox' => [0, 0, 0, 0],
            'ascent' => 0,
            'descent' => 0,
            'capHeight' => 0,
            'xHeight' => 0,
            'stemV' => 0,
            'underlinePosition' => 0,
            'underlineThickness' => 0,
        ];

        if (preg_match('/\/FontName\s*\/(\S+)/', $ascii, $m)) {
            $metrics['fontName'] = $m[1];
        }

        // NOTE(review): /FullName takes precedence over /FamilyName here;
        // confirm that is what consumers of familyName expect.
        if (preg_match('/\/FullName\s*\(([^)]*)\)/', $ascii, $m)) {
            $metrics['familyName'] = $m[1];
        }
        if ($metrics['familyName'] === 'Unknown' && preg_match('/\/FamilyName\s*\(([^)]*)\)/', $ascii, $m)) {
            $metrics['familyName'] = $m[1];
        }

        if (preg_match('/\/ItalicAngle\s+([-\d.]+)/', $ascii, $m)) {
            $metrics['italicAngle'] = (float) $m[1];
        }

        if (preg_match('/\/isFixedPitch\s+(true|false)/i', $ascii, $m)) {
            $metrics['isFixedPitch'] = strtolower($m[1]) === 'true';
        }

        if (preg_match('/\/FontBBox\s*\{?\s*([-\d.]+)\s+([-\d.]+)\s+([-\d.]+)\s+([-\d.]+)\s*\}?/', $ascii, $m)) {
            $metrics['fontBBox'] = [(int) $m[1], (int) $m[2], (int) $m[3], (int) $m[4]];
        }

        if (preg_match('/\/UnderlinePosition\s+([-\d.]+)/', $ascii, $m)) {
            $metrics['underlinePosition'] = (int) $m[1];
        }

        if (preg_match('/\/UnderlineThickness\s+([-\d.]+)/', $ascii, $m)) {
            $metrics['underlineThickness'] = (int) $m[1];
        }

        // Derive vertical metrics from the bounding box.
        [, $bottom, , $top] = $metrics['fontBBox'];
        $metrics['ascent'] = $top > 0 ? $top : 800;
        $metrics['descent'] = $bottom < 0 ? $bottom : -200;
        $metrics['capHeight'] = (int) ($metrics['ascent'] * 0.7);

        // Estimate stemV from weight keywords in the font name.
        $name = strtolower($metrics['fontName']);
        $metrics['stemV'] = match (true) {
            self::containsAny($name, ['bold', 'black', 'heavy']) => 120,
            self::containsAny($name, ['light', 'thin']) => 50,
            default => 80,
        };

        return $metrics;
    }

    /**
     * Parse the Encoding array from the ASCII header.
     *
     * Type 1 fonts can define encoding as:
     *   - StandardEncoding (default reference)
     *   - ISOLatin1Encoding (answered with the WinAnsi table)
     *   - A custom encoding with "dup N /glyphname put" entries
     *
     * A custom encoding always yields 256 slots, pre-filled with ".notdef";
     * entries whose code is above 255 are ignored. When no /Encoding
     * definition is recognised, StandardEncoding is returned.
     *
     * @return array<int, string> byte => glyph name
     */
    private function parseEncoding(string $ascii): array
    {
        if (preg_match('/\/Encoding\s+StandardEncoding\s+def/', $ascii)) {
            return StandardEncodingTable::getTable();
        }

        if (preg_match('/\/Encoding\s+ISOLatin1Encoding\s+def/', $ascii)) {
            return \Phpdftk\Encoding\WinAnsiTable::getTable();
        }

        // Custom encoding:
        //   /Encoding 256 array
        //   0 1 255 { 1 index exch /.notdef put } for
        //   dup N /glyphname put
        //   ...
        //   readonly def
        if (!preg_match('/\/Encoding\s+(\d+)\s+array\b/s', $ascii)) {
            return StandardEncodingTable::getTable();
        }

        $encoding = array_fill(0, 256, '.notdef');
        if (preg_match_all('/dup\s+(\d+)\s+\/(\S+)\s+put/', $ascii, $entries, PREG_SET_ORDER)) {
            foreach ($entries as [, $codeText, $glyph]) {
                $code = (int) $codeText;
                if ($code <= 255) {
                    $encoding[$code] = $glyph;
                }
            }
        }

        return $encoding;
    }

    /**
     * Parse CharString glyph names from the ASCII header.
     *
     * The charstrings section looks like:
     *   /CharStrings N dict dup begin
     *   /glyphname N RD ... ND      (or "-|" ... "|-" instead of RD/ND)
     *
     * Only the names are extracted, not the charstring data. parse() does not
     * consume this list at present.
     *
     * @return list<string>
     */
    private function parseCharStringNames(string $ascii): array
    {
        // "-\|" matches the "-|" read operator; the unescaped form could never match it.
        if (!preg_match_all('/^\s*\/(\S+)\s+\d+\s+(?:RD|R|-\||-|)\s/m', $ascii, $matches)) {
            return [];
        }

        $names = [];
        foreach ($matches[1] as $name) {
            if (!in_array($name, ['CharStrings', 'Encoding', 'FontName'], true)) {
                $names[] = $name;
            }
        }

        return $names;
    }

    /**
     * Parse glyph widths from the ASCII header.
     *
     * Looks for a /Metrics dictionary and reads entries written either as
     * "/glyphname [n ...]" or "/glyphname n". Many Type 1 fonts don't expose
     * widths in the ASCII section (they're in the encrypted charstrings), so
     * this returns what it can find, possibly nothing.
     *
     * NOTE(review): for the array form the first number is taken as the
     * width. The PostScript reference describes a two-element /Metrics value
     * as [left-side-bearing width]; confirm which layout the fonts handled
     * here actually use.
     *
     * @return array<string, int> glyph name => width in 1000 units/em
     */
    private function parseGlyphWidths(string $ascii): array
    {
        // Word boundaries stop glyph names such as /endash from ending the block early.
        $blockPattern = '/\/Metrics\s+\d+\s+dict\s+(?:dup\s+)?begin\s+(.*?)\b(?:end|readonly)\b/s';
        if (!preg_match($blockPattern, $ascii, $metricsBlock)) {
            return [];
        }

        $widths = [];
        $entryPattern = '/\/([^\s\[\](){}<>\/%]+)(?:\s*\[\s*|\s+)([-\d.]+)/';
        if (preg_match_all($entryPattern, $metricsBlock[1], $entries, PREG_SET_ORDER)) {
            foreach ($entries as [, $glyph, $value]) {
                $widths[$glyph] = (int) round((float) $value);
            }
        }

        return $widths;
    }

    /**
     * Concatenate the three segments into the font program to embed.
     *
     * Despite the name, the result carries no PFB segment headers: it is the
     * raw ASCII, binary and trailer data back to back, matching the
     * Length1/Length2/Length3 values reported alongside it.
     */
    private function buildPfbBytes(string $ascii, string $binary, string $trailer): string
    {
        return $ascii . $binary . $trailer;
    }

    /**
     * Build PDF font flags from parsed metrics.
     *
     * ISO 32000-2, Table 123:
     *   Bit 1: FixedPitch
     *   Bit 2: Serif (assume serif unless name says otherwise)
     *   Bit 3: Symbolic
     *   Bit 4: Script (never set here)
     *   Bit 6: Nonsymbolic
     *   Bit 7: Italic
     *
     * Everything except FixedPitch and the italic angle is a heuristic based
     * on keywords in the lower-cased font name.
     *
     * @param array<string, mixed> $metrics Reads 'isFixedPitch', 'fontName' and 'italicAngle'.
     */
    private function buildFlags(array $metrics): int
    {
        $name = strtolower($metrics['fontName']);
        $flags = 0;

        if ($metrics['isFixedPitch']) {
            $flags |= (1 << 0); // FixedPitch
        }

        // Exactly one of Symbolic / Nonsymbolic is set. "zapf" alone is not a
        // symbol-font marker (ZapfChancery is a text face); ZapfDingbats is
        // still caught by "dingbat".
        $flags |= self::containsAny($name, ['symbol', 'dingbat', 'wingding'])
            ? (1 << 2)  // Symbolic
            : (1 << 5); // Nonsymbolic

        if (!self::containsAny($name, ['sans', 'arial', 'helvetica', 'gothic', 'futura'])) {
            $flags |= (1 << 1); // Serif
        }

        if ((float) $metrics['italicAngle'] !== 0.0 || self::containsAny($name, ['italic', 'oblique'])) {
            $flags |= (1 << 6); // Italic
        }

        return $flags;
    }

    /**
     * Tell whether $haystack contains at least one of the given substrings.
     *
     * @param list<string> $needles
     */
    private static function containsAny(string $haystack, array $needles): bool
    {
        foreach ($needles as $needle) {
            if (str_contains($haystack, $needle)) {
                return true;
            }
        }

        return false;
    }
}