Code Coverage for /home/runner/work/phpdftk/phpdftk/packages/font-parser/src/Type1Parser.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	89.19% covered (warning)	89.19%	165 / 185	27.27% covered (danger)	27.27%	3 / 11	CRAP	0.00% covered (danger)	0.00%	0 / 1
Type1Parser	89.19% covered (warning)	89.19%	165 / 185	27.27% covered (danger)	27.27%	3 / 11	92.92	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
fromBytes	80.00% covered (warning)	80.00%	4 / 5	0.00% covered (danger)	0.00%	0 / 1	2.03
parse	97.50% covered (success)	97.50%	39 / 40	0.00% covered (danger)	0.00%	0 / 1	7
parsePfb	100.00% covered (success)	100.00%	27 / 27	100.00% covered (success)	100.00%	1 / 1	7
parsePfa	78.12% covered (warning)	78.12%	25 / 32	0.00% covered (danger)	0.00%	0 / 1	20.03
parseAsciiHeader	95.12% covered (success)	95.12%	39 / 41	0.00% covered (danger)	0.00%	0 / 1	17
parseEncoding	85.71% covered (warning)	85.71%	12 / 14	0.00% covered (danger)	0.00%	0 / 1	8.19
parseCharStringNames	50.00% covered (danger)	50.00%	3 / 6	0.00% covered (danger)	0.00%	0 / 1	10.50
parseGlyphWidths	50.00% covered (danger)	50.00%	3 / 6	0.00% covered (danger)	0.00%	0 / 1	6.00
buildPfbBytes	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
buildFlags	91.67% covered (success)	91.67%	11 / 12	0.00% covered (danger)	0.00%	0 / 1	14.11

1	<?php
2
3	declare(strict_types=1);
4
5	namespace Phpdftk\FontParser;
6
7	use Phpdftk\Filesystem\LocalFilesystem;
8	use Phpdftk\Encoding\GlyphList;
9	use Phpdftk\Encoding\StandardEncodingTable;
10
11	/**
12	* Parses Type 1 font files (PFB binary and PFA ASCII formats).
13	*
14	* Extracts font metrics, encoding, glyph widths, and segment lengths
15	* needed for PDF embedding via Type1FontFile.
16	*/
17	class Type1Parser
18	{
/**
 * @param string $path Path of the Type 1 font file that parse() reads.
 */
public function __construct(
    private readonly string $path,
) {
}
20
/**
 * Create a parser from raw font bytes instead of a file path.
 *
 * The bytes are spooled to a temporary file in the system temp directory,
 * because the parser reads its input from a path. The file is removed when
 * the PHP process shuts down.
 *
 * @param string $fontBytes Complete contents of a PFB or PFA font file.
 *
 * @throws \RuntimeException If the temporary file cannot be created or written.
 */
public static function fromBytes(string $fontBytes): self
{
    $tmp = tempnam(sys_get_temp_dir(), 'phpdftk_t1_');
    if ($tmp === false) {
        throw new \RuntimeException('Cannot create temp file for font data');
    }

    // file_put_contents() returns false on failure and may report a short
    // write; either case would leave a corrupt font on disk.
    if (file_put_contents($tmp, $fontBytes) !== strlen($fontBytes)) {
        if (is_file($tmp)) {
            unlink($tmp);
        }
        throw new \RuntimeException('Cannot write font data to temp file');
    }

    // The parser only keeps the path, so the file must outlive this call.
    // Clean it up at shutdown instead of leaking it in the temp directory.
    register_shutdown_function(static function () use ($tmp): void {
        if (is_file($tmp)) {
            unlink($tmp);
        }
    });

    return new self($tmp);
}
33
/**
 * Read the font file at the configured path and build a Type1Data from it.
 *
 * The file is split into its clear-text header, encrypted body and trailer
 * (PFB or PFA, chosen by the first byte). Metrics, encoding and glyph widths
 * are then read from the clear-text header.
 */
public function parse(): Type1Data
{
    $contents = LocalFilesystem::readFile($this->path, "font file");

    // A leading 0x80 byte is the PFB segment marker; anything else is handled as PFA.
    $isPfb = strlen($contents) >= 2 && ord($contents[0]) === 0x80;
    $segments = $isPfb ? $this->parsePfb($contents) : $this->parsePfa($contents);
    [$header, $encrypted, $trailer, $length1, $length2, $length3] = $segments;

    $metrics = $this->parseAsciiHeader($header);
    $encoding = $this->parseEncoding($header);

    // Glyph names found in the header; the result is not consumed below.
    $this->parseCharStringNames($header);

    // Widths are only available when the header declares a /Metrics dictionary.
    $glyphWidths = $this->parseGlyphWidths($header);

    // Project the per-glyph data onto character codes via the encoding.
    $glyphList = GlyphList::getList();
    $charWidths = [];
    $unicodeMap = [];
    foreach ($encoding as $code => $glyphName) {
        if ($glyphName !== '.notdef') {
            if (isset($glyphWidths[$glyphName])) {
                $charWidths[$code] = $glyphWidths[$glyphName];
            }
            if (isset($glyphList[$glyphName])) {
                $unicodeMap[$code] = $glyphList[$glyphName];
            }
        }
    }

    // Font program bytes for embedding: the three segments back to back.
    $fontBytes = $this->buildPfbBytes($header, $encrypted, $trailer);
    $flags = $this->buildFlags($metrics);

    return new Type1Data(
        postScriptName: $metrics['fontName'],
        familyName: $metrics['familyName'],
        ascent: $metrics['ascent'],
        descent: $metrics['descent'],
        capHeight: $metrics['capHeight'],
        xHeight: $metrics['xHeight'],
        italicAngle: $metrics['italicAngle'],
        stemV: $metrics['stemV'],
        flags: $flags,
        fontBBox: $metrics['fontBBox'],
        charWidths: $charWidths,
        unicodeMap: $unicodeMap,
        fontBytes: $fontBytes,
        length1: $length1,
        length2: $length2,
        length3: $length3,
        glyphWidths: $glyphWidths,
        encoding: $encoding,
    );
}
105
/**
 * Parse PFB (Printer Font Binary) format.
 *
 * PFB files consist of segments, each with a 6-byte header:
 *   byte 0:    0x80 (start marker)
 *   byte 1:    segment type (1=ASCII, 2=binary, 3=EOF)
 *   bytes 2-5: segment length (little-endian uint32)
 *
 * ASCII segments seen before any binary data form the clear-text header;
 * ASCII segments seen afterwards form the trailer. Parsing stops at the EOF
 * segment, at the end of the data, or at the first byte that is not a
 * segment marker. Segments of any other type are skipped.
 *
 * @param string $data Raw PFB file contents.
 *
 * @return array{string, string, string, int, int, int}
 *         [ascii, binary, trailer, length1, length2, length3]
 *
 * @throws \RuntimeException If a segment header or payload is truncated.
 */
private function parsePfb(string $data): array
{
    $ascii = '';
    $binary = '';
    $trailer = '';
    $length1 = 0;
    $length2 = 0;
    $length3 = 0;

    $len = strlen($data);
    $offset = 0;

    // Both the marker and the type byte must be present to continue.
    while ($offset + 1 < $len && ord($data[$offset]) === 0x80) {
        $type = ord($data[$offset + 1]);
        if ($type === 3) {
            // EOF marker: a two-byte segment with no length field.
            break;
        }

        if ($offset + 6 > $len) {
            throw new \RuntimeException('Invalid PFB: truncated segment header');
        }
        $header = unpack('Vlength', substr($data, $offset + 2, 4));
        if ($header === false) {
            throw new \RuntimeException('Invalid PFB: unreadable segment header');
        }
        $segLen = $header['length'];

        // Reject lengths that run past the data, so the reported
        // Length1/2/3 values always match the bytes actually extracted.
        if ($segLen < 0 || $segLen > $len - $offset - 6) {
            throw new \RuntimeException('Invalid PFB: segment length exceeds file size');
        }

        $segData = substr($data, $offset + 6, $segLen);
        $offset += 6 + $segLen;

        if ($type === 1) {
            if ($binary === '') {
                $ascii .= $segData;
                $length1 += $segLen;
            } else {
                $trailer .= $segData;
                $length3 += $segLen;
            }
        } elseif ($type === 2) {
            $binary .= $segData;
            $length2 += $segLen;
        }
    }

    return [$ascii, $binary, $trailer, $length1, $length2, $length3];
}
158
/**
 * Parse PFA (Printer Font ASCII) format.
 *
 * PFA files are plain text. The encrypted portion is hex-encoded and sits
 * between the "eexec" keyword and the trailer, which is a block of 512
 * ASCII zeros optionally followed by "cleartomark".
 *
 * @param string $data Raw PFA file contents.
 *
 * @return array{string, string, string, int, int, int}
 *         [ascii, binary, trailer, length1, length2, length3]; binary is the
 *         decoded hex data, or '' when the hex section is malformed.
 *
 * @throws \RuntimeException If no "eexec" marker is present.
 */
private function parsePfa(string $data): array
{
    // The eexec keyword marks the boundary between the clear-text and encrypted sections.
    $eexecPos = strpos($data, 'eexec');
    if ($eexecPos === false) {
        throw new \RuntimeException('Invalid PFA: no eexec marker found');
    }

    // The ASCII section includes "eexec" plus one following line break or
    // space (a CRLF pair counts as one).
    $dataLen = strlen($data);
    $afterEexec = $eexecPos + 5;
    if ($afterEexec < $dataLen && in_array($data[$afterEexec], ["\n", "\r", ' '], true)) {
        $afterEexec++;
        if ($afterEexec < $dataLen && $data[$afterEexec - 1] === "\r" && $data[$afterEexec] === "\n") {
            $afterEexec++;
        }
    }

    $asciiSegment = substr($data, 0, $afterEexec);
    $remaining = substr($data, $afterEexec);

    $trailerPos = strrpos($remaining, 'cleartomark');
    if ($trailerPos !== false) {
        // Walk backwards from "cleartomark" over the zero block and its
        // whitespace. At most 512 zeros belong to the trailer; any further
        // '0' characters are hex digits of the encrypted data and must stay.
        $split = $trailerPos;
        $zeroCount = 0;
        while ($split > 0) {
            $ch = $remaining[$split - 1];
            if ($ch === "\n" || $ch === "\r" || $ch === ' ') {
                $split--;
            } elseif ($ch === '0' && $zeroCount < 512) {
                $zeroCount++;
                $split--;
            } else {
                break;
            }
        }
        $hexPart = substr($remaining, 0, $split);
        $trailerPart = substr($remaining, $split);
    } elseif (preg_match('/\n(0{512,})/', $remaining, $m, PREG_OFFSET_CAPTURE) === 1) {
        // No cleartomark: split at an unbroken run of at least 512 zeros.
        $hexPart = substr($remaining, 0, $m[0][1]);
        $trailerPart = substr($remaining, $m[0][1]);
    } else {
        $hexPart = $remaining;
        $trailerPart = '';
    }

    // Decode hex to binary. Validate first so malformed input yields an
    // empty segment instead of a hex2bin() warning.
    $hexClean = preg_replace('/\s+/', '', $hexPart) ?? '';
    $isValidHex = strlen($hexClean) % 2 === 0
        && strspn($hexClean, '0123456789abcdefABCDEF') === strlen($hexClean);
    $binarySegment = $isValidHex ? (hex2bin($hexClean) ?: '') : '';

    $length1 = strlen($asciiSegment);
    $length2 = strlen($binarySegment);
    $length3 = strlen($trailerPart);

    return [$asciiSegment, $binarySegment, $trailerPart, $length1, $length2, $length3];
}
228
/**
 * Parse font metrics from the ASCII header section.
 *
 * Values are read with regular expressions from the clear-text PostScript.
 * Keys that are absent keep their defaults. Ascent, descent and cap height
 * are derived from /FontBBox, and stemV is guessed from the font name;
 * xHeight is never set here and stays 0.
 *
 * @param string $ascii Clear-text header of the font program.
 *
 * @return array{
 *     fontName: string,
 *     familyName: string,
 *     italicAngle: float,
 *     isFixedPitch: bool,
 *     fontBBox: array{int, int, int, int},
 *     ascent: int,
 *     descent: int,
 *     capHeight: int,
 *     xHeight: int,
 *     stemV: int,
 *     underlinePosition: int,
 *     underlineThickness: int
 * }
 */
private function parseAsciiHeader(string $ascii): array
{
    $metrics = [
        'fontName' => 'Unknown',
        'familyName' => 'Unknown',
        'italicAngle' => 0.0,
        'isFixedPitch' => false,
        'fontBBox' => [0, 0, 0, 0],
        'ascent' => 0,
        'descent' => 0,
        'capHeight' => 0,
        'xHeight' => 0,
        'stemV' => 0,
        'underlinePosition' => 0,
        'underlineThickness' => 0,
    ];

    // /FontName /Name
    if (preg_match('/\/FontName\s*\/(\S+)/', $ascii, $m) === 1) {
        $metrics['fontName'] = $m[1];
    }

    // /FullName (Some Name) is preferred; /FamilyName (...) is the fallback.
    if (preg_match('/\/FullName\s*\(([^)]*)\)/', $ascii, $m) === 1) {
        $metrics['familyName'] = $m[1];
    }
    if ($metrics['familyName'] === 'Unknown'
        && preg_match('/\/FamilyName\s*\(([^)]*)\)/', $ascii, $m) === 1
    ) {
        $metrics['familyName'] = $m[1];
    }

    // /ItalicAngle
    if (preg_match('/\/ItalicAngle\s+([-\d.]+)/', $ascii, $m) === 1) {
        $metrics['italicAngle'] = (float) $m[1];
    }

    // /isFixedPitch
    if (preg_match('/\/isFixedPitch\s+(true|false)/i', $ascii, $m) === 1) {
        $metrics['isFixedPitch'] = strtolower($m[1]) === 'true';
    }

    // /FontBBox appears both as {llx lly urx ury} and as [llx lly urx ury].
    $bboxPattern = '/\/FontBBox\s*[\{\[]?\s*([-\d.]+)\s+([-\d.]+)\s+([-\d.]+)\s+([-\d.]+)\s*[\}\]]?/';
    if (preg_match($bboxPattern, $ascii, $m) === 1) {
        $metrics['fontBBox'] = [(int) $m[1], (int) $m[2], (int) $m[3], (int) $m[4]];
    }

    // /UnderlinePosition
    if (preg_match('/\/UnderlinePosition\s+([-\d.]+)/', $ascii, $m) === 1) {
        $metrics['underlinePosition'] = (int) $m[1];
    }

    // /UnderlineThickness
    if (preg_match('/\/UnderlineThickness\s+([-\d.]+)/', $ascii, $m) === 1) {
        $metrics['underlineThickness'] = (int) $m[1];
    }

    // Derive ascent/descent from the bounding box, with fallbacks when the
    // box is missing or degenerate.
    $bbox = $metrics['fontBBox'];
    $metrics['ascent'] = $bbox[3] > 0 ? $bbox[3] : 800;
    $metrics['descent'] = $bbox[1] < 0 ? $bbox[1] : -200;
    // Estimate cap height as ~70% of ascent.
    $metrics['capHeight'] = (int) ($metrics['ascent'] * 0.7);

    // Estimate stemV from weight keywords in the font name.
    $name = strtolower($metrics['fontName']);
    if (str_contains($name, 'bold') || str_contains($name, 'black') || str_contains($name, 'heavy')) {
        $metrics['stemV'] = 120;
    } elseif (str_contains($name, 'light') || str_contains($name, 'thin')) {
        $metrics['stemV'] = 50;
    } else {
        $metrics['stemV'] = 80;
    }

    return $metrics;
}
309
/**
 * Determine the font's encoding vector from the ASCII header.
 *
 * Recognised forms, checked in this order:
 * - "/Encoding StandardEncoding def"  -> the StandardEncoding table
 * - "/Encoding ISOLatin1Encoding def" -> the WinAnsi table
 * - "/Encoding N array" followed by "dup N /glyphname put" entries
 *
 * Anything else falls back to the StandardEncoding table.
 *
 * @return array<int, string> byte => glyph name
 */
private function parseEncoding(string $ascii): array
{
    if (preg_match('/\/Encoding\s+StandardEncoding\s+def/', $ascii)) {
        return StandardEncodingTable::getTable();
    }

    if (preg_match('/\/Encoding\s+ISOLatin1Encoding\s+def/', $ascii)) {
        return \Phpdftk\Encoding\WinAnsiTable::getTable();
    }

    // No explicit array declaration either: default to StandardEncoding.
    if (!preg_match('/\/Encoding\s+(\d+)\s+array\b/s', $ascii)) {
        return StandardEncodingTable::getTable();
    }

    // Custom vector: every slot is .notdef until a "dup N /glyphname put"
    // entry overrides it.
    $table = array_fill(0, 256, '.notdef');

    $found = preg_match_all('/dup\s+(\d+)\s+\/(\S+)\s+put/', $ascii, $entries, PREG_SET_ORDER);
    if ($found) {
        foreach ($entries as $entry) {
            $slot = (int) $entry[1];
            if ($slot < 0 || $slot > 255) {
                continue;
            }
            $table[$slot] = $entry[2];
        }
    }

    return $table;
}
359
/**
 * Parse CharString glyph names from the ASCII header.
 *
 * The charstrings section looks like:
 *   /CharStrings N dict dup begin
 *   /glyphname N RD ... ND
 *
 * A line is taken as a charstring entry when a name is followed by a byte
 * count and one of the read operators "RD", "R" or "-|". Only the names are
 * extracted, not the charstring data. The dictionary keys CharStrings,
 * Encoding and FontName are never reported as glyphs.
 *
 * @param string $ascii Clear-text header of the font program.
 *
 * @return list<string>
 */
private function parseCharStringNames(string $ascii): array
{
    if (!preg_match_all('/^\s*\/(\S+)\s+\d+\s+(?:RD|R|-\|)\s/m', $ascii, $matches)) {
        return [];
    }

    $reserved = ['CharStrings', 'Encoding', 'FontName'];

    return array_values(array_filter(
        $matches[1],
        static fn (string $name): bool => !in_array($name, $reserved, true),
    ));
}
383
/**
 * Parse glyph widths from the ASCII header.
 *
 * Only a /Metrics dictionary is consulted. Many Type 1 fonts do not expose
 * widths in the clear-text section at all (they live in the encrypted
 * charstrings), in which case the result is empty.
 *
 * Entry forms, following the PostScript Metrics convention:
 *   /glyph wx                  x width only
 *   /glyph [sbx wx]            left sidebearing and x width
 *   /glyph [sbx sby wx wy]     sidebearing vector and width vector
 * A one-element array is read as a bare width. Entries of any other shape
 * are skipped.
 *
 * @param string $ascii Clear-text header of the font program.
 *
 * @return array<string, int> glyph name => x width, rounded to an integer
 */
private function parseGlyphWidths(string $ascii): array
{
    // The lookbehind and word boundary stop the terminator from matching
    // inside glyph names such as /endash.
    $blockPattern = '/\/Metrics\s+\d+\s+dict\s+(?:dup\s+)?begin\s+(.*?)(?<![\/\w])(?:end|readonly)\b/s';
    if (preg_match($blockPattern, $ascii, $metricsBlock) !== 1) {
        return [];
    }

    // Group 2 captures the contents of an array value, group 3 a bare number.
    $entryPattern = '/\/([^\s\[]+)(?:\s*\[([^\]]*)\]|\s+([-+]?(?:\d+\.?\d*|\.\d+)))/';
    if (!preg_match_all($entryPattern, $metricsBlock[1], $entries, PREG_SET_ORDER)) {
        return [];
    }

    $widths = [];
    foreach ($entries as $entry) {
        $scalar = $entry[3] ?? '';
        if ($scalar !== '') {
            $wx = $scalar;
        } else {
            $numbers = preg_split('/\s+/', trim($entry[2]), -1, PREG_SPLIT_NO_EMPTY) ?: [];
            // The position of the x width depends on the array length.
            $wx = match (count($numbers)) {
                1 => $numbers[0],
                2 => $numbers[1],
                4 => $numbers[2],
                default => null,
            };
        }

        if ($wx !== null && is_numeric($wx)) {
            $widths[$entry[1]] = (int) round((float) $wx);
        }
    }

    return $widths;
}
409
/**
 * Join the three font segments into one byte string for embedding.
 *
 * The segments are concatenated as-is, in order: clear-text header,
 * encrypted body, trailer. No PFB segment headers are added, so the
 * Length1/Length2/Length3 values reported alongside describe consecutive
 * slices of the result.
 */
private function buildPfbBytes(string $ascii, string $binary, string $trailer): string
{
    return implode('', [$ascii, $binary, $trailer]);
}
421
/**
 * Build PDF font descriptor flags from parsed metrics.
 *
 * ISO 32000-2, Table 123 (bit positions are 1-based):
 *   Bit 1: FixedPitch
 *   Bit 2: Serif (assumed unless the name suggests a sans-serif face)
 *   Bit 3: Symbolic
 *   Bit 4: Script (never set here)
 *   Bit 6: Nonsymbolic
 *   Bit 7: Italic
 *
 * Symbolic and Nonsymbolic are mutually exclusive; exactly one is set.
 * All classification other than FixedPitch and the italic angle is a
 * heuristic based on keywords in the lower-cased font name.
 *
 * @param array<string, mixed> $metrics Reads 'isFixedPitch', 'fontName' and 'italicAngle'.
 */
private function buildFlags(array $metrics): int
{
    $name = strtolower($metrics['fontName']);

    $nameHasAny = static function (array $keywords) use ($name): bool {
        foreach ($keywords as $keyword) {
            if (str_contains($name, $keyword)) {
                return true;
            }
        }

        return false;
    };

    $flags = 0;

    if ($metrics['isFixedPitch']) {
        $flags |= (1 << 0); // FixedPitch
    }

    // Assume non-symbolic (standard Latin encoding) unless the font name indicates otherwise.
    if ($nameHasAny(['symbol', 'zapf', 'dingbat', 'wingding'])) {
        $flags |= (1 << 2); // Symbolic
    } else {
        $flags |= (1 << 5); // Nonsymbolic
    }

    // Serif is the default; well-known sans-serif keywords clear it.
    if (!$nameHasAny(['sans', 'arial', 'helvetica', 'gothic', 'futura'])) {
        $flags |= (1 << 1); // Serif
    }

    // Italic: a non-zero slant angle or an italic/oblique style name.
    if ((float) $metrics['italicAngle'] !== 0.0 || $nameHasAny(['italic', 'oblique'])) {
        $flags |= (1 << 6); // Italic
    }

    return $flags;
}
464	}