Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
n/a
0 / 0
n/a
0 / 0
CRAP
n/a
0 / 0
NamedCharacterReferences
n/a
0 / 0
n/a
0 / 0
0
n/a
0 / 0
1<?php
2
3declare(strict_types=1);
4
5namespace Phpdftk\Html\Tokenizer;
6
7/**
8 * Named character references per WHATWG HTML §13.5.
9 *
10 * This file ships a hand-curated subset of the spec's ~2200-entry table:
11 * the highest-frequency entries (legacy ASCII shortcuts, common typography,
12 * Greek letters, math, currency, Latin-1 accented forms, common arrows).
13 * Together they cover the overwhelming majority of named references in
14 * real-world HTML.
15 *
16 * Full table generation: run `php scripts/generate-html-entities.php` after
17 * pulling a fresh copy of https://html.spec.whatwg.org/entities.json into
18 * `vendor-data/whatwg/entities.json`. The generator overwrites this file
19 * with the complete ~2200-entry table.
20 *
21 * Matching is longest-prefix per WHATWG: the tokenizer's named-character-
22 * reference state tries successive lengths and keeps the longest hit.
23 */
final class NamedCharacterReferences
{
    /**
     * Lookup table for named character references.
     *
     * Each key is a reference name with the leading `&` stripped. A key that
     * ends in `;` is the terminated spelling; the handful of keys without a
     * `;` are the unterminated legacy spellings listed in
     * {@see self::NO_SEMICOLON_ALLOWED}. Each value is the replacement text
     * encoded as a UTF-8 string.
     *
     * Every entry in this subset resolves to a single code point; the full
     * WHATWG table also contains names that expand to several code points
     * (e.g. NotEqualTilde produces two).
     *
     * @var array<string, string>
     */
    public const array TABLE = [
        // --- Legacy entries (terminated and, where present, unterminated) ---
        'amp;' => '&',
        'amp' => '&',
        'lt;' => '<',
        'lt' => '<',
        'gt;' => '>',
        'gt' => '>',
        'quot;' => '"',
        'quot' => '"',
        'apos;' => "'",
        'nbsp;' => "\u{00A0}",
        'nbsp' => "\u{00A0}",
        'copy;' => "\u{00A9}",
        'copy' => "\u{00A9}",
        'reg;' => "\u{00AE}",
        'reg' => "\u{00AE}",
        'trade;' => "\u{2122}",

        // --- Typography ---
        'hellip;' => "\u{2026}",
        'mdash;' => "\u{2014}",
        'ndash;' => "\u{2013}",
        'lsquo;' => "\u{2018}",
        'rsquo;' => "\u{2019}",
        'ldquo;' => "\u{201C}",
        'rdquo;' => "\u{201D}",
        'sbquo;' => "\u{201A}",
        'bdquo;' => "\u{201E}",
        'laquo;' => "\u{00AB}",
        'raquo;' => "\u{00BB}",
        'bull;' => "\u{2022}",
        'middot;' => "\u{00B7}",
        'sect;' => "\u{00A7}",
        'para;' => "\u{00B6}",
        'dagger;' => "\u{2020}",
        'Dagger;' => "\u{2021}",
        'permil;' => "\u{2030}",
        'prime;' => "\u{2032}",
        'Prime;' => "\u{2033}",
        'lsaquo;' => "\u{2039}",
        'rsaquo;' => "\u{203A}",
        'oline;' => "\u{203E}",
        'shy;' => "\u{00AD}",
        'iexcl;' => "\u{00A1}",
        'iquest;' => "\u{00BF}",
        'brvbar;' => "\u{00A6}",
        'deg;' => "\u{00B0}",
        'acute;' => "\u{00B4}",
        'cedil;' => "\u{00B8}",
        'uml;' => "\u{00A8}",
        'macr;' => "\u{00AF}",
        'not;' => "\u{00AC}",
        'curren;' => "\u{00A4}",

        // --- Currency ---
        'cent;' => "\u{00A2}",
        'pound;' => "\u{00A3}",
        'yen;' => "\u{00A5}",
        'euro;' => "\u{20AC}",

        // --- Math operators ---
        'plusmn;' => "\u{00B1}",
        'times;' => "\u{00D7}",
        'divide;' => "\u{00F7}",
        'minus;' => "\u{2212}",
        'frac12;' => "\u{00BD}",
        'frac14;' => "\u{00BC}",
        'frac34;' => "\u{00BE}",
        'sup1;' => "\u{00B9}",
        'sup2;' => "\u{00B2}",
        'sup3;' => "\u{00B3}",
        'forall;' => "\u{2200}",
        'part;' => "\u{2202}",
        'exist;' => "\u{2203}",
        'empty;' => "\u{2205}",
        'nabla;' => "\u{2207}",
        'isin;' => "\u{2208}",
        'notin;' => "\u{2209}",
        'ni;' => "\u{220B}",
        'prod;' => "\u{220F}",
        'sum;' => "\u{2211}",
        'lowast;' => "\u{2217}",
        'radic;' => "\u{221A}",
        'prop;' => "\u{221D}",
        'infin;' => "\u{221E}",
        'ang;' => "\u{2220}",
        'and;' => "\u{2227}",
        'or;' => "\u{2228}",
        'cap;' => "\u{2229}",
        'cup;' => "\u{222A}",
        'int;' => "\u{222B}",
        'there4;' => "\u{2234}",
        'sim;' => "\u{223C}",
        'cong;' => "\u{2245}",
        'asymp;' => "\u{2248}",
        'ne;' => "\u{2260}",
        'equiv;' => "\u{2261}",
        'le;' => "\u{2264}",
        'ge;' => "\u{2265}",
        'sub;' => "\u{2282}",
        'sup;' => "\u{2283}",
        'nsub;' => "\u{2284}",
        'sube;' => "\u{2286}",
        'supe;' => "\u{2287}",
        'oplus;' => "\u{2295}",
        'otimes;' => "\u{2297}",
        'perp;' => "\u{22A5}",
        'sdot;' => "\u{22C5}",

        // --- Greek letters: capitals ---
        'Alpha;' => "\u{0391}",
        'Beta;' => "\u{0392}",
        'Gamma;' => "\u{0393}",
        'Delta;' => "\u{0394}",
        'Epsilon;' => "\u{0395}",
        'Zeta;' => "\u{0396}",
        'Eta;' => "\u{0397}",
        'Theta;' => "\u{0398}",
        'Iota;' => "\u{0399}",
        'Kappa;' => "\u{039A}",
        'Lambda;' => "\u{039B}",
        'Mu;' => "\u{039C}",
        'Nu;' => "\u{039D}",
        'Xi;' => "\u{039E}",
        'Omicron;' => "\u{039F}",
        'Pi;' => "\u{03A0}",
        'Rho;' => "\u{03A1}",
        'Sigma;' => "\u{03A3}",
        'Tau;' => "\u{03A4}",
        'Upsilon;' => "\u{03A5}",
        'Phi;' => "\u{03A6}",
        'Chi;' => "\u{03A7}",
        'Psi;' => "\u{03A8}",
        'Omega;' => "\u{03A9}",

        // --- Greek letters: small ---
        'alpha;' => "\u{03B1}",
        'beta;' => "\u{03B2}",
        'gamma;' => "\u{03B3}",
        'delta;' => "\u{03B4}",
        'epsilon;' => "\u{03B5}",
        'zeta;' => "\u{03B6}",
        'eta;' => "\u{03B7}",
        'theta;' => "\u{03B8}",
        'iota;' => "\u{03B9}",
        'kappa;' => "\u{03BA}",
        'lambda;' => "\u{03BB}",
        'mu;' => "\u{03BC}",
        'nu;' => "\u{03BD}",
        'xi;' => "\u{03BE}",
        'omicron;' => "\u{03BF}",
        'pi;' => "\u{03C0}",
        'rho;' => "\u{03C1}",
        'sigmaf;' => "\u{03C2}",
        'sigma;' => "\u{03C3}",
        'tau;' => "\u{03C4}",
        'upsilon;' => "\u{03C5}",
        'phi;' => "\u{03C6}",
        'chi;' => "\u{03C7}",
        'psi;' => "\u{03C8}",
        'omega;' => "\u{03C9}",

        // --- Greek letters: symbol variants ---
        'thetasym;' => "\u{03D1}",
        'upsih;' => "\u{03D2}",
        'piv;' => "\u{03D6}",

        // --- Latin-1 supplement: capital accented forms ---
        'Agrave;' => "\u{00C0}",
        'Aacute;' => "\u{00C1}",
        'Acirc;' => "\u{00C2}",
        'Atilde;' => "\u{00C3}",
        'Auml;' => "\u{00C4}",
        'Aring;' => "\u{00C5}",
        'AElig;' => "\u{00C6}",
        'Ccedil;' => "\u{00C7}",
        'Egrave;' => "\u{00C8}",
        'Eacute;' => "\u{00C9}",
        'Ecirc;' => "\u{00CA}",
        'Euml;' => "\u{00CB}",
        'Igrave;' => "\u{00CC}",
        'Iacute;' => "\u{00CD}",
        'Icirc;' => "\u{00CE}",
        'Iuml;' => "\u{00CF}",
        'ETH;' => "\u{00D0}",
        'Ntilde;' => "\u{00D1}",
        'Ograve;' => "\u{00D2}",
        'Oacute;' => "\u{00D3}",
        'Ocirc;' => "\u{00D4}",
        'Otilde;' => "\u{00D5}",
        'Ouml;' => "\u{00D6}",
        'Oslash;' => "\u{00D8}",
        'Ugrave;' => "\u{00D9}",
        'Uacute;' => "\u{00DA}",
        'Ucirc;' => "\u{00DB}",
        'Uuml;' => "\u{00DC}",
        'Yacute;' => "\u{00DD}",
        'THORN;' => "\u{00DE}",
        'szlig;' => "\u{00DF}",

        // --- Latin-1 supplement: small accented forms ---
        'agrave;' => "\u{00E0}",
        'aacute;' => "\u{00E1}",
        'acirc;' => "\u{00E2}",
        'atilde;' => "\u{00E3}",
        'auml;' => "\u{00E4}",
        'aring;' => "\u{00E5}",
        'aelig;' => "\u{00E6}",
        'ccedil;' => "\u{00E7}",
        'egrave;' => "\u{00E8}",
        'eacute;' => "\u{00E9}",
        'ecirc;' => "\u{00EA}",
        'euml;' => "\u{00EB}",
        'igrave;' => "\u{00EC}",
        'iacute;' => "\u{00ED}",
        'icirc;' => "\u{00EE}",
        'iuml;' => "\u{00EF}",
        'eth;' => "\u{00F0}",
        'ntilde;' => "\u{00F1}",
        'ograve;' => "\u{00F2}",
        'oacute;' => "\u{00F3}",
        'ocirc;' => "\u{00F4}",
        'otilde;' => "\u{00F5}",
        'ouml;' => "\u{00F6}",
        'oslash;' => "\u{00F8}",
        'ugrave;' => "\u{00F9}",
        'uacute;' => "\u{00FA}",
        'ucirc;' => "\u{00FB}",
        'uuml;' => "\u{00FC}",
        'yacute;' => "\u{00FD}",
        'thorn;' => "\u{00FE}",
        'yuml;' => "\u{00FF}",

        // --- Latin Extended ---
        'OElig;' => "\u{0152}",
        'oelig;' => "\u{0153}",
        'Scaron;' => "\u{0160}",
        'scaron;' => "\u{0161}",
        'Yuml;' => "\u{0178}",

        // --- Arrows ---
        'larr;' => "\u{2190}",
        'uarr;' => "\u{2191}",
        'rarr;' => "\u{2192}",
        'darr;' => "\u{2193}",
        'harr;' => "\u{2194}",
        'lArr;' => "\u{21D0}",
        'uArr;' => "\u{21D1}",
        'rArr;' => "\u{21D2}",
        'dArr;' => "\u{21D3}",
        'hArr;' => "\u{21D4}",
        'crarr;' => "\u{21B5}",

        // --- Spaces & layout controls ---
        'ensp;' => "\u{2002}",
        'emsp;' => "\u{2003}",
        'thinsp;' => "\u{2009}",
        'zwnj;' => "\u{200C}",
        'zwj;' => "\u{200D}",
        'lrm;' => "\u{200E}",
        'rlm;' => "\u{200F}",

        // --- Geometric & miscellaneous symbols ---
        'loz;' => "\u{25CA}",
        'spades;' => "\u{2660}",
        'clubs;' => "\u{2663}",
        'hearts;' => "\u{2665}",
        'diams;' => "\u{2666}",
        'circ;' => "\u{02C6}",
        'tilde;' => "\u{02DC}",
    ];

    /**
     * Legacy names that are recognised even when the trailing semicolon is
     * missing. Each name listed here is also present in {@see self::TABLE}
     * in both its terminated and unterminated spelling.
     *
     * NOTE(review): the WHATWG table permits further legacy names without a
     * semicolon (for example `eacute` and `not`); only these seven are
     * carried in this curated subset — confirm that is intentional.
     *
     * @var list<string>
     */
    public const array NO_SEMICOLON_ALLOWED = [
        'amp',
        'lt',
        'gt',
        'quot',
        'nbsp',
        'copy',
        'reg',
    ];

    /**
     * Substitutions applied to numeric character references per WHATWG
     * §13.2.5.80: a reference to one of these C1-range code points (key) is
     * replaced by the character Windows-1252 assigns to that byte (value).
     *
     * 0x81, 0x8D, 0x8F, 0x90 and 0x9D have no entry and are therefore not
     * remapped by this table.
     *
     * @var array<int, int>
     */
    public const array NUMERIC_REPLACEMENTS = [
        0x80 => 0x20AC, // EURO SIGN
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS
        0x86 => 0x2020, // DAGGER
        0x87 => 0x2021, // DOUBLE DAGGER
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT
        0x89 => 0x2030, // PER MILLE SIGN
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK
        0x95 => 0x2022, // BULLET
        0x96 => 0x2013, // EN DASH
        0x97 => 0x2014, // EM DASH
        0x98 => 0x02DC, // SMALL TILDE
        0x99 => 0x2122, // TRADE MARK SIGN
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS
    ];
}
185}