Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | n/a |
0 / 0 |
n/a |
0 / 0 |
CRAP | n/a |
0 / 0 |
|||
| NamedCharacterReferences | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | n/a |
0 / 0 |
|||
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\Html\Tokenizer; |
| 6 | |
/**
 * Lookup tables for HTML character references (WHATWG HTML §13.5).
 *
 * Only a hand-picked portion of the specification's roughly 2200 named
 * references lives here: the legacy ASCII shortcuts, everyday typography,
 * currency signs, math operators, Greek letters, Latin-1 accented letters
 * and the common arrows. Those account for the overwhelming majority of
 * named references found in real-world HTML.
 *
 * To replace the subset with the complete table, download a fresh copy of
 * https://html.spec.whatwg.org/entities.json to
 * `vendor-data/whatwg/entities.json` and run
 * `php scripts/generate-html-entities.php`; the generator overwrites this
 * file with all ~2200 entries.
 *
 * WHATWG matching is longest-prefix: the tokenizer's named-character-
 * reference state probes successive lengths and keeps the longest name
 * that is present in the table.
 */
final class NamedCharacterReferences
{
    /**
     * Reference name → replacement text, encoded as UTF-8.
     *
     * Keys omit the leading `&`. A key ends in `;` unless it is one of the
     * legacy names that may be written without the terminator, in which
     * case both spellings are listed. A value may hold more than one
     * codepoint (in the full spec table, NotEqualTilde expands to two).
     *
     * @var array<string, string>
     */
    public const array TABLE = [
        // --- Legacy entries (listed with and without the trailing ;) ---
        'amp;'      => '&',
        'amp'       => '&',
        'lt;'       => '<',
        'lt'        => '<',
        'gt;'       => '>',
        'gt'        => '>',
        'quot;'     => '"',
        'quot'      => '"',
        'apos;'     => "'",
        'nbsp;'     => "\u{00A0}",
        'nbsp'      => "\u{00A0}",
        'copy;'     => "\u{00A9}",
        'copy'      => "\u{00A9}",
        'reg;'      => "\u{00AE}",
        'reg'       => "\u{00AE}",
        'trade;'    => "\u{2122}",

        // --- Typography ---
        'hellip;'   => "\u{2026}",
        'mdash;'    => "\u{2014}",
        'ndash;'    => "\u{2013}",
        'lsquo;'    => "\u{2018}",
        'rsquo;'    => "\u{2019}",
        'ldquo;'    => "\u{201C}",
        'rdquo;'    => "\u{201D}",
        'sbquo;'    => "\u{201A}",
        'bdquo;'    => "\u{201E}",
        'laquo;'    => "\u{00AB}",
        'raquo;'    => "\u{00BB}",
        'bull;'     => "\u{2022}",
        'middot;'   => "\u{00B7}",
        'sect;'     => "\u{00A7}",
        'para;'     => "\u{00B6}",
        'dagger;'   => "\u{2020}",
        'Dagger;'   => "\u{2021}",
        'permil;'   => "\u{2030}",
        'prime;'    => "\u{2032}",
        'Prime;'    => "\u{2033}",
        'lsaquo;'   => "\u{2039}",
        'rsaquo;'   => "\u{203A}",
        'oline;'    => "\u{203E}",
        'shy;'      => "\u{00AD}",
        'iexcl;'    => "\u{00A1}",
        'iquest;'   => "\u{00BF}",
        'brvbar;'   => "\u{00A6}",
        'deg;'      => "\u{00B0}",
        'acute;'    => "\u{00B4}",
        'cedil;'    => "\u{00B8}",
        'uml;'      => "\u{00A8}",
        'macr;'     => "\u{00AF}",
        'not;'      => "\u{00AC}",
        'curren;'   => "\u{00A4}",

        // --- Currency ---
        'cent;'     => "\u{00A2}",
        'pound;'    => "\u{00A3}",
        'yen;'      => "\u{00A5}",
        'euro;'     => "\u{20AC}",

        // --- Math operators ---
        'plusmn;'   => "\u{00B1}",
        'times;'    => "\u{00D7}",
        'divide;'   => "\u{00F7}",
        'minus;'    => "\u{2212}",
        'frac12;'   => "\u{00BD}",
        'frac14;'   => "\u{00BC}",
        'frac34;'   => "\u{00BE}",
        'sup1;'     => "\u{00B9}",
        'sup2;'     => "\u{00B2}",
        'sup3;'     => "\u{00B3}",
        'forall;'   => "\u{2200}",
        'part;'     => "\u{2202}",
        'exist;'    => "\u{2203}",
        'empty;'    => "\u{2205}",
        'nabla;'    => "\u{2207}",
        'isin;'     => "\u{2208}",
        'notin;'    => "\u{2209}",
        'ni;'       => "\u{220B}",
        'prod;'     => "\u{220F}",
        'sum;'      => "\u{2211}",
        'lowast;'   => "\u{2217}",
        'radic;'    => "\u{221A}",
        'prop;'     => "\u{221D}",
        'infin;'    => "\u{221E}",
        'ang;'      => "\u{2220}",
        'and;'      => "\u{2227}",
        'or;'       => "\u{2228}",
        'cap;'      => "\u{2229}",
        'cup;'      => "\u{222A}",
        'int;'      => "\u{222B}",
        'there4;'   => "\u{2234}",
        'sim;'      => "\u{223C}",
        'cong;'     => "\u{2245}",
        'asymp;'    => "\u{2248}",
        'ne;'       => "\u{2260}",
        'equiv;'    => "\u{2261}",
        'le;'       => "\u{2264}",
        'ge;'       => "\u{2265}",
        'sub;'      => "\u{2282}",
        'sup;'      => "\u{2283}",
        'nsub;'     => "\u{2284}",
        'sube;'     => "\u{2286}",
        'supe;'     => "\u{2287}",
        'oplus;'    => "\u{2295}",
        'otimes;'   => "\u{2297}",
        'perp;'     => "\u{22A5}",
        'sdot;'     => "\u{22C5}",

        // --- Greek letters (capitals, then small letters, then variants) ---
        'Alpha;'    => "\u{0391}",
        'Beta;'     => "\u{0392}",
        'Gamma;'    => "\u{0393}",
        'Delta;'    => "\u{0394}",
        'Epsilon;'  => "\u{0395}",
        'Zeta;'     => "\u{0396}",
        'Eta;'      => "\u{0397}",
        'Theta;'    => "\u{0398}",
        'Iota;'     => "\u{0399}",
        'Kappa;'    => "\u{039A}",
        'Lambda;'   => "\u{039B}",
        'Mu;'       => "\u{039C}",
        'Nu;'       => "\u{039D}",
        'Xi;'       => "\u{039E}",
        'Omicron;'  => "\u{039F}",
        'Pi;'       => "\u{03A0}",
        'Rho;'      => "\u{03A1}",
        'Sigma;'    => "\u{03A3}",
        'Tau;'      => "\u{03A4}",
        'Upsilon;'  => "\u{03A5}",
        'Phi;'      => "\u{03A6}",
        'Chi;'      => "\u{03A7}",
        'Psi;'      => "\u{03A8}",
        'Omega;'    => "\u{03A9}",
        'alpha;'    => "\u{03B1}",
        'beta;'     => "\u{03B2}",
        'gamma;'    => "\u{03B3}",
        'delta;'    => "\u{03B4}",
        'epsilon;'  => "\u{03B5}",
        'zeta;'     => "\u{03B6}",
        'eta;'      => "\u{03B7}",
        'theta;'    => "\u{03B8}",
        'iota;'     => "\u{03B9}",
        'kappa;'    => "\u{03BA}",
        'lambda;'   => "\u{03BB}",
        'mu;'       => "\u{03BC}",
        'nu;'       => "\u{03BD}",
        'xi;'       => "\u{03BE}",
        'omicron;'  => "\u{03BF}",
        'pi;'       => "\u{03C0}",
        'rho;'      => "\u{03C1}",
        'sigmaf;'   => "\u{03C2}",
        'sigma;'    => "\u{03C3}",
        'tau;'      => "\u{03C4}",
        'upsilon;'  => "\u{03C5}",
        'phi;'      => "\u{03C6}",
        'chi;'      => "\u{03C7}",
        'psi;'      => "\u{03C8}",
        'omega;'    => "\u{03C9}",
        'thetasym;' => "\u{03D1}",
        'upsih;'    => "\u{03D2}",
        'piv;'      => "\u{03D6}",

        // --- Latin-1 supplement (accented forms), capitals ---
        'Agrave;'   => "\u{00C0}",
        'Aacute;'   => "\u{00C1}",
        'Acirc;'    => "\u{00C2}",
        'Atilde;'   => "\u{00C3}",
        'Auml;'     => "\u{00C4}",
        'Aring;'    => "\u{00C5}",
        'AElig;'    => "\u{00C6}",
        'Ccedil;'   => "\u{00C7}",
        'Egrave;'   => "\u{00C8}",
        'Eacute;'   => "\u{00C9}",
        'Ecirc;'    => "\u{00CA}",
        'Euml;'     => "\u{00CB}",
        'Igrave;'   => "\u{00CC}",
        'Iacute;'   => "\u{00CD}",
        'Icirc;'    => "\u{00CE}",
        'Iuml;'     => "\u{00CF}",
        'ETH;'      => "\u{00D0}",
        'Ntilde;'   => "\u{00D1}",
        'Ograve;'   => "\u{00D2}",
        'Oacute;'   => "\u{00D3}",
        'Ocirc;'    => "\u{00D4}",
        'Otilde;'   => "\u{00D5}",
        'Ouml;'     => "\u{00D6}",
        'Oslash;'   => "\u{00D8}",
        'Ugrave;'   => "\u{00D9}",
        'Uacute;'   => "\u{00DA}",
        'Ucirc;'    => "\u{00DB}",
        'Uuml;'     => "\u{00DC}",
        'Yacute;'   => "\u{00DD}",
        'THORN;'    => "\u{00DE}",
        'szlig;'    => "\u{00DF}",

        // --- Latin-1 supplement (accented forms), small letters ---
        'agrave;'   => "\u{00E0}",
        'aacute;'   => "\u{00E1}",
        'acirc;'    => "\u{00E2}",
        'atilde;'   => "\u{00E3}",
        'auml;'     => "\u{00E4}",
        'aring;'    => "\u{00E5}",
        'aelig;'    => "\u{00E6}",
        'ccedil;'   => "\u{00E7}",
        'egrave;'   => "\u{00E8}",
        'eacute;'   => "\u{00E9}",
        'ecirc;'    => "\u{00EA}",
        'euml;'     => "\u{00EB}",
        'igrave;'   => "\u{00EC}",
        'iacute;'   => "\u{00ED}",
        'icirc;'    => "\u{00EE}",
        'iuml;'     => "\u{00EF}",
        'eth;'      => "\u{00F0}",
        'ntilde;'   => "\u{00F1}",
        'ograve;'   => "\u{00F2}",
        'oacute;'   => "\u{00F3}",
        'ocirc;'    => "\u{00F4}",
        'otilde;'   => "\u{00F5}",
        'ouml;'     => "\u{00F6}",
        'oslash;'   => "\u{00F8}",
        'ugrave;'   => "\u{00F9}",
        'uacute;'   => "\u{00FA}",
        'ucirc;'    => "\u{00FB}",
        'uuml;'     => "\u{00FC}",
        'yacute;'   => "\u{00FD}",
        'thorn;'    => "\u{00FE}",
        'yuml;'     => "\u{00FF}",

        // --- Latin Extended ---
        'OElig;'    => "\u{0152}",
        'oelig;'    => "\u{0153}",
        'Scaron;'   => "\u{0160}",
        'scaron;'   => "\u{0161}",
        'Yuml;'     => "\u{0178}",

        // --- Arrows ---
        'larr;'     => "\u{2190}",
        'uarr;'     => "\u{2191}",
        'rarr;'     => "\u{2192}",
        'darr;'     => "\u{2193}",
        'harr;'     => "\u{2194}",
        'lArr;'     => "\u{21D0}",
        'uArr;'     => "\u{21D1}",
        'rArr;'     => "\u{21D2}",
        'dArr;'     => "\u{21D3}",
        'hArr;'     => "\u{21D4}",
        'crarr;'    => "\u{21B5}",

        // --- Spaces & layout ---
        'ensp;'     => "\u{2002}",
        'emsp;'     => "\u{2003}",
        'thinsp;'   => "\u{2009}",
        'zwnj;'     => "\u{200C}",
        'zwj;'      => "\u{200D}",
        'lrm;'      => "\u{200E}",
        'rlm;'      => "\u{200F}",

        // --- Geometric & misc symbols ---
        'loz;'      => "\u{25CA}",
        'spades;'   => "\u{2660}",
        'clubs;'    => "\u{2663}",
        'hearts;'   => "\u{2665}",
        'diams;'    => "\u{2666}",
        'circ;'     => "\u{02C6}",
        'tilde;'    => "\u{02DC}",
    ];

    /**
     * Legacy names that the spec accepts without a terminating semicolon.
     *
     * Every name listed here also appears in TABLE as a key without the
     * trailing `;`.
     *
     * @var list<string>
     */
    public const array NO_SEMICOLON_ALLOWED = [
        'amp',
        'lt',
        'gt',
        'quot',
        'nbsp',
        'copy',
        'reg',
    ];

    /**
     * Codepoint substitutions applied to numeric character references, per
     * WHATWG §13.2.5.80: C1-range values (0x80–0x9F) are remapped for
     * Windows-1252 compatibility.
     *
     * Key is the codepoint as written in the reference, value is the
     * codepoint to use instead. 0x81, 0x8D, 0x8F, 0x90 and 0x9D have no
     * entry.
     *
     * @var array<int, int>
     */
    public const array NUMERIC_REPLACEMENTS = [
        0x80 => 0x20AC,
        0x82 => 0x201A,
        0x83 => 0x0192,
        0x84 => 0x201E,
        0x85 => 0x2026,
        0x86 => 0x2020,
        0x87 => 0x2021,
        0x88 => 0x02C6,
        0x89 => 0x2030,
        0x8A => 0x0160,
        0x8B => 0x2039,
        0x8C => 0x0152,
        0x8E => 0x017D,
        0x91 => 0x2018,
        0x92 => 0x2019,
        0x93 => 0x201C,
        0x94 => 0x201D,
        0x95 => 0x2022,
        0x96 => 0x2013,
        0x97 => 0x2014,
        0x98 => 0x02DC,
        0x99 => 0x2122,
        0x9A => 0x0161,
        0x9B => 0x203A,
        0x9C => 0x0153,
        0x9E => 0x017E,
        0x9F => 0x0178,
    ];
}