Code Coverage for /home/runner/work/phpdftk/phpdftk/packages/html/src/Tokenizer/Tokenizer.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	85.85% covered (warning)	85.85%	1171 / 1364	60.71% covered (warning)	60.71%	68 / 112	CRAP	0.00% covered (danger)	0.00%	0 / 1
Tokenizer	85.85% covered (warning)	85.85%	1171 / 1364	60.71% covered (warning)	60.71%	68 / 112	1563.12	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	3
tokenize	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2
nextToken	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	4
errors	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
step	100.00% covered (success)	100.00%	81 / 81	100.00% covered (success)	100.00%	1 / 1	81
preprocess	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
consume	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	4
reconsumeIn	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
peekRemaining	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	3
advance	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2
emit	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	3
emitChar	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
error	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	2
currentTokenAsEnd	50.00% covered (danger)	50.00%	1 / 2	0.00% covered (danger)	0.00%	0 / 1	1.12
currentTokenAsTag	50.00% covered (danger)	50.00%	1 / 2	0.00% covered (danger)	0.00%	0 / 1	2.50
currentTokenAsComment	50.00% covered (danger)	50.00%	1 / 2	0.00% covered (danger)	0.00%	0 / 1	1.12
currentTokenAsDoctype	50.00% covered (danger)	50.00%	1 / 2	0.00% covered (danger)	0.00%	0 / 1	1.12
startNewAttribute	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
appendToCurrentAttributeName	50.00% covered (danger)	50.00%	1 / 2	0.00% covered (danger)	0.00%	0 / 1	1.12
appendToCurrentAttributeValue	50.00% covered (danger)	50.00%	1 / 2	0.00% covered (danger)	0.00%	0 / 1	1.12
dedupAttributes	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	3
isAppropriateEndTag	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	2
stateData	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	5
stateRcdata	81.25% covered (warning)	81.25%	13 / 16	0.00% covered (danger)	0.00%	0 / 1	5.16
stateRawtext	75.00% covered (warning)	75.00%	9 / 12	0.00% covered (danger)	0.00%	0 / 1	4.25
stateScriptData	100.00% covered (success)	100.00%	12 / 12	100.00% covered (success)	100.00%	1 / 1	4
stateScriptDataLessThanSign	100.00% covered (success)	100.00%	12 / 12	100.00% covered (success)	100.00%	1 / 1	3
stateScriptDataEndTagOpen	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	3
stateScriptDataEndTagName	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
stateScriptDataEscapeStart	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	2
stateScriptDataEscapeStartDash	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	2
stateScriptDataEscaped	100.00% covered (success)	100.00%	17 / 17	100.00% covered (success)	100.00%	1 / 1	5
stateScriptDataEscapedDash	100.00% covered (success)	100.00%	19 / 19	100.00% covered (success)	100.00%	1 / 1	5
stateScriptDataEscapedDashDash	100.00% covered (success)	100.00%	22 / 22	100.00% covered (success)	100.00%	1 / 1	6
stateScriptDataEscapedLessThanSign	100.00% covered (success)	100.00%	12 / 12	100.00% covered (success)	100.00%	1 / 1	4
stateScriptDataEscapedEndTagOpen	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	3
stateScriptDataEscapedEndTagName	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
stateScriptDataDoubleEscapeStart	100.00% covered (success)	100.00%	16 / 16	100.00% covered (success)	100.00%	1 / 1	12
stateScriptDataDoubleEscaped	100.00% covered (success)	100.00%	18 / 18	100.00% covered (success)	100.00%	1 / 1	5
stateScriptDataDoubleEscapedDash	100.00% covered (success)	100.00%	20 / 20	100.00% covered (success)	100.00%	1 / 1	5
stateScriptDataDoubleEscapedDashDash	100.00% covered (success)	100.00%	23 / 23	100.00% covered (success)	100.00%	1 / 1	6
stateScriptDataDoubleEscapedLessThanSign	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	2
stateScriptDataDoubleEscapeEnd	100.00% covered (success)	100.00%	16 / 16	100.00% covered (success)	100.00%	1 / 1	12
statePlaintext	0.00% covered (danger)	0.00%	0 / 9	0.00% covered (danger)	0.00%	0 / 1	12
stateTagOpen	100.00% covered (success)	100.00%	24 / 24	100.00% covered (success)	100.00%	1 / 1	7
stateEndTagOpen	100.00% covered (success)	100.00%	18 / 18	100.00% covered (success)	100.00%	1 / 1	5
stateTagName	87.50% covered (warning)	87.50%	21 / 24	0.00% covered (danger)	0.00%	0 / 1	11.24
stateRcdataLessThanSign	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	2
stateRcdataEndTagOpen	62.50% covered (warning)	62.50%	5 / 8	0.00% covered (danger)	0.00%	0 / 1	3.47
stateRcdataEndTagName	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
stateRawtextLessThanSign	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	2
stateRawtextEndTagOpen	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	3
stateRawtextEndTagName	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
endTagNameAlternativeReturn	100.00% covered (success)	100.00%	30 / 30	100.00% covered (success)	100.00%	1 / 1	14
emitFakeOpeningChars	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	2
stateBeforeAttributeName	62.50% covered (warning)	62.50%	10 / 16	0.00% covered (danger)	0.00%	0 / 1	13.27
stateAttributeName	84.21% covered (warning)	84.21%	16 / 19	0.00% covered (danger)	0.00%	0 / 1	14.77
stateAfterAttributeName	90.00% covered (success)	90.00%	18 / 20	0.00% covered (danger)	0.00%	0 / 1	9.08
stateBeforeAttributeValue	66.67% covered (warning)	66.67%	10 / 15	0.00% covered (danger)	0.00%	0 / 1	10.37
stateAttributeValueDoubleQuoted	82.35% covered (warning)	82.35%	14 / 17	0.00% covered (danger)	0.00%	0 / 1	5.14
stateAttributeValueSingleQuoted	64.71% covered (warning)	64.71%	11 / 17	0.00% covered (danger)	0.00%	0 / 1	6.10
stateAttributeValueUnquoted	73.91% covered (warning)	73.91%	17 / 23	0.00% covered (danger)	0.00%	0 / 1	17.48
stateAfterAttributeValueQuoted	64.71% covered (warning)	64.71%	11 / 17	0.00% covered (danger)	0.00%	0 / 1	10.81
stateSelfClosingStartTag	100.00% covered (success)	100.00%	15 / 15	100.00% covered (success)	100.00%	1 / 1	4
stateBogusComment	100.00% covered (success)	100.00%	15 / 15	100.00% covered (success)	100.00%	1 / 1	4
stateMarkupDeclarationOpen	100.00% covered (success)	100.00%	22 / 22	100.00% covered (success)	100.00%	1 / 1	5
stateCommentStart	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	3
stateCommentStartDash	75.00% covered (warning)	75.00%	12 / 16	0.00% covered (danger)	0.00%	0 / 1	4.25
stateComment	84.21% covered (warning)	84.21%	16 / 19	0.00% covered (danger)	0.00%	0 / 1	5.10
stateCommentLessThanSign	80.00% covered (warning)	80.00%	8 / 10	0.00% covered (danger)	0.00%	0 / 1	3.07
stateCommentLessThanSignBang	80.00% covered (warning)	80.00%	4 / 5	0.00% covered (danger)	0.00%	0 / 1	2.03
stateCommentLessThanSignBangDash	80.00% covered (warning)	80.00%	4 / 5	0.00% covered (danger)	0.00%	0 / 1	2.03
stateCommentLessThanSignBangDashDash	66.67% covered (warning)	66.67%	4 / 6	0.00% covered (danger)	0.00%	0 / 1	3.33
stateCdataSection	66.67% covered (warning)	66.67%	6 / 9	0.00% covered (danger)	0.00%	0 / 1	3.33
stateCdataSectionBracket	66.67% covered (warning)	66.67%	4 / 6	0.00% covered (danger)	0.00%	0 / 1	2.15
stateCdataSectionEnd	80.00% covered (warning)	80.00%	8 / 10	0.00% covered (danger)	0.00%	0 / 1	3.07
stateCommentEndDash	63.64% covered (warning)	63.64%	7 / 11	0.00% covered (danger)	0.00%	0 / 1	3.43
stateCommentEnd	100.00% covered (success)	100.00%	19 / 19	100.00% covered (success)	100.00%	1 / 1	5
stateCommentEndBang	61.11% covered (warning)	61.11%	11 / 18	0.00% covered (danger)	0.00%	0 / 1	4.94
stateDoctype	62.50% covered (warning)	62.50%	10 / 16	0.00% covered (danger)	0.00%	0 / 1	9.58
stateBeforeDoctypeName	67.86% covered (warning)	67.86%	19 / 28	0.00% covered (danger)	0.00%	0 / 1	13.32
stateDoctypeName	62.50% covered (warning)	62.50%	15 / 24	0.00% covered (danger)	0.00%	0 / 1	15.27
stateAfterDoctypeName	82.76% covered (warning)	82.76%	24 / 29	0.00% covered (danger)	0.00%	0 / 1	10.51
stateAfterDoctypePublicKeyword	83.33% covered (warning)	83.33%	25 / 30	0.00% covered (danger)	0.00%	0 / 1	9.37
stateBeforeDoctypePublicIdentifier	81.48% covered (warning)	81.48%	22 / 27	0.00% covered (danger)	0.00%	0 / 1	9.51
stateDoctypePublicIdentifierDoubleQuoted	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
stateDoctypePublicIdentifierSingleQuoted	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
stateAfterDoctypePublicIdentifier	82.14% covered (warning)	82.14%	23 / 28	0.00% covered (danger)	0.00%	0 / 1	9.46
stateBetweenDoctypePublicAndSystemIdentifiers	68.00% covered (warning)	68.00%	17 / 25	0.00% covered (danger)	0.00%	0 / 1	11.65
stateAfterDoctypeSystemKeyword	56.67% covered (warning)	56.67%	17 / 30	0.00% covered (danger)	0.00%	0 / 1	15.59
stateBeforeDoctypeSystemIdentifier	81.48% covered (warning)	81.48%	22 / 27	0.00% covered (danger)	0.00%	0 / 1	9.51
stateDoctypeSystemIdentifierDoubleQuoted	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
stateDoctypeSystemIdentifierSingleQuoted	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
stateAfterDoctypeSystemIdentifier	43.75% covered (danger)	43.75%	7 / 16	0.00% covered (danger)	0.00%	0 / 1	15.72
doctypeQuotedIdentifier	62.07% covered (warning)	62.07%	18 / 29	0.00% covered (danger)	0.00%	0 / 1	11.49
stateBogusDoctype	63.64% covered (warning)	63.64%	7 / 11	0.00% covered (danger)	0.00%	0 / 1	4.77
stateCharacterReference	100.00% covered (success)	100.00%	15 / 15	100.00% covered (success)	100.00%	1 / 1	4
stateNamedCharacterReference	100.00% covered (success)	100.00%	32 / 32	100.00% covered (success)	100.00%	1 / 1	15
stateAmbiguousAmpersand	100.00% covered (success)	100.00%	12 / 12	100.00% covered (success)	100.00%	1 / 1	7
stateNumericCharacterReference	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	3
stateHexadecimalCharacterReferenceStart	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	3
stateDecimalCharacterReferenceStart	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	3
stateHexadecimalCharacterReference	80.00% covered (warning)	80.00%	16 / 20	0.00% covered (danger)	0.00%	0 / 1	8.51
stateDecimalCharacterReference	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	4
stateNumericCharacterReferenceEnd	100.00% covered (success)	100.00%	16 / 16	100.00% covered (success)	100.00%	1 / 1	6
flushTempBufferToCharOrAttribute	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	5
finalizeAndEmitTag	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	3
isAsciiAlpha	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	4
isAsciiUpperAlpha	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	2
isAsciiLowerAlpha	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	2
isAsciiAlphanumeric	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	2
isAsciiHexDigit	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	5

1	<?php
2
3	declare(strict_types=1);
4
5	namespace Phpdftk\Html\Tokenizer;
6
7	/**
8	* WHATWG HTML §13.2.5 tokenizer.
9	*
10	* Phase 1B.2 + 1B.2-bis: all ~80 spec states implemented. Covers DOCTYPE
11	* (including PUBLIC/SYSTEM identifiers), tags with every attribute form,
12	* script-data with full escape/double-escape recovery, comments (including
13	* nested-comment recovery), CDATA sections (entered when {@see self::$inForeignContent}
14	* is true), and character references (numeric + named).
15	*
16	* Named character reference table (see {@see NamedCharacterReferences}) ships
17	* the high-frequency subset of the spec's ~2200 entries. Generation of the
18	* full table from the spec's `entities.json` is a separate deliverable
19	* tracked in the rendering roadmap.
20	*
21	* Input preprocessing per WHATWG §13.2.3.5: CR/CRLF normalised to LF before
22	* tokenizing. NULL handling is per-state (some emit U+FFFD, some emit raw,
23	* all with parse-error tracking).
24	*/
25	final class Tokenizer
26	{
27	public TokenizerState $state = TokenizerState::Data;
28	public ?string $lastStartTagName = null; // for appropriate-end-tag check in RCDATA/RAWTEXT
29
30	/**
31	* Set to true by tree construction when the "adjusted current node" is
32	* not in the HTML namespace (e.g. inside SVG or MathML). Affects the
33	* MarkupDeclarationOpen state's handling of `[CDATA[`: in foreign content
34	* we enter the CdataSection state; in HTML content it's a bogus comment.
35	*/
36	public bool $inForeignContent = false;
37
38	/** @var list<string> input as an array of UTF-8 single-codepoint strings */
39	private array $chars;
40	private int $length;
41	private int $pos = 0;
42	private bool $reconsume = false;
43	private string $currentChar = '';
44
45	private ?Token $currentToken = null;
46	private TokenizerState $returnState = TokenizerState::Data;
47	private string $tempBuffer = '';
48	private int $characterReferenceCode = 0;
49
50	/** @var list<Token> */
51	private array $emitted = [];
52	private int $emittedCursor = 0;
53	/** @var list<ParseError> */
54	private array $errors = [];
55	private bool $done = false;
56
/**
 * Normalise newlines in the input and split it into single code points.
 *
 * @param string $input raw HTML text, decoded as UTF-8 when split
 */
public function __construct(string $input)
{
    $text = $this->preprocess($input);

    $codepoints = [];
    if ($text !== '') {
        // Fall back to an empty list should the split yield a falsy value.
        $codepoints = mb_str_split($text, 1, 'UTF-8') ?: [];
    }

    $this->chars = $codepoints;
    $this->length = count($codepoints);
}
63
/**
 * Drive the state machine until the done flag is set (which emit() does
 * when it sees an EofToken), then hand back every token emitted so far,
 * in emission order. Callers that need to adjust the tokenizer state
 * between tokens should use {@see self::nextToken()} instead.
 *
 * @return list<Token>
 */
public function tokenize(): array
{
    while (true) {
        if ($this->done) {
            return $this->emitted;
        }
        $this->step();
    }
}
78
/**
 * Return the next not-yet-delivered token, stepping the state machine as
 * often as needed to produce one. Returns null once the machine is done
 * and every emitted token has already been handed out. Because
 * {@see self::$state} is public, a caller can switch states (e.g. to
 * RCDATA after `<title>`) between calls.
 */
public function nextToken(): ?Token
{
    while (true) {
        if ($this->emittedCursor < count($this->emitted)) {
            $token = $this->emitted[$this->emittedCursor];
            $this->emittedCursor++;
            return $token;
        }
        if ($this->done) {
            // Nothing buffered and nothing more will ever be emitted.
            return null;
        }
        $this->step();
    }
}
95
/**
 * Parse errors recorded so far, in the order they were raised.
 *
 * @return list<ParseError>
 */
public function errors(): array
{
    return $this->errors;
}
101
/**
 * Run exactly one state handler: the one matching the current
 * {@see self::$state}. Each arm maps a TokenizerState case to the
 * same-named `state*` method. The `match` has no default arm, so a state
 * without a handler raises \UnhandledMatchError instead of being ignored.
 */
private function step(): void
{
    match ($this->state) {
        TokenizerState::Data => $this->stateData(),
        TokenizerState::Rcdata => $this->stateRcdata(),
        TokenizerState::Rawtext => $this->stateRawtext(),
        TokenizerState::ScriptData => $this->stateScriptData(),
        TokenizerState::Plaintext => $this->statePlaintext(),
        TokenizerState::TagOpen => $this->stateTagOpen(),
        TokenizerState::EndTagOpen => $this->stateEndTagOpen(),
        TokenizerState::TagName => $this->stateTagName(),
        TokenizerState::RcdataLessThanSign => $this->stateRcdataLessThanSign(),
        TokenizerState::RcdataEndTagOpen => $this->stateRcdataEndTagOpen(),
        TokenizerState::RcdataEndTagName => $this->stateRcdataEndTagName(),
        TokenizerState::RawtextLessThanSign => $this->stateRawtextLessThanSign(),
        TokenizerState::RawtextEndTagOpen => $this->stateRawtextEndTagOpen(),
        TokenizerState::RawtextEndTagName => $this->stateRawtextEndTagName(),
        TokenizerState::ScriptDataLessThanSign => $this->stateScriptDataLessThanSign(),
        TokenizerState::ScriptDataEndTagOpen => $this->stateScriptDataEndTagOpen(),
        TokenizerState::ScriptDataEndTagName => $this->stateScriptDataEndTagName(),
        TokenizerState::ScriptDataEscapeStart => $this->stateScriptDataEscapeStart(),
        TokenizerState::ScriptDataEscapeStartDash => $this->stateScriptDataEscapeStartDash(),
        TokenizerState::ScriptDataEscaped => $this->stateScriptDataEscaped(),
        TokenizerState::ScriptDataEscapedDash => $this->stateScriptDataEscapedDash(),
        TokenizerState::ScriptDataEscapedDashDash => $this->stateScriptDataEscapedDashDash(),
        TokenizerState::ScriptDataEscapedLessThanSign => $this->stateScriptDataEscapedLessThanSign(),
        TokenizerState::ScriptDataEscapedEndTagOpen => $this->stateScriptDataEscapedEndTagOpen(),
        TokenizerState::ScriptDataEscapedEndTagName => $this->stateScriptDataEscapedEndTagName(),
        TokenizerState::ScriptDataDoubleEscapeStart => $this->stateScriptDataDoubleEscapeStart(),
        TokenizerState::ScriptDataDoubleEscaped => $this->stateScriptDataDoubleEscaped(),
        TokenizerState::ScriptDataDoubleEscapedDash => $this->stateScriptDataDoubleEscapedDash(),
        TokenizerState::ScriptDataDoubleEscapedDashDash => $this->stateScriptDataDoubleEscapedDashDash(),
        TokenizerState::ScriptDataDoubleEscapedLessThanSign => $this->stateScriptDataDoubleEscapedLessThanSign(),
        TokenizerState::ScriptDataDoubleEscapeEnd => $this->stateScriptDataDoubleEscapeEnd(),
        TokenizerState::BeforeAttributeName => $this->stateBeforeAttributeName(),
        TokenizerState::AttributeName => $this->stateAttributeName(),
        TokenizerState::AfterAttributeName => $this->stateAfterAttributeName(),
        TokenizerState::BeforeAttributeValue => $this->stateBeforeAttributeValue(),
        TokenizerState::AttributeValueDoubleQuoted => $this->stateAttributeValueDoubleQuoted(),
        TokenizerState::AttributeValueSingleQuoted => $this->stateAttributeValueSingleQuoted(),
        TokenizerState::AttributeValueUnquoted => $this->stateAttributeValueUnquoted(),
        TokenizerState::AfterAttributeValueQuoted => $this->stateAfterAttributeValueQuoted(),
        TokenizerState::SelfClosingStartTag => $this->stateSelfClosingStartTag(),
        TokenizerState::BogusComment => $this->stateBogusComment(),
        TokenizerState::MarkupDeclarationOpen => $this->stateMarkupDeclarationOpen(),
        TokenizerState::CommentStart => $this->stateCommentStart(),
        TokenizerState::CommentStartDash => $this->stateCommentStartDash(),
        TokenizerState::Comment => $this->stateComment(),
        TokenizerState::CommentLessThanSign => $this->stateCommentLessThanSign(),
        TokenizerState::CommentLessThanSignBang => $this->stateCommentLessThanSignBang(),
        TokenizerState::CommentLessThanSignBangDash => $this->stateCommentLessThanSignBangDash(),
        TokenizerState::CommentLessThanSignBangDashDash => $this->stateCommentLessThanSignBangDashDash(),
        TokenizerState::CommentEndDash => $this->stateCommentEndDash(),
        TokenizerState::CommentEnd => $this->stateCommentEnd(),
        TokenizerState::CommentEndBang => $this->stateCommentEndBang(),
        TokenizerState::Doctype => $this->stateDoctype(),
        TokenizerState::BeforeDoctypeName => $this->stateBeforeDoctypeName(),
        TokenizerState::DoctypeName => $this->stateDoctypeName(),
        TokenizerState::AfterDoctypeName => $this->stateAfterDoctypeName(),
        TokenizerState::AfterDoctypePublicKeyword => $this->stateAfterDoctypePublicKeyword(),
        TokenizerState::BeforeDoctypePublicIdentifier => $this->stateBeforeDoctypePublicIdentifier(),
        TokenizerState::DoctypePublicIdentifierDoubleQuoted => $this->stateDoctypePublicIdentifierDoubleQuoted(),
        TokenizerState::DoctypePublicIdentifierSingleQuoted => $this->stateDoctypePublicIdentifierSingleQuoted(),
        TokenizerState::AfterDoctypePublicIdentifier => $this->stateAfterDoctypePublicIdentifier(),
        TokenizerState::BetweenDoctypePublicAndSystemIdentifiers => $this->stateBetweenDoctypePublicAndSystemIdentifiers(),
        TokenizerState::AfterDoctypeSystemKeyword => $this->stateAfterDoctypeSystemKeyword(),
        TokenizerState::BeforeDoctypeSystemIdentifier => $this->stateBeforeDoctypeSystemIdentifier(),
        TokenizerState::DoctypeSystemIdentifierDoubleQuoted => $this->stateDoctypeSystemIdentifierDoubleQuoted(),
        TokenizerState::DoctypeSystemIdentifierSingleQuoted => $this->stateDoctypeSystemIdentifierSingleQuoted(),
        TokenizerState::AfterDoctypeSystemIdentifier => $this->stateAfterDoctypeSystemIdentifier(),
        TokenizerState::BogusDoctype => $this->stateBogusDoctype(),
        TokenizerState::CdataSection => $this->stateCdataSection(),
        TokenizerState::CdataSectionBracket => $this->stateCdataSectionBracket(),
        TokenizerState::CdataSectionEnd => $this->stateCdataSectionEnd(),
        TokenizerState::CharacterReference => $this->stateCharacterReference(),
        TokenizerState::NamedCharacterReference => $this->stateNamedCharacterReference(),
        TokenizerState::AmbiguousAmpersand => $this->stateAmbiguousAmpersand(),
        TokenizerState::NumericCharacterReference => $this->stateNumericCharacterReference(),
        TokenizerState::HexadecimalCharacterReferenceStart => $this->stateHexadecimalCharacterReferenceStart(),
        TokenizerState::DecimalCharacterReferenceStart => $this->stateDecimalCharacterReferenceStart(),
        TokenizerState::HexadecimalCharacterReference => $this->stateHexadecimalCharacterReference(),
        TokenizerState::DecimalCharacterReference => $this->stateDecimalCharacterReference(),
        TokenizerState::NumericCharacterReferenceEnd => $this->stateNumericCharacterReferenceEnd(),
    };
}
187
188	// ============================================================
189	// Input / output helpers
190	// ============================================================
191
/**
 * Newline normalisation per WHATWG §13.2.3.5: every CRLF pair and every
 * remaining lone CR becomes a single LF.
 */
private function preprocess(string $input): string
{
    // str_replace applies the search entries in order, so CRLF pairs are
    // collapsed before the lone-CR replacement runs.
    return str_replace(["\r\n", "\r"], "\n", $input);
}
199
/**
 * Fetch the next input character, or null at end of input.
 *
 * When a reconsume is pending, the previously consumed character is
 * returned again (null if that was end of input) and the flag is cleared;
 * the read position is left untouched in that case.
 */
private function consume(): ?string
{
    if (!$this->reconsume) {
        if ($this->pos < $this->length) {
            return $this->currentChar = $this->chars[$this->pos++];
        }
        // End of input: remember it as '' so a later reconsume yields null.
        $this->currentChar = '';
        return null;
    }

    $this->reconsume = false;
    return $this->currentChar !== '' ? $this->currentChar : null;
}
213
/**
 * Switch to $next and arrange for its first consume() call to see the
 * current input character again.
 */
private function reconsumeIn(TokenizerState $next): void
{
    $this->reconsume = true;
    $this->state = $next;
}
219
/**
 * Return up to $count upcoming characters as one string, without
 * consuming anything. Returns '' when no input remains.
 *
 * The window starts at the character the next consume() call would
 * return, which is the reconsumed character when a reconsume is pending.
 */
private function peekRemaining(int $count): string
{
    // A pending reconsume of a real character means `pos` already sits one
    // past it. When the reconsumed "character" is end of input
    // (currentChar === ''), consume() never advanced `pos`, so stepping
    // back would wrongly expose the last real character again.
    $rewind = $this->reconsume && $this->currentChar !== '';
    $start = $rewind ? $this->pos - 1 : $this->pos;
    if ($start >= $this->length) {
        return '';
    }
    return implode('', array_slice($this->chars, $start, $count));
}
228
/**
 * Skip $count characters, measured from the character the next consume()
 * call would have returned, and cancel any pending reconsume.
 */
private function advance(int $count): void
{
    // With a real character pending reconsumption, `pos` sits one past it,
    // so the effective start is pos - 1. If the pending "character" is end
    // of input (currentChar === ''), consume() did not move `pos` and no
    // step back is needed.
    $rewind = $this->reconsume && $this->currentChar !== '';
    $effectiveStart = $rewind ? $this->pos - 1 : $this->pos;
    $this->pos = $effectiveStart + $count;
    $this->reconsume = false;
}
238
/**
 * Append a token to the output queue.
 *
 * Start tags also update {@see self::$lastStartTagName}, which the
 * appropriate-end-tag check reads; an EofToken marks the tokenizer as
 * done.
 */
private function emit(Token $t): void
{
    $this->emitted[] = $t;

    if ($t instanceof StartTagToken) {
        $this->lastStartTagName = $t->tagName;
    }

    // The done flag is only ever raised here, never lowered.
    $this->done = $this->done || $t instanceof EofToken;
}
249
/** Wrap $data in a CharacterToken and emit it. */
private function emitChar(string $data): void
{
    $token = new CharacterToken($data);
    $this->emit($token);
}
254
/**
 * Record a parse error tagged with the current read offset.
 *
 * NOTE(review): the second ParseError argument is presumably an index into
 * the code-point list — confirm against the ParseError class.
 */
private function error(ParseErrorCode $code): void
{
    // While a real character is pending reconsumption, `pos` is one past
    // it. When the pending "character" is end of input (currentChar === ''),
    // consume() left `pos` alone, so no adjustment applies.
    $rewind = $this->reconsume && $this->currentChar !== '';
    $this->errors[] = new ParseError($code, $rewind ? $this->pos - 1 : $this->pos);
}
259
/**
 * Narrow the token under construction to EndTagToken. The assertion only
 * runs when assertions are enabled; the return type is enforced either way.
 */
private function currentTokenAsEnd(): EndTagToken
{
    $token = $this->currentToken;
    assert($token instanceof EndTagToken);
    return $token;
}
265
/**
 * Narrow the token under construction to a start or end tag token. The
 * assertion only runs when assertions are enabled; the return type is
 * enforced either way.
 */
private function currentTokenAsTag(): StartTagToken|EndTagToken
{
    assert($this->currentToken instanceof StartTagToken || $this->currentToken instanceof EndTagToken);
    return $this->currentToken;
}
271
/**
 * Narrow the token under construction to CommentToken. The assertion only
 * runs when assertions are enabled; the return type is enforced either way.
 */
private function currentTokenAsComment(): CommentToken
{
    $token = $this->currentToken;
    assert($token instanceof CommentToken);
    return $token;
}
277
/**
 * Narrow the token under construction to DoctypeToken. The assertion only
 * runs when assertions are enabled; the return type is enforced either way.
 */
private function currentTokenAsDoctype(): DoctypeToken
{
    $token = $this->currentToken;
    assert($token instanceof DoctypeToken);
    return $token;
}
283
/**
 * Append an attribute with empty name and value to $tag and make it the
 * tag's current attribute.
 */
private function startNewAttribute(StartTagToken|EndTagToken $tag): void
{
    $tag->attributes[] = ['name' => '', 'value' => ''];
    // Point at the entry just appended.
    $tag->currentAttribute = count($tag->attributes) - 1;
}
289
/**
 * Append $chars to the name of $tag's current attribute. An attribute must
 * already have been started on the tag (asserted when assertions are on).
 */
private function appendToCurrentAttributeName(StartTagToken|EndTagToken $tag, string $chars): void
{
    assert($tag->currentAttribute !== null);
    $tag->attributes[$tag->currentAttribute]['name'] .= $chars;
}
295
/**
 * Append $chars to the value of $tag's current attribute. An attribute
 * must already have been started on the tag (asserted when assertions are
 * on).
 */
private function appendToCurrentAttributeValue(StartTagToken|EndTagToken $tag, string $chars): void
{
    assert($tag->currentAttribute !== null);
    $tag->attributes[$tag->currentAttribute]['value'] .= $chars;
}
301
/**
 * Remove attributes whose name repeats an earlier attribute on the same
 * tag. The first occurrence is kept; each later duplicate is dropped and
 * records one parse error. Surviving attributes keep their relative order.
 *
 * NOTE(review): WHATWG calls this condition `duplicate-attribute`, but the
 * code reports UnexpectedCharacterInAttributeName. Confirm whether
 * ParseErrorCode has (or should gain) a dedicated case.
 */
private function dedupAttributes(StartTagToken|EndTagToken $tag): void
{
    $seen = [];
    $unique = [];
    foreach ($tag->attributes as $attr) {
        $name = $attr['name'];
        if (isset($seen[$name])) {
            $this->error(ParseErrorCode::UnexpectedCharacterInAttributeName);
            continue;
        }
        $seen[$name] = true;
        $unique[] = $attr;
    }
    $tag->attributes = $unique;
}
322
/**
 * True when $tag's name equals the name of the most recently emitted start
 * tag. Always false if no start tag name has been recorded.
 */
private function isAppropriateEndTag(EndTagToken $tag): bool
{
    $expected = $this->lastStartTagName;
    if ($expected === null) {
        return false;
    }
    return $tag->tagName === $expected;
}
327
328	// ============================================================
329	// 13.2.5.1 Data state
330	// ============================================================
/**
 * Data state: `&` starts a character reference that returns here, `<`
 * opens a tag, end of input emits EOF, and anything else is emitted as a
 * character token. A NUL is reported as a parse error but still emitted
 * unchanged.
 */
private function stateData(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->emit(new EofToken());
    } elseif ($char === '<') {
        $this->state = TokenizerState::TagOpen;
    } elseif ($char === '&') {
        $this->returnState = TokenizerState::Data;
        $this->state = TokenizerState::CharacterReference;
    } else {
        if ($char === "\u{0000}") {
            $this->error(ParseErrorCode::UnexpectedNullCharacter);
        }
        $this->emitChar($char);
    }
}
352
353	// ============================================================
354	// 13.2.5.2 RCDATA state
355	// ============================================================
/**
 * RCDATA state: `&` starts a character reference that returns here, `<`
 * moves to the RCDATA less-than-sign state, end of input emits EOF, NUL is
 * a parse error replaced by U+FFFD, and anything else is emitted as-is.
 */
private function stateRcdata(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->emit(new EofToken());
    } elseif ($char === '<') {
        $this->state = TokenizerState::RcdataLessThanSign;
    } elseif ($char === '&') {
        $this->returnState = TokenizerState::Rcdata;
        $this->state = TokenizerState::CharacterReference;
    } elseif ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $this->emitChar("\u{FFFD}");
    } else {
        $this->emitChar($char);
    }
}
379
380	// ============================================================
381	// 13.2.5.3 RAWTEXT state
382	// ============================================================
/**
 * RAWTEXT state: `<` moves to the RAWTEXT less-than-sign state, end of
 * input emits EOF, NUL is a parse error replaced by U+FFFD, and anything
 * else is emitted as-is.
 */
private function stateRawtext(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->emit(new EofToken());
    } elseif ($char === '<') {
        $this->state = TokenizerState::RawtextLessThanSign;
    } elseif ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $this->emitChar("\u{FFFD}");
    } else {
        $this->emitChar($char);
    }
}
401
402	// ============================================================
403	// 13.2.5.4 Script data state
404	// ============================================================
/**
 * Script data state: `<` moves to the script data less-than-sign state,
 * end of input emits EOF, NUL is a parse error replaced by U+FFFD, and
 * anything else is emitted as-is.
 */
private function stateScriptData(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->emit(new EofToken());
    } elseif ($char === '<') {
        $this->state = TokenizerState::ScriptDataLessThanSign;
    } elseif ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $this->emitChar("\u{FFFD}");
    } else {
        $this->emitChar($char);
    }
}
423
424	// ============================================================
425	// 13.2.5.15–17 Script data less-than / end-tag states
426	// ============================================================
/**
 * Script data less-than-sign state. `/` clears the temporary buffer and
 * heads for a possible end tag. `!` emits `<` and `!` and enters the
 * escape-start state. Anything else emits the pending `<` and reconsumes
 * the character in the script data state.
 */
private function stateScriptDataLessThanSign(): void
{
    $char = $this->consume();

    if ($char === '/') {
        $this->tempBuffer = '';
        $this->state = TokenizerState::ScriptDataEndTagOpen;
        return;
    }

    // Both remaining branches start by emitting the `<` seen earlier.
    $this->emitChar('<');

    if ($char === '!') {
        $this->emitChar('!');
        $this->state = TokenizerState::ScriptDataEscapeStart;
        return;
    }

    $this->reconsumeIn(TokenizerState::ScriptData);
}
444
/**
 * Script data end-tag-open state. An ASCII letter starts a fresh end tag
 * token and is reconsumed in the end-tag-name state. Anything else
 * (including end of input) emits the pending `</` as text and is
 * reconsumed in the script data state.
 */
private function stateScriptDataEndTagOpen(): void
{
    $char = $this->consume();
    $startsTagName = $char !== null && self::isAsciiAlpha($char);

    if (!$startsTagName) {
        $this->emitChar('<');
        $this->emitChar('/');
        $this->reconsumeIn(TokenizerState::ScriptData);
        return;
    }

    $this->currentToken = new EndTagToken();
    $this->reconsumeIn(TokenizerState::ScriptDataEndTagName);
}
457
/**
 * Script data end-tag-name state: delegates to the shared end-tag-name
 * routine, passing script data as the alternative state.
 */
private function stateScriptDataEndTagName(): void
{
    $fallback = TokenizerState::ScriptData;
    $this->endTagNameAlternativeReturn($fallback);
}
462
463	// ============================================================
464	// 13.2.5.18–19 Script data escape start states
465	// ============================================================
/**
 * Script data escape-start state: a `-` is emitted and moves on to the
 * escape-start-dash state; any other input is reconsumed in the script
 * data state.
 */
private function stateScriptDataEscapeStart(): void
{
    if ($this->consume() !== '-') {
        $this->reconsumeIn(TokenizerState::ScriptData);
        return;
    }

    $this->emitChar('-');
    $this->state = TokenizerState::ScriptDataEscapeStartDash;
}
476
/**
 * Script data escape-start-dash state: a second `-` is emitted and enters
 * the escaped dash-dash state; any other input is reconsumed in the script
 * data state.
 */
private function stateScriptDataEscapeStartDash(): void
{
    if ($this->consume() !== '-') {
        $this->reconsumeIn(TokenizerState::ScriptData);
        return;
    }

    $this->emitChar('-');
    $this->state = TokenizerState::ScriptDataEscapedDashDash;
}
487
488	// ============================================================
489	// 13.2.5.20–22 Script data escaped states
490	// ============================================================
/**
 * Script data escaped state. `-` is emitted and moves to the escaped-dash
 * state; `<` moves to the escaped less-than-sign state; NUL is a parse
 * error replaced by U+FFFD; end of input is a parse error followed by EOF;
 * anything else is emitted as-is.
 */
private function stateScriptDataEscaped(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInScriptHtmlCommentLikeText);
        $this->emit(new EofToken());
    } elseif ($char === '-') {
        $this->state = TokenizerState::ScriptDataEscapedDash;
        $this->emitChar('-');
    } elseif ($char === '<') {
        $this->state = TokenizerState::ScriptDataEscapedLessThanSign;
    } elseif ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $this->emitChar("\u{FFFD}");
    } else {
        $this->emitChar($char);
    }
}
515
/**
 * Script data escaped-dash state. A second `-` is emitted and moves to the
 * dash-dash state; `<` moves to the escaped less-than-sign state; end of
 * input is a parse error followed by EOF. Every other character returns
 * to the escaped state and is emitted, with NUL reported as a parse error
 * and replaced by U+FFFD.
 */
private function stateScriptDataEscapedDash(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInScriptHtmlCommentLikeText);
        $this->emit(new EofToken());
    } elseif ($char === '-') {
        $this->state = TokenizerState::ScriptDataEscapedDashDash;
        $this->emitChar('-');
    } elseif ($char === '<') {
        $this->state = TokenizerState::ScriptDataEscapedLessThanSign;
    } elseif ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $this->state = TokenizerState::ScriptDataEscaped;
        $this->emitChar("\u{FFFD}");
    } else {
        $this->state = TokenizerState::ScriptDataEscaped;
        $this->emitChar($char);
    }
}
542
/**
 * Script data escaped dash dash state.
 *
 * Further dashes are emitted without leaving this state; ">" is emitted
 * and returns to plain script data; "<" opens a possible tag; everything
 * else (with NUL replaced by U+FFFD) is emitted back in the escaped state.
 */
private function stateScriptDataEscapedDashDash(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInScriptHtmlCommentLikeText);
        $this->emit(new EofToken());
    } elseif ($char === '-') {
        $this->emitChar('-');
    } elseif ($char === '<') {
        $this->state = TokenizerState::ScriptDataEscapedLessThanSign;
    } elseif ($char === '>') {
        $this->state = TokenizerState::ScriptData;
        $this->emitChar('>');
    } else {
        if ($char === "\u{0000}") {
            $this->error(ParseErrorCode::UnexpectedNullCharacter);
            $char = "\u{FFFD}";
        }
        $this->state = TokenizerState::ScriptDataEscaped;
        $this->emitChar($char);
    }
}
573
574	// ============================================================
575	// 13.2.5.23–25 Script data escaped less-than / end-tag states
576	// ============================================================
/**
 * Script data escaped less-than sign state.
 *
 * "/" clears the temporary buffer and moves to the escaped end-tag-open
 * state. An ASCII letter clears the buffer, emits "<" and is reconsumed in
 * the double-escape-start state. Anything else emits "<" and is reconsumed
 * in the escaped state.
 */
private function stateScriptDataEscapedLessThanSign(): void
{
    $char = $this->consume();

    if ($char === '/') {
        $this->tempBuffer = '';
        $this->state = TokenizerState::ScriptDataEscapedEndTagOpen;

        return;
    }

    $startsName = $char !== null && self::isAsciiAlpha($char);
    if ($startsName) {
        $this->tempBuffer = '';
    }
    $this->emitChar('<');
    $this->reconsumeIn(
        $startsName
            ? TokenizerState::ScriptDataDoubleEscapeStart
            : TokenizerState::ScriptDataEscaped,
    );
}
594
/**
 * Script data escaped end tag open state.
 *
 * An ASCII letter starts a fresh end tag token and is reconsumed in the
 * escaped end-tag-name state; otherwise the pending "</" is emitted as
 * text and the input is reconsumed in the escaped state.
 */
private function stateScriptDataEscapedEndTagOpen(): void
{
    $char = $this->consume();

    if ($char === null || !self::isAsciiAlpha($char)) {
        $this->emitChar('<');
        $this->emitChar('/');
        $this->reconsumeIn(TokenizerState::ScriptDataEscaped);

        return;
    }

    $this->currentToken = new EndTagToken();
    $this->reconsumeIn(TokenizerState::ScriptDataEscapedEndTagName);
}
607
/**
 * Script data escaped end tag name state.
 *
 * Delegates to the shared end-tag-name routine, using the escaped script
 * data state as the state to fall back to.
 */
private function stateScriptDataEscapedEndTagName(): void
{
    $fallback = TokenizerState::ScriptDataEscaped;
    $this->endTagNameAlternativeReturn($fallback);
}
612
613	// ============================================================
614	// 13.2.5.26–31 Script data double escape states
615	// ============================================================
/**
 * Script data double escape start state.
 *
 * Letters are accumulated (lowercased) in the temporary buffer and emitted
 * unchanged. A whitespace, "/" or ">" delimiter is emitted and selects the
 * double-escaped state when the buffer spells "script", otherwise the
 * escaped state. Any other input is reconsumed in the escaped state.
 */
private function stateScriptDataDoubleEscapeStart(): void
{
    $char = $this->consume();

    if (in_array($char, ["\t", "\n", "\f", ' ', '/', '>'], true)) {
        $this->state = match ($this->tempBuffer) {
            'script' => TokenizerState::ScriptDataDoubleEscaped,
            default => TokenizerState::ScriptDataEscaped,
        };
        $this->emitChar($char);

        return;
    }

    if ($char !== null) {
        $buffered = null;
        if (self::isAsciiUpperAlpha($char)) {
            $buffered = strtolower($char);
        } elseif (self::isAsciiLowerAlpha($char)) {
            $buffered = $char;
        }

        if ($buffered !== null) {
            $this->tempBuffer .= $buffered;
            $this->emitChar($char);

            return;
        }
    }

    $this->reconsumeIn(TokenizerState::ScriptDataEscaped);
}
638
/**
 * Script data double escaped state.
 *
 * Every character is emitted; "-" and "<" additionally switch state, NUL
 * is replaced by U+FFFD with a parse error, and end of input is reported
 * as an error followed by an EOF token.
 */
private function stateScriptDataDoubleEscaped(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInScriptHtmlCommentLikeText);
        $this->emit(new EofToken());
    } elseif ($char === '-') {
        $this->state = TokenizerState::ScriptDataDoubleEscapedDash;
        $this->emitChar('-');
    } elseif ($char === '<') {
        $this->state = TokenizerState::ScriptDataDoubleEscapedLessThanSign;
        $this->emitChar('<');
    } else {
        if ($char === "\u{0000}") {
            $this->error(ParseErrorCode::UnexpectedNullCharacter);
            $char = "\u{FFFD}";
        }
        $this->emitChar($char);
    }
}
664
/**
 * Script data double escaped dash state.
 *
 * Entered after one "-" in double-escaped script data. "-" and "<" are
 * emitted and advance to their dedicated states; all other characters
 * (NUL replaced by U+FFFD) are emitted back in the double-escaped state.
 */
private function stateScriptDataDoubleEscapedDash(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInScriptHtmlCommentLikeText);
        $this->emit(new EofToken());

        return;
    }
    if ($char === '<') {
        $this->state = TokenizerState::ScriptDataDoubleEscapedLessThanSign;
        $this->emitChar('<');

        return;
    }
    if ($char === '-') {
        $this->state = TokenizerState::ScriptDataDoubleEscapedDashDash;
        $this->emitChar('-');

        return;
    }

    if ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $char = "\u{FFFD}";
    }
    $this->state = TokenizerState::ScriptDataDoubleEscaped;
    $this->emitChar($char);
}
692
/**
 * Script data double escaped dash dash state.
 *
 * Extra dashes are emitted in place; ">" is emitted and returns to plain
 * script data; "<" is emitted and opens a possible end tag; all other
 * characters (NUL replaced by U+FFFD) go back to the double-escaped state.
 */
private function stateScriptDataDoubleEscapedDashDash(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInScriptHtmlCommentLikeText);
        $this->emit(new EofToken());
    } elseif ($char === '-') {
        $this->emitChar('-');
    } elseif ($char === '<') {
        $this->state = TokenizerState::ScriptDataDoubleEscapedLessThanSign;
        $this->emitChar('<');
    } elseif ($char === '>') {
        $this->state = TokenizerState::ScriptData;
        $this->emitChar('>');
    } else {
        if ($char === "\u{0000}") {
            $this->error(ParseErrorCode::UnexpectedNullCharacter);
            $char = "\u{FFFD}";
        }
        $this->state = TokenizerState::ScriptDataDoubleEscaped;
        $this->emitChar($char);
    }
}
724
/**
 * Script data double escaped less-than sign state.
 *
 * "/" resets the temporary buffer, is emitted, and enters the
 * double-escape-end state; anything else is reconsumed in the
 * double-escaped state.
 */
private function stateScriptDataDoubleEscapedLessThanSign(): void
{
    $char = $this->consume();
    if ($char !== '/') {
        $this->reconsumeIn(TokenizerState::ScriptDataDoubleEscaped);

        return;
    }

    $this->tempBuffer = '';
    $this->state = TokenizerState::ScriptDataDoubleEscapeEnd;
    $this->emitChar('/');
}
736
/**
 * Script data double escape end state.
 *
 * Letters are accumulated (lowercased) in the temporary buffer and emitted
 * unchanged. A whitespace, "/" or ">" delimiter is emitted and drops back
 * to the escaped state when the buffer spells "script", otherwise stays
 * double-escaped. Any other input is reconsumed in the double-escaped state.
 */
private function stateScriptDataDoubleEscapeEnd(): void
{
    $char = $this->consume();

    if (in_array($char, ["\t", "\n", "\f", ' ', '/', '>'], true)) {
        $this->state = match ($this->tempBuffer) {
            'script' => TokenizerState::ScriptDataEscaped,
            default => TokenizerState::ScriptDataDoubleEscaped,
        };
        $this->emitChar($char);

        return;
    }

    if ($char !== null) {
        $buffered = null;
        if (self::isAsciiUpperAlpha($char)) {
            $buffered = strtolower($char);
        } elseif (self::isAsciiLowerAlpha($char)) {
            $buffered = $char;
        }

        if ($buffered !== null) {
            $this->tempBuffer .= $buffered;
            $this->emitChar($char);

            return;
        }
    }

    $this->reconsumeIn(TokenizerState::ScriptDataDoubleEscaped);
}
759
760	// ============================================================
761	// 13.2.5.5 PLAINTEXT state
762	// ============================================================
/**
 * PLAINTEXT state.
 *
 * Emits every character verbatim until end of input, except that NUL is
 * reported as a parse error and replaced with U+FFFD.
 */
private function statePlaintext(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->emit(new EofToken());
    } elseif ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $this->emitChar("\u{FFFD}");
    } else {
        $this->emitChar($char);
    }
}
777
778	// ============================================================
779	// 13.2.5.6 Tag open state
780	// ============================================================
/**
 * Tag open state (input just after "<").
 *
 * Dispatches on the next character: "!" → markup declaration, "/" → end
 * tag, ASCII letter → new start tag, "?" → bogus comment (with error).
 * End of input and any other character are errors that emit the "<" as
 * text.
 */
private function stateTagOpen(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofBeforeTagName);
        $this->emitChar('<');
        $this->emit(new EofToken());
    } elseif ($char === '!') {
        $this->state = TokenizerState::MarkupDeclarationOpen;
    } elseif ($char === '/') {
        $this->state = TokenizerState::EndTagOpen;
    } elseif ($char === '?') {
        $this->error(ParseErrorCode::UnexpectedQuestionMarkInsteadOfTagName);
        $this->currentToken = new CommentToken();
        $this->reconsumeIn(TokenizerState::BogusComment);
    } elseif (self::isAsciiAlpha($char)) {
        $this->currentToken = new StartTagToken();
        $this->reconsumeIn(TokenizerState::TagName);
    } else {
        // Not a valid tag start: treat the "<" as literal text.
        $this->error(ParseErrorCode::InvalidFirstCharacterOfTagName);
        $this->emitChar('<');
        $this->reconsumeIn(TokenizerState::Data);
    }
}
813
814	// ============================================================
815	// 13.2.5.7 End tag open state
816	// ============================================================
/**
 * End tag open state (input just after "</").
 *
 * An ASCII letter starts an end tag token. ">" is a missing-name error
 * that returns to the data state. End of input emits "</" as text
 * followed by EOF. Anything else is an error that begins a bogus comment.
 */
private function stateEndTagOpen(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofBeforeTagName);
        $this->emitChar('<');
        $this->emitChar('/');
        $this->emit(new EofToken());
    } elseif ($char === '>') {
        $this->error(ParseErrorCode::MissingEndTagName);
        $this->state = TokenizerState::Data;
    } elseif (self::isAsciiAlpha($char)) {
        $this->currentToken = new EndTagToken();
        $this->reconsumeIn(TokenizerState::TagName);
    } else {
        $this->error(ParseErrorCode::InvalidFirstCharacterOfTagName);
        $this->currentToken = new CommentToken();
        $this->reconsumeIn(TokenizerState::BogusComment);
    }
}
841
842	// ============================================================
843	// 13.2.5.8 Tag name state
844	// ============================================================
/**
 * Tag name state.
 *
 * Appends characters to the current tag's name (ASCII uppercase letters
 * lowercased, NUL replaced by U+FFFD with a parse error) until whitespace,
 * "/", ">" or end of input terminates the name.
 */
private function stateTagName(): void
{
    $char = $this->consume();
    $token = $this->currentTokenAsTag();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInTag);
        $this->emit(new EofToken());
    } elseif (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        $this->state = TokenizerState::BeforeAttributeName;
    } elseif ($char === '/') {
        $this->state = TokenizerState::SelfClosingStartTag;
    } elseif ($char === '>') {
        $this->finalizeAndEmitTag();
        $this->state = TokenizerState::Data;
    } elseif ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $token->tagName .= "\u{FFFD}";
    } else {
        $token->tagName .= self::isAsciiUpperAlpha($char) ? strtolower($char) : $char;
    }
}
878
879	// ============================================================
880	// 13.2.5.9–11 RCDATA less-than and end-tag states
881	// ============================================================
/**
 * RCDATA less-than sign state.
 *
 * "/" clears the temporary buffer and moves to the RCDATA end-tag-open
 * state; otherwise "<" is emitted as text and the input is reconsumed in
 * the RCDATA state.
 */
private function stateRcdataLessThanSign(): void
{
    $char = $this->consume();
    if ($char !== '/') {
        $this->emitChar('<');
        $this->reconsumeIn(TokenizerState::Rcdata);

        return;
    }

    $this->tempBuffer = '';
    $this->state = TokenizerState::RcdataEndTagOpen;
}
893
/**
 * RCDATA end tag open state.
 *
 * An ASCII letter starts a fresh end tag token and is reconsumed in the
 * RCDATA end-tag-name state; otherwise "</" is emitted as text and the
 * input is reconsumed in the RCDATA state.
 */
private function stateRcdataEndTagOpen(): void
{
    $char = $this->consume();

    if ($char === null || !self::isAsciiAlpha($char)) {
        $this->emitChar('<');
        $this->emitChar('/');
        $this->reconsumeIn(TokenizerState::Rcdata);

        return;
    }

    $this->currentToken = new EndTagToken();
    $this->reconsumeIn(TokenizerState::RcdataEndTagName);
}
906
/**
 * RCDATA end tag name state.
 *
 * Delegates to the shared end-tag-name routine, using the RCDATA state as
 * the state to fall back to.
 */
private function stateRcdataEndTagName(): void
{
    $fallback = TokenizerState::Rcdata;
    $this->endTagNameAlternativeReturn($fallback);
}
911
912	// ============================================================
913	// 13.2.5.12–14 RAWTEXT less-than and end-tag states
914	// ============================================================
/**
 * RAWTEXT less-than sign state.
 *
 * "/" clears the temporary buffer and moves to the RAWTEXT end-tag-open
 * state; otherwise "<" is emitted as text and the input is reconsumed in
 * the RAWTEXT state.
 */
private function stateRawtextLessThanSign(): void
{
    $char = $this->consume();
    if ($char !== '/') {
        $this->emitChar('<');
        $this->reconsumeIn(TokenizerState::Rawtext);

        return;
    }

    $this->tempBuffer = '';
    $this->state = TokenizerState::RawtextEndTagOpen;
}
926
/**
 * RAWTEXT end tag open state.
 *
 * An ASCII letter starts a fresh end tag token and is reconsumed in the
 * RAWTEXT end-tag-name state; otherwise "</" is emitted as text and the
 * input is reconsumed in the RAWTEXT state.
 */
private function stateRawtextEndTagOpen(): void
{
    $char = $this->consume();

    if ($char === null || !self::isAsciiAlpha($char)) {
        $this->emitChar('<');
        $this->emitChar('/');
        $this->reconsumeIn(TokenizerState::Rawtext);

        return;
    }

    $this->currentToken = new EndTagToken();
    $this->reconsumeIn(TokenizerState::RawtextEndTagName);
}
939
/**
 * RAWTEXT end tag name state.
 *
 * Delegates to the shared end-tag-name routine, using the RAWTEXT state as
 * the state to fall back to.
 */
private function stateRawtextEndTagName(): void
{
    $fallback = TokenizerState::Rawtext;
    $this->endTagNameAlternativeReturn($fallback);
}
944
/**
 * Common body of the RCDATA / RAWTEXT / script-data end-tag-name states.
 *
 * ASCII letters extend both the end tag's name (lowercased) and the
 * temporary buffer (original case). When a whitespace, "/" or ">"
 * delimiter follows and isAppropriateEndTag() accepts the token, the
 * tokenizer proceeds as for a regular tag. In every other situation the
 * buffered "</name" text is emitted as characters and the input is
 * reconsumed in $sourceState.
 *
 * @param TokenizerState $sourceState State to reconsume in when the
 *                                    candidate end tag is rejected.
 */
private function endTagNameAlternativeReturn(TokenizerState $sourceState): void
{
    $char = $this->consume();
    $endTag = $this->currentTokenAsEnd();

    if ($char !== null && self::isAsciiUpperAlpha($char)) {
        $endTag->tagName .= strtolower($char);
        $this->tempBuffer .= $char;

        return;
    }
    if ($char !== null && self::isAsciiLowerAlpha($char)) {
        $endTag->tagName .= $char;
        $this->tempBuffer .= $char;

        return;
    }

    $isDelimiter = in_array($char, ["\t", "\n", "\f", ' ', '/', '>'], true);
    if ($isDelimiter && $this->isAppropriateEndTag($endTag)) {
        if ($char === '/') {
            $this->state = TokenizerState::SelfClosingStartTag;
        } elseif ($char === '>') {
            $this->finalizeAndEmitTag();
            $this->state = TokenizerState::Data;
        } else {
            $this->state = TokenizerState::BeforeAttributeName;
        }

        return;
    }

    // Not a usable end tag: flush what was read as plain characters.
    $this->emitFakeOpeningChars($sourceState);
}
992
/**
 * Emits "</" followed by the temporary buffer (when non-empty) as
 * character data, then reconsumes the current input in $sourceState.
 *
 * @param TokenizerState $sourceState State in which to reconsume.
 */
private function emitFakeOpeningChars(TokenizerState $sourceState): void
{
    foreach (['<', '/'] as $literal) {
        $this->emitChar($literal);
    }

    $buffered = $this->tempBuffer;
    if ($buffered !== '') {
        $this->emitChar($buffered);
    }

    $this->reconsumeIn($sourceState);
}
1002
1003	// ============================================================
1004	// 13.2.5.32 Before attribute name state
1005	// ============================================================
/**
 * Before attribute name state.
 *
 * Skips whitespace. "/", ">" and end of input are reconsumed in the
 * after-attribute-name state. Any other character starts a new attribute;
 * a leading "=" is a parse error and becomes the first character of the
 * attribute's name.
 */
private function stateBeforeAttributeName(): void
{
    $char = $this->consume();

    if (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        return;
    }
    if ($char === null || $char === '/' || $char === '>') {
        $this->reconsumeIn(TokenizerState::AfterAttributeName);

        return;
    }

    $leadingEquals = $char === '=';
    if ($leadingEquals) {
        $this->error(ParseErrorCode::UnexpectedEqualsSignBeforeAttributeName);
    }

    $token = $this->currentTokenAsTag();
    $this->startNewAttribute($token);

    if ($leadingEquals) {
        $this->appendToCurrentAttributeName($token, '=');
        $this->state = TokenizerState::AttributeName;
    } else {
        $this->reconsumeIn(TokenizerState::AttributeName);
    }
}
1028
1029	// ============================================================
1030	// 13.2.5.33 Attribute name state
1031	// ============================================================
/**
 * Attribute name state.
 *
 * Appends characters to the current attribute's name (ASCII uppercase
 * lowercased, NUL replaced by U+FFFD). Quotes and "<" are recorded as
 * parse errors but still appended. Whitespace, "/", ">" and end of input
 * are reconsumed in the after-attribute-name state; "=" moves on to the
 * value.
 */
private function stateAttributeName(): void
{
    $char = $this->consume();

    if ($char === null || in_array($char, ["\t", "\n", "\f", ' ', '/', '>'], true)) {
        $this->reconsumeIn(TokenizerState::AfterAttributeName);

        return;
    }
    if ($char === '=') {
        $this->state = TokenizerState::BeforeAttributeValue;

        return;
    }

    $token = $this->currentTokenAsTag();

    if ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $this->appendToCurrentAttributeName($token, "\u{FFFD}");

        return;
    }
    if (self::isAsciiUpperAlpha($char)) {
        $this->appendToCurrentAttributeName($token, strtolower($char));

        return;
    }

    if (in_array($char, ['"', "'", '<'], true)) {
        $this->error(ParseErrorCode::UnexpectedCharacterInAttributeName);
    }
    $this->appendToCurrentAttributeName($token, $char);
}
1060
1061	// ============================================================
1062	// 13.2.5.34 After attribute name state
1063	// ============================================================
/**
 * After attribute name state.
 *
 * Skips whitespace, then either closes the tag ("/", ">"), moves to the
 * attribute value ("="), reports end of input, or starts another
 * attribute with the current character.
 */
private function stateAfterAttributeName(): void
{
    $char = $this->consume();

    if (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        return;
    }

    if ($char === null) {
        $this->error(ParseErrorCode::EofInTag);
        $this->emit(new EofToken());
    } elseif ($char === '/') {
        $this->state = TokenizerState::SelfClosingStartTag;
    } elseif ($char === '=') {
        $this->state = TokenizerState::BeforeAttributeValue;
    } elseif ($char === '>') {
        $this->finalizeAndEmitTag();
        $this->state = TokenizerState::Data;
    } else {
        $token = $this->currentTokenAsTag();
        $this->startNewAttribute($token);
        $this->reconsumeIn(TokenizerState::AttributeName);
    }
}
1092
1093	// ============================================================
1094	// 13.2.5.35 Before attribute value state
1095	// ============================================================
/**
 * Before attribute value state.
 *
 * Skips whitespace, selects the quoted state matching an opening quote,
 * treats ">" as a missing-value error that closes the tag, and reconsumes
 * anything else (including end of input) in the unquoted-value state.
 */
private function stateBeforeAttributeValue(): void
{
    $char = $this->consume();

    if (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        return;
    }

    if ($char === '"') {
        $this->state = TokenizerState::AttributeValueDoubleQuoted;
    } elseif ($char === "'") {
        $this->state = TokenizerState::AttributeValueSingleQuoted;
    } elseif ($char === '>') {
        $this->error(ParseErrorCode::MissingAttributeValue);
        $this->finalizeAndEmitTag();
        $this->state = TokenizerState::Data;
    } else {
        $this->reconsumeIn(TokenizerState::AttributeValueUnquoted);
    }
}
1118
1119	// ============================================================
1120	// 13.2.5.36 Attribute value (double-quoted) state
1121	// ============================================================
/**
 * Attribute value (double-quoted) state.
 *
 * Accumulates the value until the closing double quote. "&" hands over to
 * the character-reference state with this state recorded as the return
 * state; NUL becomes U+FFFD with a parse error; end of input is an error.
 */
private function stateAttributeValueDoubleQuoted(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInTag);
        $this->emit(new EofToken());

        return;
    }
    if ($char === '"') {
        $this->state = TokenizerState::AfterAttributeValueQuoted;

        return;
    }
    if ($char === '&') {
        $this->returnState = TokenizerState::AttributeValueDoubleQuoted;
        $this->state = TokenizerState::CharacterReference;

        return;
    }

    if ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $char = "\u{FFFD}";
    }
    $this->appendToCurrentAttributeValue($this->currentTokenAsTag(), $char);
}
1146
1147	// ============================================================
1148	// 13.2.5.37 Attribute value (single-quoted) state
1149	// ============================================================
/**
 * Attribute value (single-quoted) state.
 *
 * Accumulates the value until the closing single quote. "&" hands over to
 * the character-reference state with this state recorded as the return
 * state; NUL becomes U+FFFD with a parse error; end of input is an error.
 */
private function stateAttributeValueSingleQuoted(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInTag);
        $this->emit(new EofToken());

        return;
    }
    if ($char === "'") {
        $this->state = TokenizerState::AfterAttributeValueQuoted;

        return;
    }
    if ($char === '&') {
        $this->returnState = TokenizerState::AttributeValueSingleQuoted;
        $this->state = TokenizerState::CharacterReference;

        return;
    }

    if ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $char = "\u{FFFD}";
    }
    $this->appendToCurrentAttributeValue($this->currentTokenAsTag(), $char);
}
1174
1175	// ============================================================
1176	// 13.2.5.38 Attribute value (unquoted) state
1177	// ============================================================
/**
 * Attribute value (unquoted) state.
 *
 * Whitespace ends the value, ">" closes the tag, "&" hands over to the
 * character-reference state. NUL becomes U+FFFD with a parse error.
 * Quotes, "<", "=" and "`" are recorded as parse errors but still
 * appended to the value.
 */
private function stateAttributeValueUnquoted(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInTag);
        $this->emit(new EofToken());

        return;
    }
    if (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        $this->state = TokenizerState::BeforeAttributeName;

        return;
    }
    if ($char === '&') {
        $this->returnState = TokenizerState::AttributeValueUnquoted;
        $this->state = TokenizerState::CharacterReference;

        return;
    }
    if ($char === '>') {
        $this->finalizeAndEmitTag();
        $this->state = TokenizerState::Data;

        return;
    }

    if ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $char = "\u{FFFD}";
    } elseif (in_array($char, ['"', "'", '<', '=', '`'], true)) {
        $this->error(ParseErrorCode::UnexpectedCharacterInUnquotedAttributeValue);
    }
    $this->appendToCurrentAttributeValue($this->currentTokenAsTag(), $char);
}
1210
1211	// ============================================================
1212	// 13.2.5.39 After attribute value (quoted) state
1213	// ============================================================
/**
 * After attribute value (quoted) state.
 *
 * Expects whitespace, "/" or ">" after a closing quote. Any other
 * character is a missing-whitespace parse error and is reconsumed in the
 * before-attribute-name state; end of input is an error.
 */
private function stateAfterAttributeValueQuoted(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInTag);
        $this->emit(new EofToken());
    } elseif (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        $this->state = TokenizerState::BeforeAttributeName;
    } elseif ($char === '/') {
        $this->state = TokenizerState::SelfClosingStartTag;
    } elseif ($char === '>') {
        $this->finalizeAndEmitTag();
        $this->state = TokenizerState::Data;
    } else {
        $this->error(ParseErrorCode::MissingWhitespaceBetweenAttributes);
        $this->reconsumeIn(TokenizerState::BeforeAttributeName);
    }
}
1238
1239	// ============================================================
1240	// 13.2.5.40 Self-closing start tag state
1241	// ============================================================
/**
 * Self-closing start tag state (input just after "/" inside a tag).
 *
 * ">" marks the current tag self-closing and emits it, recording an error
 * when the tag is an end tag. End of input is an error. Any other
 * character is an unexpected-solidus error and is reconsumed in the
 * before-attribute-name state.
 */
private function stateSelfClosingStartTag(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInTag);
        $this->emit(new EofToken());

        return;
    }

    if ($char !== '>') {
        $this->error(ParseErrorCode::UnexpectedSolidusInTag);
        $this->reconsumeIn(TokenizerState::BeforeAttributeName);

        return;
    }

    $token = $this->currentTokenAsTag();
    $token->selfClosing = true;
    if ($token instanceof EndTagToken) {
        $this->error(ParseErrorCode::EndTagWithTrailingSolidus);
    }
    $this->finalizeAndEmitTag();
    $this->state = TokenizerState::Data;
}
1263
1264	// ============================================================
1265	// 13.2.5.41 Bogus comment state
1266	// ============================================================
/**
 * Bogus comment state.
 *
 * Collects everything up to the next ">" (or end of input) into the
 * current comment token, replacing NUL with U+FFFD and recording a parse
 * error for it.
 */
private function stateBogusComment(): void
{
    $char = $this->consume();
    $pending = $this->currentTokenAsComment();

    if ($char === null) {
        $this->emit($pending);
        $this->emit(new EofToken());

        return;
    }
    if ($char === '>') {
        $this->emit($pending);
        $this->state = TokenizerState::Data;

        return;
    }

    if ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $char = "\u{FFFD}";
    }
    $pending->append($char);
}
1288
1289	// ============================================================
1290	// 13.2.5.42 Markup declaration open state
1291	// ============================================================
/**
 * Markup declaration open state (input just after "<!").
 *
 * Looks ahead up to seven characters and recognises three openers:
 *   "--"                          → comment start
 *   "doctype" (case-insensitive)  → DOCTYPE
 *   "[CDATA[" (case-sensitive)    → CDATA section in foreign content,
 *                                   otherwise a bogus comment plus error
 * Anything else is an incorrectly-opened comment; nothing is consumed and
 * a bogus comment is started.
 */
private function stateMarkupDeclarationOpen(): void
{
    $lookahead = $this->peekRemaining(7);

    if (str_starts_with($lookahead, '--')) {
        $this->advance(2);
        $this->currentToken = new CommentToken();
        $this->state = TokenizerState::CommentStart;

        return;
    }

    if (strcasecmp(substr($lookahead, 0, 7), 'doctype') === 0) {
        $this->advance(7);
        $this->state = TokenizerState::Doctype;

        return;
    }

    if (str_starts_with($lookahead, '[CDATA[')) {
        $this->advance(7);
        if (!$this->inForeignContent) {
            // CDATA is only meaningful in foreign content; in HTML content
            // it degrades to a comment whose data starts with "[CDATA[".
            $this->error(ParseErrorCode::CdataInHtmlContent);
            $this->currentToken = new CommentToken('[CDATA[');
            $this->state = TokenizerState::BogusComment;

            return;
        }
        $this->state = TokenizerState::CdataSection;

        return;
    }

    $this->error(ParseErrorCode::IncorrectlyOpenedComment);
    $this->currentToken = new CommentToken();
    $this->state = TokenizerState::BogusComment;
}
1325
1326	// ============================================================
1327	// 13.2.5.43–45 Comment start, comment start dash, and comment states
1328	// ============================================================
/**
 * Comment start state.
 *
 * "-" moves to the comment-start-dash state; ">" closes an empty comment
 * abruptly (parse error) and emits it; anything else is reconsumed in the
 * comment state.
 */
private function stateCommentStart(): void
{
    $char = $this->consume();

    if ($char === '-') {
        $this->state = TokenizerState::CommentStartDash;
    } elseif ($char === '>') {
        $this->error(ParseErrorCode::AbruptClosingOfEmptyComment);
        $this->emit($this->currentTokenAsComment());
        $this->state = TokenizerState::Data;
    } else {
        $this->reconsumeIn(TokenizerState::Comment);
    }
}
1344
/**
 * Comment start dash state.
 *
 * A second "-" moves to the comment-end state. ">" and end of input emit
 * the comment with a parse error. Otherwise the pending "-" is appended
 * to the comment and the input is reconsumed in the comment state.
 */
private function stateCommentStartDash(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInComment);
        $this->emit($this->currentTokenAsComment());
        $this->emit(new EofToken());
    } elseif ($char === '-') {
        $this->state = TokenizerState::CommentEnd;
    } elseif ($char === '>') {
        $this->error(ParseErrorCode::AbruptClosingOfEmptyComment);
        $this->emit($this->currentTokenAsComment());
        $this->state = TokenizerState::Data;
    } else {
        $this->currentTokenAsComment()->append('-');
        $this->reconsumeIn(TokenizerState::Comment);
    }
}
1367
/**
 * Comment state.
 *
 * Appends characters to the current comment token. "<" is appended and
 * enters the less-than-sign sub-state; "-" may begin the terminator; NUL
 * becomes U+FFFD with a parse error; end of input emits the comment with
 * an error.
 */
private function stateComment(): void
{
    $char = $this->consume();
    $pending = $this->currentTokenAsComment();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInComment);
        $this->emit($pending);
        $this->emit(new EofToken());
    } elseif ($char === '-') {
        $this->state = TokenizerState::CommentEndDash;
    } elseif ($char === '<') {
        $pending->append('<');
        $this->state = TokenizerState::CommentLessThanSign;
    } else {
        if ($char === "\u{0000}") {
            $this->error(ParseErrorCode::UnexpectedNullCharacter);
            $char = "\u{FFFD}";
        }
        $pending->append($char);
    }
}
1394
1395	// ============================================================
1396	// 13.2.5.46–49 Comment less-than-sign / bang recovery states
1397	// ============================================================
/**
 * Comment less-than sign state.
 *
 * "!" is appended and enters the bang sub-state; further "<" characters
 * are appended in place; anything else is reconsumed in the comment state.
 */
private function stateCommentLessThanSign(): void
{
    $char = $this->consume();
    $pending = $this->currentTokenAsComment();

    if ($char === '<') {
        $pending->append('<');
    } elseif ($char === '!') {
        $pending->append('!');
        $this->state = TokenizerState::CommentLessThanSignBang;
    } else {
        $this->reconsumeIn(TokenizerState::Comment);
    }
}
1413
/**
 * Comment less-than sign bang state.
 *
 * "-" moves to the bang-dash state; anything else is reconsumed in the
 * comment state.
 */
private function stateCommentLessThanSignBang(): void
{
    $char = $this->consume();
    if ($char !== '-') {
        $this->reconsumeIn(TokenizerState::Comment);

        return;
    }

    $this->state = TokenizerState::CommentLessThanSignBangDash;
}
1423
/**
 * Comment less-than sign bang dash state.
 *
 * "-" moves to the bang-dash-dash state; anything else is reconsumed in
 * the comment-end-dash state.
 */
private function stateCommentLessThanSignBangDash(): void
{
    $char = $this->consume();
    if ($char !== '-') {
        $this->reconsumeIn(TokenizerState::CommentEndDash);

        return;
    }

    $this->state = TokenizerState::CommentLessThanSignBangDashDash;
}
1433
/**
 * Comment less-than sign bang dash dash state.
 *
 * The input is always reconsumed in the comment-end state; a nested
 * comment parse error is recorded first unless the character is ">" or
 * end of input.
 */
private function stateCommentLessThanSignBangDashDash(): void
{
    $char = $this->consume();

    $closesComment = $char === '>' || $char === null;
    if (!$closesComment) {
        $this->error(ParseErrorCode::NestedComment);
    }

    $this->reconsumeIn(TokenizerState::CommentEnd);
}
1444
1445	// ============================================================
1446	// 13.2.5.69–71 CDATA section states (only valid in foreign content)
1447	// ============================================================
/**
 * CDATA section state.
 *
 * Emits every character unchanged (NUL included, with no replacement)
 * until "]" possibly begins the terminator. End of input is a parse error
 * followed by an EOF token.
 */
private function stateCdataSection(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInCdata);
        $this->emit(new EofToken());
    } elseif ($char === ']') {
        $this->state = TokenizerState::CdataSectionBracket;
    } else {
        $this->emitChar($char);
    }
}
1463
/**
 * CDATA section bracket state (one "]" seen).
 *
 * A second "]" moves to the CDATA-section-end state; otherwise the held
 * "]" is emitted and the input is reconsumed in the CDATA section state.
 */
private function stateCdataSectionBracket(): void
{
    $char = $this->consume();
    if ($char !== ']') {
        $this->emitChar(']');
        $this->reconsumeIn(TokenizerState::CdataSection);

        return;
    }

    $this->state = TokenizerState::CdataSectionEnd;
}
1474
/**
 * CDATA section end state (two "]" seen).
 *
 * ">" ends the section and returns to the data state. Another "]" emits
 * one "]" and stays here. Anything else emits both held brackets and is
 * reconsumed in the CDATA section state.
 */
private function stateCdataSectionEnd(): void
{
    $char = $this->consume();

    if ($char === '>') {
        $this->state = TokenizerState::Data;
    } elseif ($char === ']') {
        $this->emitChar(']');
    } else {
        $this->emitChar(']');
        $this->emitChar(']');
        $this->reconsumeIn(TokenizerState::CdataSection);
    }
}
1490
/**
 * Comment end dash state (one "-" seen inside a comment).
 *
 * A second "-" moves to the comment-end state. End of input emits the
 * comment with a parse error. Otherwise the held "-" is appended to the
 * comment and the input is reconsumed in the comment state.
 */
private function stateCommentEndDash(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInComment);
        $this->emit($this->currentTokenAsComment());
        $this->emit(new EofToken());
    } elseif ($char === '-') {
        $this->state = TokenizerState::CommentEnd;
    } else {
        $this->currentTokenAsComment()->append('-');
        $this->reconsumeIn(TokenizerState::Comment);
    }
}
1507
/**
 * Comment end state ("--" seen inside a comment).
 *
 * ">" emits the comment. "!" moves to the comment-end-bang state. An
 * extra "-" is appended to the comment. End of input emits the comment
 * with a parse error. Anything else appends the held "--" and is
 * reconsumed in the comment state.
 */
private function stateCommentEnd(): void
{
    $char = $this->consume();
    $pending = $this->currentTokenAsComment();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInComment);
        $this->emit($pending);
        $this->emit(new EofToken());
    } elseif ($char === '>') {
        $this->emit($pending);
        $this->state = TokenizerState::Data;
    } elseif ($char === '!') {
        $this->state = TokenizerState::CommentEndBang;
    } elseif ($char === '-') {
        $pending->append('-');
    } else {
        $pending->append('--');
        $this->reconsumeIn(TokenizerState::Comment);
    }
}
1534
/**
 * Comment end bang state ("--!" seen inside a comment).
 *
 * "-" appends "--!" and moves to the comment-end-dash state. ">" emits
 * the comment with an incorrectly-closed error. End of input emits the
 * comment with an EOF error. Anything else appends "--!" and is
 * reconsumed in the comment state.
 */
private function stateCommentEndBang(): void
{
    $char = $this->consume();
    $pending = $this->currentTokenAsComment();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInComment);
        $this->emit($pending);
        $this->emit(new EofToken());
    } elseif ($char === '>') {
        $this->error(ParseErrorCode::IncorrectlyClosedComment);
        $this->emit($pending);
        $this->state = TokenizerState::Data;
    } elseif ($char === '-') {
        $pending->append('--!');
        $this->state = TokenizerState::CommentEndDash;
    } else {
        $pending->append('--!');
        $this->reconsumeIn(TokenizerState::Comment);
    }
}
1559
1560	// ============================================================
1561	// 13.2.5.53 DOCTYPE state (and friends)
1562	// ============================================================
/**
 * DOCTYPE state (input just after "<!DOCTYPE").
 *
 * Whitespace moves to the before-DOCTYPE-name state; ">" is reconsumed
 * there. End of input emits a force-quirks DOCTYPE token followed by EOF.
 * Any other character is a missing-whitespace parse error and is
 * reconsumed in the before-DOCTYPE-name state.
 */
private function stateDoctype(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInDoctype);
        $doctype = new DoctypeToken();
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->emit(new EofToken());
    } elseif (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        $this->state = TokenizerState::BeforeDoctypeName;
    } else {
        if ($char !== '>') {
            $this->error(ParseErrorCode::MissingWhitespaceBeforeDoctypeName);
        }
        $this->reconsumeIn(TokenizerState::BeforeDoctypeName);
    }
}
1585
/**
 * Before DOCTYPE name state.
 *
 * Skips whitespace. The first non-whitespace input creates the DOCTYPE
 * token and stores it as the current token; ">" and EOF emit it
 * immediately in force-quirks mode, anything else starts the name.
 */
private function stateBeforeDoctypeName(): void
{
    $char = $this->consume();
    if (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        return;
    }

    $doctype = new DoctypeToken();
    $this->currentToken = $doctype;

    if ($char === null) {
        $this->error(ParseErrorCode::EofInDoctype);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->emit(new EofToken());
        return;
    }

    if ($char === '>') {
        $this->error(ParseErrorCode::MissingDoctypeName);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->state = TokenizerState::Data;
        return;
    }

    // Every remaining character becomes the first character of the name.
    if ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $doctype->name = "\u{FFFD}";
    } elseif (self::isAsciiUpperAlpha($char)) {
        $doctype->name = strtolower($char);
    } else {
        $doctype->name = $char;
    }
    $this->state = TokenizerState::DoctypeName;
}
1622
/**
 * DOCTYPE name state.
 *
 * Accumulates the DOCTYPE name: ASCII upper-case letters are lower-cased,
 * U+0000 is replaced by U+FFFD (with a parse error), whitespace ends the
 * name, ">" emits the token, and EOF emits it in force-quirks mode.
 */
private function stateDoctypeName(): void
{
    $char = $this->consume();
    $doctype = $this->currentTokenAsDoctype();
    assert($doctype->name !== null);

    if ($char === null) {
        $this->error(ParseErrorCode::EofInDoctype);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->emit(new EofToken());
        return;
    }

    if ($char === '>') {
        $this->emit($doctype);
        $this->state = TokenizerState::Data;
        return;
    }

    if (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        $this->state = TokenizerState::AfterDoctypeName;
        return;
    }

    if ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $doctype->name .= "\u{FFFD}";
        return;
    }

    $doctype->name .= self::isAsciiUpperAlpha($char) ? strtolower($char) : $char;
}
1655
/**
 * After DOCTYPE name state.
 *
 * Skips whitespace, emits the token on ">" and emits it in force-quirks
 * mode on EOF. Any other character starts a six-character look-ahead for
 * the PUBLIC or SYSTEM keyword (ASCII case-insensitive); when neither
 * matches, the token is marked force-quirks and the character is
 * reprocessed in the bogus-DOCTYPE state.
 */
private function stateAfterDoctypeName(): void
{
    $c = $this->consume();
    $token = $this->currentTokenAsDoctype();
    if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
        return;
    }
    if ($c === '>') {
        $this->emit($token);
        $this->state = TokenizerState::Data;
        return;
    }
    if ($c === null) {
        $this->error(ParseErrorCode::EofInDoctype);
        $token->forceQuirks = true;
        $this->emit($token);
        $this->emit(new EofToken());
        return;
    }

    // The keyword window includes the character just consumed. pos already
    // points one past it, so the window always starts at pos - 1 (the
    // former ternary on $this->reconsume had two identical branches).
    $keywordStart = $this->pos - 1;
    $window = implode('', array_slice($this->chars, $keywordStart, 6));

    $keywordStates = [
        'PUBLIC' => TokenizerState::AfterDoctypePublicKeyword,
        'SYSTEM' => TokenizerState::AfterDoctypeSystemKeyword,
    ];
    foreach ($keywordStates as $keyword => $nextState) {
        if (strcasecmp($window, $keyword) === 0) {
            // Skip the whole keyword and continue right after it.
            $this->pos = $keywordStart + 6;
            $this->reconsume = false;
            $this->state = $nextState;
            return;
        }
    }

    $this->error(ParseErrorCode::InvalidCharacterSequenceAfterDoctypeName);
    $token->forceQuirks = true;
    $this->reconsumeIn(TokenizerState::BogusDoctype);
}
1695
1696	// ============================================================
1697	// 13.2.5.57–67 DOCTYPE PUBLIC / SYSTEM identifier states
1698	// ============================================================
/**
 * After DOCTYPE public keyword state.
 *
 * Whitespace leads to the before-public-identifier state. A quote
 * directly after the keyword starts the public identifier but is reported
 * as missing whitespace. ">", EOF and any other character put the token
 * into force-quirks mode.
 */
private function stateAfterDoctypePublicKeyword(): void
{
    $char = $this->consume();
    $doctype = $this->currentTokenAsDoctype();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInDoctype);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->emit(new EofToken());
        return;
    }

    if (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        $this->state = TokenizerState::BeforeDoctypePublicIdentifier;
        return;
    }

    if ($char === '"' || $char === "'") {
        $this->error(ParseErrorCode::MissingWhitespaceAfterDoctypePublicKeyword);
        $doctype->publicId = '';
        $this->state = $char === '"'
            ? TokenizerState::DoctypePublicIdentifierDoubleQuoted
            : TokenizerState::DoctypePublicIdentifierSingleQuoted;
        return;
    }

    if ($char === '>') {
        $this->error(ParseErrorCode::MissingDoctypePublicIdentifier);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->state = TokenizerState::Data;
        return;
    }

    $this->error(ParseErrorCode::MissingQuoteBeforeDoctypePublicIdentifier);
    $doctype->forceQuirks = true;
    $this->reconsumeIn(TokenizerState::BogusDoctype);
}
1737
/**
 * Before DOCTYPE public identifier state.
 *
 * Skips whitespace and starts an empty public identifier when a quote is
 * seen. ">", EOF and any other character put the token into force-quirks
 * mode.
 */
private function stateBeforeDoctypePublicIdentifier(): void
{
    $char = $this->consume();
    $doctype = $this->currentTokenAsDoctype();

    if (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        return;
    }

    if ($char === '"' || $char === "'") {
        $doctype->publicId = '';
        $this->state = $char === '"'
            ? TokenizerState::DoctypePublicIdentifierDoubleQuoted
            : TokenizerState::DoctypePublicIdentifierSingleQuoted;
        return;
    }

    if ($char === null) {
        $this->error(ParseErrorCode::EofInDoctype);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->emit(new EofToken());
        return;
    }

    if ($char === '>') {
        $this->error(ParseErrorCode::MissingDoctypePublicIdentifier);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->state = TokenizerState::Data;
        return;
    }

    $this->error(ParseErrorCode::MissingQuoteBeforeDoctypePublicIdentifier);
    $doctype->forceQuirks = true;
    $this->reconsumeIn(TokenizerState::BogusDoctype);
}
1773
/**
 * DOCTYPE public identifier (double-quoted) state; delegates to the
 * shared quoted-identifier handler with a '"' terminator.
 */
private function stateDoctypePublicIdentifierDoubleQuoted(): void
{
    $this->doctypeQuotedIdentifier(isPublic: true, terminator: '"');
}
1778
/**
 * DOCTYPE public identifier (single-quoted) state; delegates to the
 * shared quoted-identifier handler with a "'" terminator.
 */
private function stateDoctypePublicIdentifierSingleQuoted(): void
{
    $this->doctypeQuotedIdentifier(isPublic: true, terminator: "'");
}
1783
/**
 * After DOCTYPE public identifier state.
 *
 * Whitespace moves to the between-identifiers state and ">" emits the
 * token. A quote starts the system identifier but is reported as missing
 * whitespace. EOF and any other character put the token into
 * force-quirks mode.
 */
private function stateAfterDoctypePublicIdentifier(): void
{
    $char = $this->consume();
    $doctype = $this->currentTokenAsDoctype();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInDoctype);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->emit(new EofToken());
        return;
    }

    if ($char === '>') {
        $this->emit($doctype);
        $this->state = TokenizerState::Data;
        return;
    }

    if (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        $this->state = TokenizerState::BetweenDoctypePublicAndSystemIdentifiers;
        return;
    }

    if ($char === '"' || $char === "'") {
        $this->error(ParseErrorCode::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
        $doctype->systemId = '';
        $this->state = $char === '"'
            ? TokenizerState::DoctypeSystemIdentifierDoubleQuoted
            : TokenizerState::DoctypeSystemIdentifierSingleQuoted;
        return;
    }

    $this->error(ParseErrorCode::MissingQuoteBeforeDoctypeSystemIdentifier);
    $doctype->forceQuirks = true;
    $this->reconsumeIn(TokenizerState::BogusDoctype);
}
1820
/**
 * Between DOCTYPE public and system identifiers state.
 *
 * Skips whitespace, emits the token on ">" and starts an empty system
 * identifier on a quote. EOF and any other character put the token into
 * force-quirks mode.
 */
private function stateBetweenDoctypePublicAndSystemIdentifiers(): void
{
    $char = $this->consume();
    $doctype = $this->currentTokenAsDoctype();

    if (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        return;
    }

    if ($char === '"' || $char === "'") {
        $doctype->systemId = '';
        $this->state = $char === '"'
            ? TokenizerState::DoctypeSystemIdentifierDoubleQuoted
            : TokenizerState::DoctypeSystemIdentifierSingleQuoted;
        return;
    }

    if ($char === '>') {
        $this->emit($doctype);
        $this->state = TokenizerState::Data;
        return;
    }

    if ($char === null) {
        $this->error(ParseErrorCode::EofInDoctype);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->emit(new EofToken());
        return;
    }

    $this->error(ParseErrorCode::MissingQuoteBeforeDoctypeSystemIdentifier);
    $doctype->forceQuirks = true;
    $this->reconsumeIn(TokenizerState::BogusDoctype);
}
1854
/**
 * After DOCTYPE system keyword state.
 *
 * Whitespace leads to the before-system-identifier state. A quote
 * directly after the keyword starts the system identifier but is reported
 * as missing whitespace. ">", EOF and any other character put the token
 * into force-quirks mode.
 */
private function stateAfterDoctypeSystemKeyword(): void
{
    $char = $this->consume();
    $doctype = $this->currentTokenAsDoctype();

    if ($char === null) {
        $this->error(ParseErrorCode::EofInDoctype);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->emit(new EofToken());
        return;
    }

    if (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        $this->state = TokenizerState::BeforeDoctypeSystemIdentifier;
        return;
    }

    if ($char === '"' || $char === "'") {
        $this->error(ParseErrorCode::MissingWhitespaceAfterDoctypeSystemKeyword);
        $doctype->systemId = '';
        $this->state = $char === '"'
            ? TokenizerState::DoctypeSystemIdentifierDoubleQuoted
            : TokenizerState::DoctypeSystemIdentifierSingleQuoted;
        return;
    }

    if ($char === '>') {
        $this->error(ParseErrorCode::MissingDoctypeSystemIdentifier);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->state = TokenizerState::Data;
        return;
    }

    $this->error(ParseErrorCode::MissingQuoteBeforeDoctypeSystemIdentifier);
    $doctype->forceQuirks = true;
    $this->reconsumeIn(TokenizerState::BogusDoctype);
}
1893
/**
 * Before DOCTYPE system identifier state.
 *
 * Skips whitespace and starts an empty system identifier when a quote is
 * seen. ">", EOF and any other character put the token into force-quirks
 * mode.
 */
private function stateBeforeDoctypeSystemIdentifier(): void
{
    $char = $this->consume();
    $doctype = $this->currentTokenAsDoctype();

    if (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        return;
    }

    if ($char === '"' || $char === "'") {
        $doctype->systemId = '';
        $this->state = $char === '"'
            ? TokenizerState::DoctypeSystemIdentifierDoubleQuoted
            : TokenizerState::DoctypeSystemIdentifierSingleQuoted;
        return;
    }

    if ($char === null) {
        $this->error(ParseErrorCode::EofInDoctype);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->emit(new EofToken());
        return;
    }

    if ($char === '>') {
        $this->error(ParseErrorCode::MissingDoctypeSystemIdentifier);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->state = TokenizerState::Data;
        return;
    }

    $this->error(ParseErrorCode::MissingQuoteBeforeDoctypeSystemIdentifier);
    $doctype->forceQuirks = true;
    $this->reconsumeIn(TokenizerState::BogusDoctype);
}
1929
/**
 * DOCTYPE system identifier (double-quoted) state; delegates to the
 * shared quoted-identifier handler with a '"' terminator.
 */
private function stateDoctypeSystemIdentifierDoubleQuoted(): void
{
    $this->doctypeQuotedIdentifier(isPublic: false, terminator: '"');
}
1934
/**
 * DOCTYPE system identifier (single-quoted) state; delegates to the
 * shared quoted-identifier handler with a "'" terminator.
 */
private function stateDoctypeSystemIdentifierSingleQuoted(): void
{
    $this->doctypeQuotedIdentifier(isPublic: false, terminator: "'");
}
1939
/**
 * After DOCTYPE system identifier state.
 *
 * Skips whitespace, emits the token on ">" and emits it in force-quirks
 * mode on EOF. Any other character is an error and is reprocessed in the
 * bogus-DOCTYPE state.
 */
private function stateAfterDoctypeSystemIdentifier(): void
{
    $char = $this->consume();
    $doctype = $this->currentTokenAsDoctype();

    if (in_array($char, ["\t", "\n", "\f", ' '], true)) {
        return;
    }

    if ($char === null) {
        $this->error(ParseErrorCode::EofInDoctype);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->emit(new EofToken());
    } elseif ($char === '>') {
        $this->emit($doctype);
        $this->state = TokenizerState::Data;
    } else {
        // Per spec the force-quirks flag is deliberately left untouched:
        // the DOCTYPE itself was well-formed, only trailing garbage follows.
        $this->error(ParseErrorCode::UnexpectedCharacterAfterDoctypeSystemIdentifier);
        $this->reconsumeIn(TokenizerState::BogusDoctype);
    }
}
1964
/**
 * Common body of the four quoted DOCTYPE identifier states.
 *
 * @param bool   $isPublic   true to build the public identifier, false
 *                           for the system identifier
 * @param string $terminator the closing quote character (" or ')
 */
private function doctypeQuotedIdentifier(bool $isPublic, string $terminator): void
{
    $char = $this->consume();
    $doctype = $this->currentTokenAsDoctype();

    if ($char === $terminator) {
        $this->state = $isPublic
            ? TokenizerState::AfterDoctypePublicIdentifier
            : TokenizerState::AfterDoctypeSystemIdentifier;
        return;
    }

    if ($char === '>') {
        // The identifier was cut short by the end of the DOCTYPE.
        $this->error(
            $isPublic
                ? ParseErrorCode::AbruptDoctypePublicIdentifier
                : ParseErrorCode::AbruptDoctypeSystemIdentifier
        );
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->state = TokenizerState::Data;
        return;
    }

    if ($char === null) {
        $this->error(ParseErrorCode::EofInDoctype);
        $doctype->forceQuirks = true;
        $this->emit($doctype);
        $this->emit(new EofToken());
        return;
    }

    // Everything else is identifier data; U+0000 is swapped for U+FFFD.
    $piece = $char;
    if ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
        $piece = "\u{FFFD}";
    }

    if ($isPublic) {
        assert($doctype->publicId !== null);
        $doctype->publicId .= $piece;
    } else {
        assert($doctype->systemId !== null);
        $doctype->systemId .= $piece;
    }
}
2005
/**
 * Bogus DOCTYPE state.
 *
 * Discards input until ">" or EOF, at which point the current DOCTYPE
 * token is emitted. U+0000 is reported as a parse error; nothing is ever
 * appended to the token here.
 */
private function stateBogusDoctype(): void
{
    $char = $this->consume();

    if ($char === '>' || $char === null) {
        $this->emit($this->currentTokenAsDoctype());
        if ($char === null) {
            $this->emit(new EofToken());
        } else {
            $this->state = TokenizerState::Data;
        }
        return;
    }

    if ($char === "\u{0000}") {
        $this->error(ParseErrorCode::UnexpectedNullCharacter);
    }
    // Any other character is silently dropped.
}
2024
2025	// ============================================================
2026	// 13.2.5.72–80 Character reference states
2027	// ============================================================
/**
 * Character reference state.
 *
 * Seeds the temporary buffer with "&" and dispatches on the next
 * character: "#" starts a numeric reference, an ASCII alphanumeric is
 * reprocessed as a named reference, and anything else (including EOF)
 * flushes the bare "&" and reprocesses the input in the return state.
 */
private function stateCharacterReference(): void
{
    $this->tempBuffer = '&';
    $char = $this->consume();

    if ($char === '#') {
        $this->tempBuffer .= '#';
        $this->state = TokenizerState::NumericCharacterReference;
        return;
    }

    if ($char !== null && self::isAsciiAlphanumeric($char)) {
        $this->reconsumeIn(TokenizerState::NamedCharacterReference);
        return;
    }

    $this->flushTempBufferToCharOrAttribute();
    $this->reconsumeIn($this->returnState);
}
2049
/**
 * Named character reference state.
 *
 * Finds the longest name (up to 32 characters) starting at the current
 * input position that is present in NamedCharacterReferences::TABLE.
 * WHATWG matches against the full ~2200-entry table; the table used here
 * is the high-frequency subset (see NamedCharacterReferences).
 */
private function stateNamedCharacterReference(): void
{
    $start = $this->reconsume ? $this->pos - 1 : $this->pos;

    // Grow the candidate one character at a time, remembering the longest hit.
    $bestMatch = null;
    $bestLen = 0;
    $candidate = '';
    for ($len = 1; $len <= 32 && $start + $len <= $this->length; $len++) {
        $candidate .= $this->chars[$start + $len - 1];
        if (isset(NamedCharacterReferences::TABLE[$candidate])) {
            $bestMatch = $candidate;
            $bestLen = $len;
        }
    }

    if ($bestMatch === null) {
        // No match — flush "&" and let the ambiguous-ampersand state consume
        // the remaining ASCII alphanumerics + ";" without decoding.
        $this->flushTempBufferToCharOrAttribute();
        $this->state = TokenizerState::AmbiguousAmpersand;
        return;
    }

    $hasSemicolon = str_ends_with($bestMatch, ';');
    $followingIndex = $start + $bestLen;
    $nextChar = $followingIndex < $this->length ? $this->chars[$followingIndex] : null;
    $inAttribute = in_array(
        $this->returnState,
        [
            TokenizerState::AttributeValueDoubleQuoted,
            TokenizerState::AttributeValueSingleQuoted,
            TokenizerState::AttributeValueUnquoted,
        ],
        true,
    );

    // Legacy special case: inside an attribute value, a semicolon-less match
    // followed by "=" or an alphanumeric is kept verbatim (preserves old
    // URLs like ?foo=bar&copy=true).
    $keepLiteral = !$hasSemicolon
        && $inAttribute
        && $nextChar !== null
        && ($nextChar === '=' || self::isAsciiAlphanumeric($nextChar));

    if ($keepLiteral) {
        $this->tempBuffer = '&' . $bestMatch;
    } else {
        if (!$hasSemicolon) {
            $this->error(ParseErrorCode::MissingSemicolonAfterCharacterReference);
        }
        $this->tempBuffer = NamedCharacterReferences::TABLE[$bestMatch];
    }

    $this->advance($bestLen);
    $this->reconsume = false;
    $this->flushTempBufferToCharOrAttribute();
    $this->state = $this->returnState;
}
2103
/**
 * Ambiguous ampersand state.
 *
 * Passes ASCII alphanumerics through untouched (into the current
 * attribute value or as character tokens, depending on the return
 * state). The first other character ends the run and is reprocessed in
 * the return state; if it is ";" an unknown-named-character-reference
 * error is reported first.
 */
private function stateAmbiguousAmpersand(): void
{
    $char = $this->consume();

    if ($char === null || !self::isAsciiAlphanumeric($char)) {
        if ($char === ';') {
            $this->error(ParseErrorCode::UnknownNamedCharacterReference);
        }
        $this->reconsumeIn($this->returnState);
        return;
    }

    $attributeStates = [
        TokenizerState::AttributeValueDoubleQuoted,
        TokenizerState::AttributeValueSingleQuoted,
        TokenizerState::AttributeValueUnquoted,
    ];
    if (in_array($this->returnState, $attributeStates, true)) {
        $this->appendToCurrentAttributeValue($this->currentTokenAsTag(), $char);
    } else {
        $this->emitChar($char);
    }
}
2123
/**
 * Numeric character reference state.
 *
 * Resets the accumulated code to zero, then picks the hexadecimal branch
 * on "x"/"X" (recording the letter in the temporary buffer) or
 * reprocesses the character in the decimal-start state.
 */
private function stateNumericCharacterReference(): void
{
    $this->characterReferenceCode = 0;
    $char = $this->consume();

    if ($char !== 'x' && $char !== 'X') {
        $this->reconsumeIn(TokenizerState::DecimalCharacterReferenceStart);
        return;
    }

    $this->tempBuffer .= $char;
    $this->state = TokenizerState::HexadecimalCharacterReferenceStart;
}
2135
/**
 * Hexadecimal character reference start state.
 *
 * Requires at least one hex digit after "&#x". Without one, the buffered
 * text is flushed unchanged and the character is reprocessed in the
 * return state.
 */
private function stateHexadecimalCharacterReferenceStart(): void
{
    $char = $this->consume();

    if ($char === null || !self::isAsciiHexDigit($char)) {
        $this->error(ParseErrorCode::AbsenceOfDigitsInNumericCharacterReference);
        $this->flushTempBufferToCharOrAttribute();
        $this->reconsumeIn($this->returnState);
        return;
    }

    $this->reconsumeIn(TokenizerState::HexadecimalCharacterReference);
}
2147
/**
 * Decimal character reference start state.
 *
 * Requires at least one decimal digit after "&#". Without one, the
 * buffered text is flushed unchanged and the character is reprocessed in
 * the return state.
 */
private function stateDecimalCharacterReferenceStart(): void
{
    $char = $this->consume();

    if ($char === null || !ctype_digit($char)) {
        $this->error(ParseErrorCode::AbsenceOfDigitsInNumericCharacterReference);
        $this->flushTempBufferToCharOrAttribute();
        $this->reconsumeIn($this->returnState);
        return;
    }

    $this->reconsumeIn(TokenizerState::DecimalCharacterReference);
}
2159
/**
 * Hexadecimal character reference state.
 *
 * Folds each hex digit into $this->characterReferenceCode. ";" ends the
 * reference cleanly; EOF or any other character ends it with a
 * missing-semicolon error and is reprocessed in the
 * numeric-character-reference-end state.
 *
 * The accumulator saturates once it is above 0x10FFFF. The end state
 * treats every value above 0x10FFFF identically, so the result is the
 * same, but an arbitrarily long digit run in untrusted input can no
 * longer overflow the PHP integer into a float (which would be rejected
 * if the property is declared int — declaration not visible here).
 */
private function stateHexadecimalCharacterReference(): void
{
    $c = $this->consume();
    if ($c === null) {
        $this->error(ParseErrorCode::MissingSemicolonAfterCharacterReference);
        $this->state = TokenizerState::NumericCharacterReferenceEnd;
        $this->reconsume = true;
        return;
    }
    if ($c === ';') {
        $this->state = TokenizerState::NumericCharacterReferenceEnd;
        return;
    }

    // Numeric value of the digit, or null when $c is not a hex digit.
    $digit = match (true) {
        ctype_digit($c) => ord($c) - 0x30,
        $c >= 'A' && $c <= 'F' => ord($c) - 0x37,
        $c >= 'a' && $c <= 'f' => ord($c) - 0x57,
        default => null,
    };

    if ($digit === null) {
        $this->error(ParseErrorCode::MissingSemicolonAfterCharacterReference);
        $this->reconsumeIn(TokenizerState::NumericCharacterReferenceEnd);
        return;
    }

    // Keep consuming digits, but stop growing the value once it is out of
    // the Unicode range.
    if ($this->characterReferenceCode <= 0x10FFFF) {
        $this->characterReferenceCode = $this->characterReferenceCode * 16 + $digit;
    }
}
2188
/**
 * Decimal character reference state.
 *
 * Folds each decimal digit into $this->characterReferenceCode. ";" ends
 * the reference cleanly; EOF or any other character ends it with a
 * missing-semicolon error and is reprocessed in the
 * numeric-character-reference-end state.
 *
 * The accumulator saturates once it is above 0x10FFFF. The end state
 * treats every value above 0x10FFFF identically, so the result is the
 * same, but an arbitrarily long digit run in untrusted input can no
 * longer overflow the PHP integer into a float (which would be rejected
 * if the property is declared int — declaration not visible here).
 */
private function stateDecimalCharacterReference(): void
{
    $c = $this->consume();
    if ($c === null) {
        $this->error(ParseErrorCode::MissingSemicolonAfterCharacterReference);
        $this->state = TokenizerState::NumericCharacterReferenceEnd;
        $this->reconsume = true;
        return;
    }
    if (ctype_digit($c)) {
        // Keep consuming digits, but stop growing the value once it is out
        // of the Unicode range.
        if ($this->characterReferenceCode <= 0x10FFFF) {
            $this->characterReferenceCode = $this->characterReferenceCode * 10 + (ord($c) - 0x30);
        }
        return;
    }
    if ($c === ';') {
        $this->state = TokenizerState::NumericCharacterReferenceEnd;
        return;
    }
    $this->error(ParseErrorCode::MissingSemicolonAfterCharacterReference);
    $this->reconsumeIn(TokenizerState::NumericCharacterReferenceEnd);
}
2209
/**
 * Numeric character reference end state.
 *
 * Validates the accumulated code point, substitutes U+FFFD for null,
 * out-of-range and surrogate values, applies the replacement table, then
 * flushes the resulting character and returns to the return state.
 *
 * A control-character-reference error is reported for 0x0D, for C0
 * controls that are not ASCII whitespace, and for 0x7F–0x9F, as the
 * WHATWG algorithm requires, not only for codes that have an entry in
 * the replacement table. Codes in the table still raise the error and
 * are still replaced.
 *
 * NOTE(review): the spec's noncharacter-character-reference error is not
 * reported here.
 */
private function stateNumericCharacterReferenceEnd(): void
{
    $code = $this->characterReferenceCode;
    if ($code === 0) {
        $this->error(ParseErrorCode::NullCharacterReference);
        $code = 0xFFFD;
    } elseif ($code > 0x10FFFF) {
        $this->error(ParseErrorCode::CharacterReferenceOutsideUnicodeRange);
        $code = 0xFFFD;
    } elseif ($code >= 0xD800 && $code <= 0xDFFF) {
        $this->error(ParseErrorCode::SurrogateCharacterReference);
        $code = 0xFFFD;
    } else {
        $hasReplacement = isset(NamedCharacterReferences::NUMERIC_REPLACEMENTS[$code]);
        // C0 controls except TAB, LF and FF (CR is explicitly an error),
        // plus DEL and the C1 range.
        $isControl = ($code <= 0x1F && $code !== 0x09 && $code !== 0x0A && $code !== 0x0C)
            || ($code >= 0x7F && $code <= 0x9F);
        if ($isControl || $hasReplacement) {
            $this->error(ParseErrorCode::ControlCharacterReference);
            if ($hasReplacement) {
                $code = NamedCharacterReferences::NUMERIC_REPLACEMENTS[$code];
            }
        }
    }
    $this->tempBuffer = mb_chr($code, 'UTF-8');
    $this->flushTempBufferToCharOrAttribute();
    $this->state = $this->returnState;
}
2230
/**
 * Flushes the temporary buffer and clears it.
 *
 * When the return state is one of the three attribute-value states the
 * buffer is appended to the current attribute value; otherwise a
 * non-empty buffer is emitted through emitChar().
 */
private function flushTempBufferToCharOrAttribute(): void
{
    $buffered = $this->tempBuffer;
    $attributeStates = [
        TokenizerState::AttributeValueDoubleQuoted,
        TokenizerState::AttributeValueSingleQuoted,
        TokenizerState::AttributeValueUnquoted,
    ];

    if (in_array($this->returnState, $attributeStates, true)) {
        $this->appendToCurrentAttributeValue($this->currentTokenAsTag(), $buffered);
    } elseif ($buffered !== '') {
        $this->emitChar($buffered);
    }

    $this->tempBuffer = '';
}
2245
2246	// ============================================================
2247	// Tag finalisation
2248	// ============================================================
/**
 * Finalises the current tag token and emits it.
 *
 * Runs dedupAttributes() on the tag (presumably dropping duplicate
 * attributes — see that method), reports end-tag-with-attributes when an
 * end tag still carries attributes, then emits the token.
 */
private function finalizeAndEmitTag(): void
{
    $tagToken = $this->currentTokenAsTag();
    $this->dedupAttributes($tagToken);

    $isEndTagWithAttributes = $tagToken instanceof EndTagToken
        && count($tagToken->attributes) > 0;
    if ($isEndTagWithAttributes) {
        $this->error(ParseErrorCode::EndTagWithAttributes);
    }

    $this->emit($tagToken);
}
2258
2259	// ============================================================
2260	// Character classification helpers
2261	// ============================================================
/**
 * Whether $c sorts (byte-wise) within "a".."z" or "A".."Z".
 */
private static function isAsciiAlpha(string $c): bool
{
    $isLower = strcmp($c, 'a') >= 0 && strcmp($c, 'z') <= 0;
    $isUpper = strcmp($c, 'A') >= 0 && strcmp($c, 'Z') <= 0;

    return $isLower || $isUpper;
}
2266
/**
 * Whether $c sorts (byte-wise) within "A".."Z".
 */
private static function isAsciiUpperAlpha(string $c): bool
{
    return strcmp($c, 'A') >= 0 && strcmp($c, 'Z') <= 0;
}
2271
/**
 * Whether $c sorts (byte-wise) within "a".."z".
 */
private static function isAsciiLowerAlpha(string $c): bool
{
    return strcmp($c, 'a') >= 0 && strcmp($c, 'z') <= 0;
}
2276
/**
 * Whether $c is an ASCII letter (byte-wise range check) or passes
 * ctype_digit().
 */
private static function isAsciiAlphanumeric(string $c): bool
{
    if (strcmp($c, 'a') >= 0 && strcmp($c, 'z') <= 0) {
        return true;
    }
    if (strcmp($c, 'A') >= 0 && strcmp($c, 'Z') <= 0) {
        return true;
    }

    return ctype_digit($c);
}
2281
/**
 * Whether $c passes ctype_digit() or sorts (byte-wise) within "A".."F"
 * or "a".."f".
 */
private static function isAsciiHexDigit(string $c): bool
{
    if (ctype_digit($c)) {
        return true;
    }

    $isUpperHex = strcmp($c, 'A') >= 0 && strcmp($c, 'F') <= 0;
    $isLowerHex = strcmp($c, 'a') >= 0 && strcmp($c, 'f') <= 0;

    return $isUpperHex || $isLowerHex;
}
2286	}