Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
85.85% covered (warning)
85.85%
1171 / 1364
60.71% covered (warning)
60.71%
68 / 112
CRAP
0.00% covered (danger)
0.00%
0 / 1
Tokenizer
85.85% covered (warning)
85.85%
1171 / 1364
60.71% covered (warning)
60.71%
68 / 112
1563.12
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
3
 tokenize
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 nextToken
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
4
 errors
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 step
100.00% covered (success)
100.00%
81 / 81
100.00% covered (success)
100.00%
1 / 1
81
 preprocess
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 consume
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
4
 reconsumeIn
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 peekRemaining
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
3
 advance
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 emit
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
3
 emitChar
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 error
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
2
 currentTokenAsEnd
50.00% covered (danger)
50.00%
1 / 2
0.00% covered (danger)
0.00%
0 / 1
1.12
 currentTokenAsTag
50.00% covered (danger)
50.00%
1 / 2
0.00% covered (danger)
0.00%
0 / 1
2.50
 currentTokenAsComment
50.00% covered (danger)
50.00%
1 / 2
0.00% covered (danger)
0.00%
0 / 1
1.12
 currentTokenAsDoctype
50.00% covered (danger)
50.00%
1 / 2
0.00% covered (danger)
0.00%
0 / 1
1.12
 startNewAttribute
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 appendToCurrentAttributeName
50.00% covered (danger)
50.00%
1 / 2
0.00% covered (danger)
0.00%
0 / 1
1.12
 appendToCurrentAttributeValue
50.00% covered (danger)
50.00%
1 / 2
0.00% covered (danger)
0.00%
0 / 1
1.12
 dedupAttributes
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
3
 isAppropriateEndTag
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
2
 stateData
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
5
 stateRcdata
81.25% covered (warning)
81.25%
13 / 16
0.00% covered (danger)
0.00%
0 / 1
5.16
 stateRawtext
75.00% covered (warning)
75.00%
9 / 12
0.00% covered (danger)
0.00%
0 / 1
4.25
 stateScriptData
100.00% covered (success)
100.00%
12 / 12
100.00% covered (success)
100.00%
1 / 1
4
 stateScriptDataLessThanSign
100.00% covered (success)
100.00%
12 / 12
100.00% covered (success)
100.00%
1 / 1
3
 stateScriptDataEndTagOpen
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
3
 stateScriptDataEndTagName
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stateScriptDataEscapeStart
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 stateScriptDataEscapeStartDash
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 stateScriptDataEscaped
100.00% covered (success)
100.00%
17 / 17
100.00% covered (success)
100.00%
1 / 1
5
 stateScriptDataEscapedDash
100.00% covered (success)
100.00%
19 / 19
100.00% covered (success)
100.00%
1 / 1
5
 stateScriptDataEscapedDashDash
100.00% covered (success)
100.00%
22 / 22
100.00% covered (success)
100.00%
1 / 1
6
 stateScriptDataEscapedLessThanSign
100.00% covered (success)
100.00%
12 / 12
100.00% covered (success)
100.00%
1 / 1
4
 stateScriptDataEscapedEndTagOpen
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
3
 stateScriptDataEscapedEndTagName
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stateScriptDataDoubleEscapeStart
100.00% covered (success)
100.00%
16 / 16
100.00% covered (success)
100.00%
1 / 1
12
 stateScriptDataDoubleEscaped
100.00% covered (success)
100.00%
18 / 18
100.00% covered (success)
100.00%
1 / 1
5
 stateScriptDataDoubleEscapedDash
100.00% covered (success)
100.00%
20 / 20
100.00% covered (success)
100.00%
1 / 1
5
 stateScriptDataDoubleEscapedDashDash
100.00% covered (success)
100.00%
23 / 23
100.00% covered (success)
100.00%
1 / 1
6
 stateScriptDataDoubleEscapedLessThanSign
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
2
 stateScriptDataDoubleEscapeEnd
100.00% covered (success)
100.00%
16 / 16
100.00% covered (success)
100.00%
1 / 1
12
 statePlaintext
0.00% covered (danger)
0.00%
0 / 9
0.00% covered (danger)
0.00%
0 / 1
12
 stateTagOpen
100.00% covered (success)
100.00%
24 / 24
100.00% covered (success)
100.00%
1 / 1
7
 stateEndTagOpen
100.00% covered (success)
100.00%
18 / 18
100.00% covered (success)
100.00%
1 / 1
5
 stateTagName
87.50% covered (warning)
87.50%
21 / 24
0.00% covered (danger)
0.00%
0 / 1
11.24
 stateRcdataLessThanSign
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
2
 stateRcdataEndTagOpen
62.50% covered (warning)
62.50%
5 / 8
0.00% covered (danger)
0.00%
0 / 1
3.47
 stateRcdataEndTagName
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stateRawtextLessThanSign
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
2
 stateRawtextEndTagOpen
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
3
 stateRawtextEndTagName
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 endTagNameAlternativeReturn
100.00% covered (success)
100.00%
30 / 30
100.00% covered (success)
100.00%
1 / 1
14
 emitFakeOpeningChars
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 stateBeforeAttributeName
62.50% covered (warning)
62.50%
10 / 16
0.00% covered (danger)
0.00%
0 / 1
13.27
 stateAttributeName
84.21% covered (warning)
84.21%
16 / 19
0.00% covered (danger)
0.00%
0 / 1
14.77
 stateAfterAttributeName
90.00% covered (success)
90.00%
18 / 20
0.00% covered (danger)
0.00%
0 / 1
9.08
 stateBeforeAttributeValue
66.67% covered (warning)
66.67%
10 / 15
0.00% covered (danger)
0.00%
0 / 1
10.37
 stateAttributeValueDoubleQuoted
82.35% covered (warning)
82.35%
14 / 17
0.00% covered (danger)
0.00%
0 / 1
5.14
 stateAttributeValueSingleQuoted
64.71% covered (warning)
64.71%
11 / 17
0.00% covered (danger)
0.00%
0 / 1
6.10
 stateAttributeValueUnquoted
73.91% covered (warning)
73.91%
17 / 23
0.00% covered (danger)
0.00%
0 / 1
17.48
 stateAfterAttributeValueQuoted
64.71% covered (warning)
64.71%
11 / 17
0.00% covered (danger)
0.00%
0 / 1
10.81
 stateSelfClosingStartTag
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
4
 stateBogusComment
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
4
 stateMarkupDeclarationOpen
100.00% covered (success)
100.00%
22 / 22
100.00% covered (success)
100.00%
1 / 1
5
 stateCommentStart
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
3
 stateCommentStartDash
75.00% covered (warning)
75.00%
12 / 16
0.00% covered (danger)
0.00%
0 / 1
4.25
 stateComment
84.21% covered (warning)
84.21%
16 / 19
0.00% covered (danger)
0.00%
0 / 1
5.10
 stateCommentLessThanSign
80.00% covered (warning)
80.00%
8 / 10
0.00% covered (danger)
0.00%
0 / 1
3.07
 stateCommentLessThanSignBang
80.00% covered (warning)
80.00%
4 / 5
0.00% covered (danger)
0.00%
0 / 1
2.03
 stateCommentLessThanSignBangDash
80.00% covered (warning)
80.00%
4 / 5
0.00% covered (danger)
0.00%
0 / 1
2.03
 stateCommentLessThanSignBangDashDash
66.67% covered (warning)
66.67%
4 / 6
0.00% covered (danger)
0.00%
0 / 1
3.33
 stateCdataSection
66.67% covered (warning)
66.67%
6 / 9
0.00% covered (danger)
0.00%
0 / 1
3.33
 stateCdataSectionBracket
66.67% covered (warning)
66.67%
4 / 6
0.00% covered (danger)
0.00%
0 / 1
2.15
 stateCdataSectionEnd
80.00% covered (warning)
80.00%
8 / 10
0.00% covered (danger)
0.00%
0 / 1
3.07
 stateCommentEndDash
63.64% covered (warning)
63.64%
7 / 11
0.00% covered (danger)
0.00%
0 / 1
3.43
 stateCommentEnd
100.00% covered (success)
100.00%
19 / 19
100.00% covered (success)
100.00%
1 / 1
5
 stateCommentEndBang
61.11% covered (warning)
61.11%
11 / 18
0.00% covered (danger)
0.00%
0 / 1
4.94
 stateDoctype
62.50% covered (warning)
62.50%
10 / 16
0.00% covered (danger)
0.00%
0 / 1
9.58
 stateBeforeDoctypeName
67.86% covered (warning)
67.86%
19 / 28
0.00% covered (danger)
0.00%
0 / 1
13.32
 stateDoctypeName
62.50% covered (warning)
62.50%
15 / 24
0.00% covered (danger)
0.00%
0 / 1
15.27
 stateAfterDoctypeName
82.76% covered (warning)
82.76%
24 / 29
0.00% covered (danger)
0.00%
0 / 1
10.51
 stateAfterDoctypePublicKeyword
83.33% covered (warning)
83.33%
25 / 30
0.00% covered (danger)
0.00%
0 / 1
9.37
 stateBeforeDoctypePublicIdentifier
81.48% covered (warning)
81.48%
22 / 27
0.00% covered (danger)
0.00%
0 / 1
9.51
 stateDoctypePublicIdentifierDoubleQuoted
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stateDoctypePublicIdentifierSingleQuoted
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stateAfterDoctypePublicIdentifier
82.14% covered (warning)
82.14%
23 / 28
0.00% covered (danger)
0.00%
0 / 1
9.46
 stateBetweenDoctypePublicAndSystemIdentifiers
68.00% covered (warning)
68.00%
17 / 25
0.00% covered (danger)
0.00%
0 / 1
11.65
 stateAfterDoctypeSystemKeyword
56.67% covered (warning)
56.67%
17 / 30
0.00% covered (danger)
0.00%
0 / 1
15.59
 stateBeforeDoctypeSystemIdentifier
81.48% covered (warning)
81.48%
22 / 27
0.00% covered (danger)
0.00%
0 / 1
9.51
 stateDoctypeSystemIdentifierDoubleQuoted
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stateDoctypeSystemIdentifierSingleQuoted
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stateAfterDoctypeSystemIdentifier
43.75% covered (danger)
43.75%
7 / 16
0.00% covered (danger)
0.00%
0 / 1
15.72
 doctypeQuotedIdentifier
62.07% covered (warning)
62.07%
18 / 29
0.00% covered (danger)
0.00%
0 / 1
11.49
 stateBogusDoctype
63.64% covered (warning)
63.64%
7 / 11
0.00% covered (danger)
0.00%
0 / 1
4.77
 stateCharacterReference
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
4
 stateNamedCharacterReference
100.00% covered (success)
100.00%
32 / 32
100.00% covered (success)
100.00%
1 / 1
15
 stateAmbiguousAmpersand
100.00% covered (success)
100.00%
12 / 12
100.00% covered (success)
100.00%
1 / 1
7
 stateNumericCharacterReference
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
3
 stateHexadecimalCharacterReferenceStart
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
3
 stateDecimalCharacterReferenceStart
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
3
 stateHexadecimalCharacterReference
80.00% covered (warning)
80.00%
16 / 20
0.00% covered (danger)
0.00%
0 / 1
8.51
 stateDecimalCharacterReference
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
4
 stateNumericCharacterReferenceEnd
100.00% covered (success)
100.00%
16 / 16
100.00% covered (success)
100.00%
1 / 1
6
 flushTempBufferToCharOrAttribute
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
5
 finalizeAndEmitTag
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
3
 isAsciiAlpha
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
4
 isAsciiUpperAlpha
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
2
 isAsciiLowerAlpha
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
2
 isAsciiAlphanumeric
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
2
 isAsciiHexDigit
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
5
1<?php
2
3declare(strict_types=1);
4
5namespace Phpdftk\Html\Tokenizer;
6
7/**
8 * WHATWG HTML §13.2.5 tokenizer.
9 *
10 * Phase 1B.2 + 1B.2-bis: all ~80 spec states implemented. Covers DOCTYPE
11 * (including PUBLIC/SYSTEM identifiers), tags with every attribute form,
12 * script-data with full escape/double-escape recovery, comments (including
13 * nested-comment recovery), CDATA sections (entered when {@see self::$inForeignContent}
14 * is true), and character references (numeric + named).
15 *
16 * Named character reference table (see {@see NamedCharacterReferences}) ships
17 * the high-frequency subset of the spec's ~2200 entries. Generation of the
18 * full table from the spec's `entities.json` is a separate deliverable
19 * tracked in the rendering roadmap.
20 *
21 * Input preprocessing per WHATWG §13.2.3.5: CR/CRLF normalised to LF before
22 * tokenizing. NULL handling is per-state (some emit U+FFFD, some emit raw,
23 * all with parse-error tracking).
24 */
25final class Tokenizer
26{
27    public TokenizerState $state = TokenizerState::Data;
28    public ?string $lastStartTagName = null; // for appropriate-end-tag check in RCDATA/RAWTEXT
29
30    /**
31     * Set to true by tree construction when the "adjusted current node" is
32     * not in the HTML namespace (e.g. inside SVG or MathML). Affects the
33     * MarkupDeclarationOpen state's handling of `[CDATA[`: in foreign content
34     * we enter the CdataSection state; in HTML content it's a bogus comment.
35     */
36    public bool $inForeignContent = false;
37
38    /** @var list<string> input as an array of UTF-8 single-codepoint strings */
39    private array $chars;
40    private int $length;
41    private int $pos = 0;
42    private bool $reconsume = false;
43    private string $currentChar = '';
44
45    private ?Token $currentToken = null;
46    private TokenizerState $returnState = TokenizerState::Data;
47    private string $tempBuffer = '';
48    private int $characterReferenceCode = 0;
49
50    /** @var list<Token> */
51    private array $emitted = [];
52    private int $emittedCursor = 0;
53    /** @var list<ParseError> */
54    private array $errors = [];
55    private bool $done = false;
56
/**
 * Normalise the input's newlines and split it into single code points.
 *
 * @param string $input source text; it is split as UTF-8
 */
public function __construct(string $input)
{
    $text = $this->preprocess($input);

    $codePoints = [];
    if ($text !== '') {
        // `?: []` maps an empty/falsy split result onto an empty list.
        $codePoints = mb_str_split($text, 1, 'UTF-8') ?: [];
    }

    $this->chars = $codePoints;
    $this->length = count($codePoints);
}
63
/**
 * Drive the state machine until it reports completion, then hand back
 * every token emitted so far, in emission order. Completion is flagged
 * when an EofToken is emitted. Callers that need to change the tokenizer
 * state between tokens should use {@see self::nextToken()} instead.
 *
 * @return list<Token>
 */
public function tokenize(): array
{
    while (true) {
        if ($this->done) {
            return $this->emitted;
        }
        $this->step();
    }
}
78
/**
 * Return the next not-yet-delivered token, stepping the state machine as
 * often as needed to produce one. Because tokens are handed out one at a
 * time, a caller can change {@see self::$state} between calls (e.g. switch
 * to RCDATA after seeing `<title>`).
 *
 * @return Token|null null once every emitted token has been delivered and
 *                    the tokenizer is done
 */
public function nextToken(): ?Token
{
    for (;;) {
        if ($this->emittedCursor < count($this->emitted)) {
            return $this->emitted[$this->emittedCursor++];
        }
        if ($this->done) {
            return null;
        }
        $this->step();
    }
}
95
/**
 * Parse errors recorded so far, in the order they were reported.
 *
 * @return list<ParseError>
 */
public function errors(): array
{
    return $this->errors;
}
101
/**
 * Run exactly one state handler: the one registered for the current
 * {@see self::$state}.
 *
 * The `match` has no default arm, so a state without an entry here raises
 * \UnhandledMatchError instead of being silently skipped.
 */
private function step(): void
{
    match ($this->state) {
        TokenizerState::Data => $this->stateData(),
        TokenizerState::Rcdata => $this->stateRcdata(),
        TokenizerState::Rawtext => $this->stateRawtext(),
        TokenizerState::ScriptData => $this->stateScriptData(),
        TokenizerState::Plaintext => $this->statePlaintext(),
        TokenizerState::TagOpen => $this->stateTagOpen(),
        TokenizerState::EndTagOpen => $this->stateEndTagOpen(),
        TokenizerState::TagName => $this->stateTagName(),
        TokenizerState::RcdataLessThanSign => $this->stateRcdataLessThanSign(),
        TokenizerState::RcdataEndTagOpen => $this->stateRcdataEndTagOpen(),
        TokenizerState::RcdataEndTagName => $this->stateRcdataEndTagName(),
        TokenizerState::RawtextLessThanSign => $this->stateRawtextLessThanSign(),
        TokenizerState::RawtextEndTagOpen => $this->stateRawtextEndTagOpen(),
        TokenizerState::RawtextEndTagName => $this->stateRawtextEndTagName(),
        TokenizerState::ScriptDataLessThanSign => $this->stateScriptDataLessThanSign(),
        TokenizerState::ScriptDataEndTagOpen => $this->stateScriptDataEndTagOpen(),
        TokenizerState::ScriptDataEndTagName => $this->stateScriptDataEndTagName(),
        TokenizerState::ScriptDataEscapeStart => $this->stateScriptDataEscapeStart(),
        TokenizerState::ScriptDataEscapeStartDash => $this->stateScriptDataEscapeStartDash(),
        TokenizerState::ScriptDataEscaped => $this->stateScriptDataEscaped(),
        TokenizerState::ScriptDataEscapedDash => $this->stateScriptDataEscapedDash(),
        TokenizerState::ScriptDataEscapedDashDash => $this->stateScriptDataEscapedDashDash(),
        TokenizerState::ScriptDataEscapedLessThanSign => $this->stateScriptDataEscapedLessThanSign(),
        TokenizerState::ScriptDataEscapedEndTagOpen => $this->stateScriptDataEscapedEndTagOpen(),
        TokenizerState::ScriptDataEscapedEndTagName => $this->stateScriptDataEscapedEndTagName(),
        TokenizerState::ScriptDataDoubleEscapeStart => $this->stateScriptDataDoubleEscapeStart(),
        TokenizerState::ScriptDataDoubleEscaped => $this->stateScriptDataDoubleEscaped(),
        TokenizerState::ScriptDataDoubleEscapedDash => $this->stateScriptDataDoubleEscapedDash(),
        TokenizerState::ScriptDataDoubleEscapedDashDash => $this->stateScriptDataDoubleEscapedDashDash(),
        TokenizerState::ScriptDataDoubleEscapedLessThanSign => $this->stateScriptDataDoubleEscapedLessThanSign(),
        TokenizerState::ScriptDataDoubleEscapeEnd => $this->stateScriptDataDoubleEscapeEnd(),
        TokenizerState::BeforeAttributeName => $this->stateBeforeAttributeName(),
        TokenizerState::AttributeName => $this->stateAttributeName(),
        TokenizerState::AfterAttributeName => $this->stateAfterAttributeName(),
        TokenizerState::BeforeAttributeValue => $this->stateBeforeAttributeValue(),
        TokenizerState::AttributeValueDoubleQuoted => $this->stateAttributeValueDoubleQuoted(),
        TokenizerState::AttributeValueSingleQuoted => $this->stateAttributeValueSingleQuoted(),
        TokenizerState::AttributeValueUnquoted => $this->stateAttributeValueUnquoted(),
        TokenizerState::AfterAttributeValueQuoted => $this->stateAfterAttributeValueQuoted(),
        TokenizerState::SelfClosingStartTag => $this->stateSelfClosingStartTag(),
        TokenizerState::BogusComment => $this->stateBogusComment(),
        TokenizerState::MarkupDeclarationOpen => $this->stateMarkupDeclarationOpen(),
        TokenizerState::CommentStart => $this->stateCommentStart(),
        TokenizerState::CommentStartDash => $this->stateCommentStartDash(),
        TokenizerState::Comment => $this->stateComment(),
        TokenizerState::CommentLessThanSign => $this->stateCommentLessThanSign(),
        TokenizerState::CommentLessThanSignBang => $this->stateCommentLessThanSignBang(),
        TokenizerState::CommentLessThanSignBangDash => $this->stateCommentLessThanSignBangDash(),
        TokenizerState::CommentLessThanSignBangDashDash => $this->stateCommentLessThanSignBangDashDash(),
        TokenizerState::CommentEndDash => $this->stateCommentEndDash(),
        TokenizerState::CommentEnd => $this->stateCommentEnd(),
        TokenizerState::CommentEndBang => $this->stateCommentEndBang(),
        TokenizerState::Doctype => $this->stateDoctype(),
        TokenizerState::BeforeDoctypeName => $this->stateBeforeDoctypeName(),
        TokenizerState::DoctypeName => $this->stateDoctypeName(),
        TokenizerState::AfterDoctypeName => $this->stateAfterDoctypeName(),
        TokenizerState::AfterDoctypePublicKeyword => $this->stateAfterDoctypePublicKeyword(),
        TokenizerState::BeforeDoctypePublicIdentifier => $this->stateBeforeDoctypePublicIdentifier(),
        TokenizerState::DoctypePublicIdentifierDoubleQuoted => $this->stateDoctypePublicIdentifierDoubleQuoted(),
        TokenizerState::DoctypePublicIdentifierSingleQuoted => $this->stateDoctypePublicIdentifierSingleQuoted(),
        TokenizerState::AfterDoctypePublicIdentifier => $this->stateAfterDoctypePublicIdentifier(),
        TokenizerState::BetweenDoctypePublicAndSystemIdentifiers => $this->stateBetweenDoctypePublicAndSystemIdentifiers(),
        TokenizerState::AfterDoctypeSystemKeyword => $this->stateAfterDoctypeSystemKeyword(),
        TokenizerState::BeforeDoctypeSystemIdentifier => $this->stateBeforeDoctypeSystemIdentifier(),
        TokenizerState::DoctypeSystemIdentifierDoubleQuoted => $this->stateDoctypeSystemIdentifierDoubleQuoted(),
        TokenizerState::DoctypeSystemIdentifierSingleQuoted => $this->stateDoctypeSystemIdentifierSingleQuoted(),
        TokenizerState::AfterDoctypeSystemIdentifier => $this->stateAfterDoctypeSystemIdentifier(),
        TokenizerState::BogusDoctype => $this->stateBogusDoctype(),
        TokenizerState::CdataSection => $this->stateCdataSection(),
        TokenizerState::CdataSectionBracket => $this->stateCdataSectionBracket(),
        TokenizerState::CdataSectionEnd => $this->stateCdataSectionEnd(),
        TokenizerState::CharacterReference => $this->stateCharacterReference(),
        TokenizerState::NamedCharacterReference => $this->stateNamedCharacterReference(),
        TokenizerState::AmbiguousAmpersand => $this->stateAmbiguousAmpersand(),
        TokenizerState::NumericCharacterReference => $this->stateNumericCharacterReference(),
        TokenizerState::HexadecimalCharacterReferenceStart => $this->stateHexadecimalCharacterReferenceStart(),
        TokenizerState::DecimalCharacterReferenceStart => $this->stateDecimalCharacterReferenceStart(),
        TokenizerState::HexadecimalCharacterReference => $this->stateHexadecimalCharacterReference(),
        TokenizerState::DecimalCharacterReference => $this->stateDecimalCharacterReference(),
        TokenizerState::NumericCharacterReferenceEnd => $this->stateNumericCharacterReferenceEnd(),
    };
}
187
188    // ============================================================
189    // Input / output helpers
190    // ============================================================
191
/**
 * Normalise newlines: every CRLF pair and every remaining lone CR becomes
 * LF (input stream preprocessing, WHATWG §13.2.3.5).
 */
private function preprocess(string $input): string
{
    // The search list is applied in order, so "\r\n" pairs are collapsed
    // before the leftover lone "\r" characters are rewritten.
    return str_replace(["\r\n", "\r"], "\n", $input);
}
199
/**
 * Read the next input character and remember it as the current character.
 *
 * When a reconsume is pending, the flag is cleared and the remembered
 * character is returned again without moving the read position.
 *
 * @return string|null the character, or null at end of input
 */
private function consume(): ?string
{
    if ($this->reconsume) {
        $this->reconsume = false;
        // An empty remembered character is the end-of-input marker.
        return $this->currentChar !== '' ? $this->currentChar : null;
    }

    if ($this->pos < $this->length) {
        return $this->currentChar = $this->chars[$this->pos++];
    }

    // End of input: record the marker; the position is left where it is.
    $this->currentChar = '';
    return null;
}
213
/**
 * Switch to $next and flag the current character to be handed out again
 * by the following consume() call.
 */
private function reconsumeIn(TokenizerState $next): void
{
    $this->reconsume = true;
    $this->state = $next;
}
219
/**
 * Look ahead at up to $count characters without consuming them.
 *
 * The window starts at the character the next consume() call would return:
 * the reconsumed character when a reconsume is pending, otherwise the
 * character at the read position.
 *
 * @param int $count maximum number of characters to return
 * @return string the concatenated characters; '' when the window starts at
 *                or beyond the end of input
 */
private function peekRemaining(int $count): string
{
    // A pending reconsume normally means `pos` sits one past the character
    // to be re-read. consume() does NOT advance `pos` when it hits end of
    // input (it only sets currentChar to ''), so a reconsumed EOF must not
    // step back — otherwise the last, already-consumed character would be
    // returned as look-ahead.
    $rereadsRealChar = $this->reconsume && $this->currentChar !== '';
    $start = $rereadsRealChar ? $this->pos - 1 : $this->pos;
    if ($start >= $this->length) {
        return '';
    }
    return implode('', array_slice($this->chars, $start, $count));
}
228
/**
 * Skip $count characters, counted from the character the next consume()
 * call would have returned, and drop any pending reconsume.
 */
private function advance(int $count): void
{
    if ($this->reconsume) {
        // With a reconsume pending, `pos` is treated as one past the
        // character to re-read, so step back onto it before skipping.
        // NOTE(review): consume() leaves `pos` unchanged at end of input, so
        // this assumption looks off by one for a reconsumed EOF — confirm.
        $this->pos--;
        $this->reconsume = false;
    }
    $this->pos += $count;
}
238
/**
 * Append a token to the output queue.
 *
 * A start tag also updates {@see self::$lastStartTagName}, which
 * isAppropriateEndTag() compares against; an EofToken marks the tokenizer
 * as done.
 */
private function emit(Token $t): void
{
    $this->emitted[] = $t;

    if ($t instanceof StartTagToken) {
        $this->lastStartTagName = $t->tagName;
    }
    if ($t instanceof EofToken) {
        $this->done = true;
    }
}
249
/**
 * Emit $data wrapped in a CharacterToken.
 */
private function emitChar(string $data): void
{
    $token = new CharacterToken($data);
    $this->emit($token);
}
254
/**
 * Record a parse error together with an input position.
 *
 * The position is the read position, moved back by one while a reconsume
 * is pending.
 */
private function error(ParseErrorCode $code): void
{
    $position = $this->pos;
    if ($this->reconsume) {
        $position--;
    }
    $this->errors[] = new ParseError($code, $position);
}
259
/**
 * Return the token under construction, narrowed to EndTagToken.
 * The assert only fires when assertions are enabled.
 */
private function currentTokenAsEnd(): EndTagToken
{
    $token = $this->currentToken;
    assert($token instanceof EndTagToken);
    return $token;
}
265
/**
 * Return the token under construction, narrowed to a start or end tag.
 * The assert only fires when assertions are enabled.
 */
private function currentTokenAsTag(): StartTagToken|EndTagToken
{
    $token = $this->currentToken;
    assert($token instanceof StartTagToken || $token instanceof EndTagToken);
    return $token;
}
271
/**
 * Return the token under construction, narrowed to CommentToken.
 * The assert only fires when assertions are enabled.
 */
private function currentTokenAsComment(): CommentToken
{
    $token = $this->currentToken;
    assert($token instanceof CommentToken);
    return $token;
}
277
/**
 * Return the token under construction, narrowed to DoctypeToken.
 * The assert only fires when assertions are enabled.
 */
private function currentTokenAsDoctype(): DoctypeToken
{
    $token = $this->currentToken;
    assert($token instanceof DoctypeToken);
    return $token;
}
283
/**
 * Append an attribute with empty name and value to $tag and make it the
 * tag's current attribute.
 */
private function startNewAttribute(StartTagToken|EndTagToken $tag): void
{
    $tag->attributes[] = ['name' => '', 'value' => ''];

    $lastIndex = count($tag->attributes) - 1;
    $tag->currentAttribute = $lastIndex;
}
289
/**
 * Append $chars to the name of $tag's current attribute.
 * A current attribute must already have been started.
 */
private function appendToCurrentAttributeName(StartTagToken|EndTagToken $tag, string $chars): void
{
    $index = $tag->currentAttribute;
    assert($index !== null);
    $tag->attributes[$index]['name'] .= $chars;
}
295
/**
 * Append $chars to the value of $tag's current attribute.
 * A current attribute must already have been started.
 */
private function appendToCurrentAttributeValue(StartTagToken|EndTagToken $tag, string $chars): void
{
    $index = $tag->currentAttribute;
    assert($index !== null);
    $tag->attributes[$index]['value'] .= $chars;
}
301
/**
 * Drop attributes whose name already occurred earlier on the same tag,
 * keeping the first occurrence and the original order. Each dropped
 * attribute records one parse error. Called when the tag token is
 * finalised.
 *
 * NOTE(review): the WHATWG tokenizer names this condition
 * `duplicate-attribute`; the code reports UnexpectedCharacterInAttributeName
 * instead. Whether ParseErrorCode has a dedicated case is not visible here —
 * confirm before changing it.
 */
private function dedupAttributes(StartTagToken|EndTagToken $tag): void
{
    $firstByName = [];
    foreach ($tag->attributes as $attribute) {
        $name = $attribute['name'];
        if (array_key_exists($name, $firstByName)) {
            $this->error(ParseErrorCode::UnexpectedCharacterInAttributeName);
            continue;
        }
        $firstByName[$name] = $attribute;
    }
    // Insertion order is first-seen order; re-index to a plain list.
    $tag->attributes = array_values($firstByName);
}
322
/**
 * Whether $tag's name equals the name of the most recently emitted start
 * tag. Always false while no start tag has been emitted.
 */
private function isAppropriateEndTag(EndTagToken $tag): bool
{
    $expected = $this->lastStartTagName;
    if ($expected === null) {
        return false;
    }
    return $tag->tagName === $expected;
}
327
328    // ============================================================
329    // 13.2.5.1 Data state
330    // ============================================================
/**
 * Data state: `&` starts a character reference (returning to Data), `<`
 * opens a tag, end of input emits EOF, and every other character is
 * emitted as-is. A NUL is reported as a parse error but still emitted
 * unchanged.
 */
private function stateData(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->emit(new EofToken());
    } elseif ($char === '<') {
        $this->state = TokenizerState::TagOpen;
    } elseif ($char === '&') {
        $this->returnState = TokenizerState::Data;
        $this->state = TokenizerState::CharacterReference;
    } else {
        if ($char === "\u{0000}") {
            $this->error(ParseErrorCode::UnexpectedNullCharacter);
        }
        $this->emitChar($char);
    }
}
352
353    // ============================================================
354    // 13.2.5.2 RCDATA state
355    // ============================================================
/**
 * RCDATA state: `&` starts a character reference (returning to RCDATA),
 * `<` moves to the RCDATA less-than-sign state, end of input emits EOF.
 * A NUL is reported as a parse error and replaced by U+FFFD; any other
 * character is emitted as-is.
 */
private function stateRcdata(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->emit(new EofToken());
    } elseif ($char === '<') {
        $this->state = TokenizerState::RcdataLessThanSign;
    } elseif ($char === '&') {
        $this->returnState = TokenizerState::Rcdata;
        $this->state = TokenizerState::CharacterReference;
    } else {
        $isNull = $char === "\u{0000}";
        if ($isNull) {
            $this->error(ParseErrorCode::UnexpectedNullCharacter);
        }
        $this->emitChar($isNull ? "\u{FFFD}" : $char);
    }
}
379
380    // ============================================================
381    // 13.2.5.3 RAWTEXT state
382    // ============================================================
/**
 * RAWTEXT state: `<` moves to the RAWTEXT less-than-sign state and end of
 * input emits EOF. A NUL is reported as a parse error and replaced by
 * U+FFFD; any other character is emitted as-is.
 */
private function stateRawtext(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->emit(new EofToken());
    } elseif ($char === '<') {
        $this->state = TokenizerState::RawtextLessThanSign;
    } else {
        $isNull = $char === "\u{0000}";
        if ($isNull) {
            $this->error(ParseErrorCode::UnexpectedNullCharacter);
        }
        $this->emitChar($isNull ? "\u{FFFD}" : $char);
    }
}
401
402    // ============================================================
403    // 13.2.5.4 Script data state
404    // ============================================================
/**
 * Script data state: `<` moves to the script-data less-than-sign state and
 * end of input emits EOF. A NUL is reported as a parse error and replaced
 * by U+FFFD; any other character is emitted as-is.
 */
private function stateScriptData(): void
{
    $char = $this->consume();

    if ($char === null) {
        $this->emit(new EofToken());
    } elseif ($char === '<') {
        $this->state = TokenizerState::ScriptDataLessThanSign;
    } else {
        $isNull = $char === "\u{0000}";
        if ($isNull) {
            $this->error(ParseErrorCode::UnexpectedNullCharacter);
        }
        $this->emitChar($isNull ? "\u{FFFD}" : $char);
    }
}
423
424    // ============================================================
425    // 13.2.5.15–17 Script data less-than / end-tag states
426    // ============================================================
/**
 * Script data less-than-sign state, entered after `<` inside script data.
 * `/` clears the temporary buffer and looks for an end tag; `!` emits `<!`
 * and begins the escape-start sequence; anything else emits the `<`
 * literally and reconsumes the character in script data.
 */
private function stateScriptDataLessThanSign(): void
{
    $char = $this->consume();

    if ($char === '/') {
        $this->tempBuffer = '';
        $this->state = TokenizerState::ScriptDataEndTagOpen;
    } elseif ($char === '!') {
        $this->state = TokenizerState::ScriptDataEscapeStart;
        $this->emitChar('<');
        $this->emitChar('!');
    } else {
        $this->emitChar('<');
        $this->reconsumeIn(TokenizerState::ScriptData);
    }
}
444
445    private function stateScriptDataEndTagOpen(): void
446    {
447        $c = $this->consume();
448        if ($c !== null && self::isAsciiAlpha($c)) {
449            $this->currentToken = new EndTagToken();
450            $this->reconsumeIn(TokenizerState::ScriptDataEndTagName);
451            return;
452        }
453        $this->emitChar('<');
454        $this->emitChar('/');
455        $this->reconsumeIn(TokenizerState::ScriptData);
456    }
457
458    private function stateScriptDataEndTagName(): void
459    {
460        $this->endTagNameAlternativeReturn(TokenizerState::ScriptData);
461    }
462
463    // ============================================================
464    // 13.2.5.18–19 Script data escape start states
465    // ============================================================
466    private function stateScriptDataEscapeStart(): void
467    {
468        $c = $this->consume();
469        if ($c === '-') {
470            $this->state = TokenizerState::ScriptDataEscapeStartDash;
471            $this->emitChar('-');
472            return;
473        }
474        $this->reconsumeIn(TokenizerState::ScriptData);
475    }
476
477    private function stateScriptDataEscapeStartDash(): void
478    {
479        $c = $this->consume();
480        if ($c === '-') {
481            $this->state = TokenizerState::ScriptDataEscapedDashDash;
482            $this->emitChar('-');
483            return;
484        }
485        $this->reconsumeIn(TokenizerState::ScriptData);
486    }
487
488    // ============================================================
489    // 13.2.5.20–22 Script data escaped states
490    // ============================================================
491    private function stateScriptDataEscaped(): void
492    {
493        $c = $this->consume();
494        if ($c === '-') {
495            $this->state = TokenizerState::ScriptDataEscapedDash;
496            $this->emitChar('-');
497            return;
498        }
499        if ($c === '<') {
500            $this->state = TokenizerState::ScriptDataEscapedLessThanSign;
501            return;
502        }
503        if ($c === "\u{0000}") {
504            $this->error(ParseErrorCode::UnexpectedNullCharacter);
505            $this->emitChar("\u{FFFD}");
506            return;
507        }
508        if ($c === null) {
509            $this->error(ParseErrorCode::EofInScriptHtmlCommentLikeText);
510            $this->emit(new EofToken());
511            return;
512        }
513        $this->emitChar($c);
514    }
515
516    private function stateScriptDataEscapedDash(): void
517    {
518        $c = $this->consume();
519        if ($c === '-') {
520            $this->state = TokenizerState::ScriptDataEscapedDashDash;
521            $this->emitChar('-');
522            return;
523        }
524        if ($c === '<') {
525            $this->state = TokenizerState::ScriptDataEscapedLessThanSign;
526            return;
527        }
528        if ($c === "\u{0000}") {
529            $this->error(ParseErrorCode::UnexpectedNullCharacter);
530            $this->state = TokenizerState::ScriptDataEscaped;
531            $this->emitChar("\u{FFFD}");
532            return;
533        }
534        if ($c === null) {
535            $this->error(ParseErrorCode::EofInScriptHtmlCommentLikeText);
536            $this->emit(new EofToken());
537            return;
538        }
539        $this->state = TokenizerState::ScriptDataEscaped;
540        $this->emitChar($c);
541    }
542
543    private function stateScriptDataEscapedDashDash(): void
544    {
545        $c = $this->consume();
546        if ($c === '-') {
547            $this->emitChar('-');
548            return;
549        }
550        if ($c === '<') {
551            $this->state = TokenizerState::ScriptDataEscapedLessThanSign;
552            return;
553        }
554        if ($c === '>') {
555            $this->state = TokenizerState::ScriptData;
556            $this->emitChar('>');
557            return;
558        }
559        if ($c === "\u{0000}") {
560            $this->error(ParseErrorCode::UnexpectedNullCharacter);
561            $this->state = TokenizerState::ScriptDataEscaped;
562            $this->emitChar("\u{FFFD}");
563            return;
564        }
565        if ($c === null) {
566            $this->error(ParseErrorCode::EofInScriptHtmlCommentLikeText);
567            $this->emit(new EofToken());
568            return;
569        }
570        $this->state = TokenizerState::ScriptDataEscaped;
571        $this->emitChar($c);
572    }
573
574    // ============================================================
575    // 13.2.5.23–25 Script data escaped less-than / end-tag states
576    // ============================================================
577    private function stateScriptDataEscapedLessThanSign(): void
578    {
579        $c = $this->consume();
580        if ($c === '/') {
581            $this->tempBuffer = '';
582            $this->state = TokenizerState::ScriptDataEscapedEndTagOpen;
583            return;
584        }
585        if ($c !== null && self::isAsciiAlpha($c)) {
586            $this->tempBuffer = '';
587            $this->emitChar('<');
588            $this->reconsumeIn(TokenizerState::ScriptDataDoubleEscapeStart);
589            return;
590        }
591        $this->emitChar('<');
592        $this->reconsumeIn(TokenizerState::ScriptDataEscaped);
593    }
594
595    private function stateScriptDataEscapedEndTagOpen(): void
596    {
597        $c = $this->consume();
598        if ($c !== null && self::isAsciiAlpha($c)) {
599            $this->currentToken = new EndTagToken();
600            $this->reconsumeIn(TokenizerState::ScriptDataEscapedEndTagName);
601            return;
602        }
603        $this->emitChar('<');
604        $this->emitChar('/');
605        $this->reconsumeIn(TokenizerState::ScriptDataEscaped);
606    }
607
608    private function stateScriptDataEscapedEndTagName(): void
609    {
610        $this->endTagNameAlternativeReturn(TokenizerState::ScriptDataEscaped);
611    }
612
613    // ============================================================
614    // 13.2.5.26–31 Script data double escape states
615    // ============================================================
616    private function stateScriptDataDoubleEscapeStart(): void
617    {
618        $c = $this->consume();
619        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ' || $c === '/' || $c === '>') {
620            $this->state = $this->tempBuffer === 'script'
621                ? TokenizerState::ScriptDataDoubleEscaped
622                : TokenizerState::ScriptDataEscaped;
623            $this->emitChar($c);
624            return;
625        }
626        if ($c !== null && self::isAsciiUpperAlpha($c)) {
627            $this->tempBuffer .= strtolower($c);
628            $this->emitChar($c);
629            return;
630        }
631        if ($c !== null && self::isAsciiLowerAlpha($c)) {
632            $this->tempBuffer .= $c;
633            $this->emitChar($c);
634            return;
635        }
636        $this->reconsumeIn(TokenizerState::ScriptDataEscaped);
637    }
638
639    private function stateScriptDataDoubleEscaped(): void
640    {
641        $c = $this->consume();
642        if ($c === '-') {
643            $this->state = TokenizerState::ScriptDataDoubleEscapedDash;
644            $this->emitChar('-');
645            return;
646        }
647        if ($c === '<') {
648            $this->state = TokenizerState::ScriptDataDoubleEscapedLessThanSign;
649            $this->emitChar('<');
650            return;
651        }
652        if ($c === "\u{0000}") {
653            $this->error(ParseErrorCode::UnexpectedNullCharacter);
654            $this->emitChar("\u{FFFD}");
655            return;
656        }
657        if ($c === null) {
658            $this->error(ParseErrorCode::EofInScriptHtmlCommentLikeText);
659            $this->emit(new EofToken());
660            return;
661        }
662        $this->emitChar($c);
663    }
664
665    private function stateScriptDataDoubleEscapedDash(): void
666    {
667        $c = $this->consume();
668        if ($c === '-') {
669            $this->state = TokenizerState::ScriptDataDoubleEscapedDashDash;
670            $this->emitChar('-');
671            return;
672        }
673        if ($c === '<') {
674            $this->state = TokenizerState::ScriptDataDoubleEscapedLessThanSign;
675            $this->emitChar('<');
676            return;
677        }
678        if ($c === "\u{0000}") {
679            $this->error(ParseErrorCode::UnexpectedNullCharacter);
680            $this->state = TokenizerState::ScriptDataDoubleEscaped;
681            $this->emitChar("\u{FFFD}");
682            return;
683        }
684        if ($c === null) {
685            $this->error(ParseErrorCode::EofInScriptHtmlCommentLikeText);
686            $this->emit(new EofToken());
687            return;
688        }
689        $this->state = TokenizerState::ScriptDataDoubleEscaped;
690        $this->emitChar($c);
691    }
692
693    private function stateScriptDataDoubleEscapedDashDash(): void
694    {
695        $c = $this->consume();
696        if ($c === '-') {
697            $this->emitChar('-');
698            return;
699        }
700        if ($c === '<') {
701            $this->state = TokenizerState::ScriptDataDoubleEscapedLessThanSign;
702            $this->emitChar('<');
703            return;
704        }
705        if ($c === '>') {
706            $this->state = TokenizerState::ScriptData;
707            $this->emitChar('>');
708            return;
709        }
710        if ($c === "\u{0000}") {
711            $this->error(ParseErrorCode::UnexpectedNullCharacter);
712            $this->state = TokenizerState::ScriptDataDoubleEscaped;
713            $this->emitChar("\u{FFFD}");
714            return;
715        }
716        if ($c === null) {
717            $this->error(ParseErrorCode::EofInScriptHtmlCommentLikeText);
718            $this->emit(new EofToken());
719            return;
720        }
721        $this->state = TokenizerState::ScriptDataDoubleEscaped;
722        $this->emitChar($c);
723    }
724
725    private function stateScriptDataDoubleEscapedLessThanSign(): void
726    {
727        $c = $this->consume();
728        if ($c === '/') {
729            $this->tempBuffer = '';
730            $this->state = TokenizerState::ScriptDataDoubleEscapeEnd;
731            $this->emitChar('/');
732            return;
733        }
734        $this->reconsumeIn(TokenizerState::ScriptDataDoubleEscaped);
735    }
736
737    private function stateScriptDataDoubleEscapeEnd(): void
738    {
739        $c = $this->consume();
740        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ' || $c === '/' || $c === '>') {
741            $this->state = $this->tempBuffer === 'script'
742                ? TokenizerState::ScriptDataEscaped
743                : TokenizerState::ScriptDataDoubleEscaped;
744            $this->emitChar($c);
745            return;
746        }
747        if ($c !== null && self::isAsciiUpperAlpha($c)) {
748            $this->tempBuffer .= strtolower($c);
749            $this->emitChar($c);
750            return;
751        }
752        if ($c !== null && self::isAsciiLowerAlpha($c)) {
753            $this->tempBuffer .= $c;
754            $this->emitChar($c);
755            return;
756        }
757        $this->reconsumeIn(TokenizerState::ScriptDataDoubleEscaped);
758    }
759
760    // ============================================================
761    // 13.2.5.5 PLAINTEXT state
762    // ============================================================
763    private function statePlaintext(): void
764    {
765        $c = $this->consume();
766        if ($c === null) {
767            $this->emit(new EofToken());
768            return;
769        }
770        if ($c === "\u{0000}") {
771            $this->error(ParseErrorCode::UnexpectedNullCharacter);
772            $this->emitChar("\u{FFFD}");
773            return;
774        }
775        $this->emitChar($c);
776    }
777
778    // ============================================================
779    // 13.2.5.6 Tag open state
780    // ============================================================
781    private function stateTagOpen(): void
782    {
783        $c = $this->consume();
784        if ($c === '!') {
785            $this->state = TokenizerState::MarkupDeclarationOpen;
786            return;
787        }
788        if ($c === '/') {
789            $this->state = TokenizerState::EndTagOpen;
790            return;
791        }
792        if ($c !== null && self::isAsciiAlpha($c)) {
793            $this->currentToken = new StartTagToken();
794            $this->reconsumeIn(TokenizerState::TagName);
795            return;
796        }
797        if ($c === '?') {
798            $this->error(ParseErrorCode::UnexpectedQuestionMarkInsteadOfTagName);
799            $this->currentToken = new CommentToken();
800            $this->reconsumeIn(TokenizerState::BogusComment);
801            return;
802        }
803        if ($c === null) {
804            $this->error(ParseErrorCode::EofBeforeTagName);
805            $this->emitChar('<');
806            $this->emit(new EofToken());
807            return;
808        }
809        $this->error(ParseErrorCode::InvalidFirstCharacterOfTagName);
810        $this->emitChar('<');
811        $this->reconsumeIn(TokenizerState::Data);
812    }
813
814    // ============================================================
815    // 13.2.5.7 End tag open state
816    // ============================================================
817    private function stateEndTagOpen(): void
818    {
819        $c = $this->consume();
820        if ($c !== null && self::isAsciiAlpha($c)) {
821            $this->currentToken = new EndTagToken();
822            $this->reconsumeIn(TokenizerState::TagName);
823            return;
824        }
825        if ($c === '>') {
826            $this->error(ParseErrorCode::MissingEndTagName);
827            $this->state = TokenizerState::Data;
828            return;
829        }
830        if ($c === null) {
831            $this->error(ParseErrorCode::EofBeforeTagName);
832            $this->emitChar('<');
833            $this->emitChar('/');
834            $this->emit(new EofToken());
835            return;
836        }
837        $this->error(ParseErrorCode::InvalidFirstCharacterOfTagName);
838        $this->currentToken = new CommentToken();
839        $this->reconsumeIn(TokenizerState::BogusComment);
840    }
841
842    // ============================================================
843    // 13.2.5.8 Tag name state
844    // ============================================================
845    private function stateTagName(): void
846    {
847        $c = $this->consume();
848        $tag = $this->currentTokenAsTag();
849        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
850            $this->state = TokenizerState::BeforeAttributeName;
851            return;
852        }
853        if ($c === '/') {
854            $this->state = TokenizerState::SelfClosingStartTag;
855            return;
856        }
857        if ($c === '>') {
858            $this->finalizeAndEmitTag();
859            $this->state = TokenizerState::Data;
860            return;
861        }
862        if ($c !== null && self::isAsciiUpperAlpha($c)) {
863            $tag->tagName .= strtolower($c);
864            return;
865        }
866        if ($c === "\u{0000}") {
867            $this->error(ParseErrorCode::UnexpectedNullCharacter);
868            $tag->tagName .= "\u{FFFD}";
869            return;
870        }
871        if ($c === null) {
872            $this->error(ParseErrorCode::EofInTag);
873            $this->emit(new EofToken());
874            return;
875        }
876        $tag->tagName .= $c;
877    }
878
879    // ============================================================
880    // 13.2.5.9–11 RCDATA less-than and end-tag states
881    // ============================================================
882    private function stateRcdataLessThanSign(): void
883    {
884        $c = $this->consume();
885        if ($c === '/') {
886            $this->tempBuffer = '';
887            $this->state = TokenizerState::RcdataEndTagOpen;
888            return;
889        }
890        $this->emitChar('<');
891        $this->reconsumeIn(TokenizerState::Rcdata);
892    }
893
894    private function stateRcdataEndTagOpen(): void
895    {
896        $c = $this->consume();
897        if ($c !== null && self::isAsciiAlpha($c)) {
898            $this->currentToken = new EndTagToken();
899            $this->reconsumeIn(TokenizerState::RcdataEndTagName);
900            return;
901        }
902        $this->emitChar('<');
903        $this->emitChar('/');
904        $this->reconsumeIn(TokenizerState::Rcdata);
905    }
906
907    private function stateRcdataEndTagName(): void
908    {
909        $this->endTagNameAlternativeReturn(TokenizerState::Rcdata);
910    }
911
912    // ============================================================
913    // 13.2.5.12–14 RAWTEXT less-than and end-tag states
914    // ============================================================
915    private function stateRawtextLessThanSign(): void
916    {
917        $c = $this->consume();
918        if ($c === '/') {
919            $this->tempBuffer = '';
920            $this->state = TokenizerState::RawtextEndTagOpen;
921            return;
922        }
923        $this->emitChar('<');
924        $this->reconsumeIn(TokenizerState::Rawtext);
925    }
926
927    private function stateRawtextEndTagOpen(): void
928    {
929        $c = $this->consume();
930        if ($c !== null && self::isAsciiAlpha($c)) {
931            $this->currentToken = new EndTagToken();
932            $this->reconsumeIn(TokenizerState::RawtextEndTagName);
933            return;
934        }
935        $this->emitChar('<');
936        $this->emitChar('/');
937        $this->reconsumeIn(TokenizerState::Rawtext);
938    }
939
940    private function stateRawtextEndTagName(): void
941    {
942        $this->endTagNameAlternativeReturn(TokenizerState::Rawtext);
943    }
944
945    /**
946     * Shared logic for RCDATA / RAWTEXT / Script end-tag-name states. If the
947     * end tag matches the most recent start tag (the "appropriate end tag"),
948     * transition like a normal tag close; otherwise emit characters and
949     * return to the source state.
950     */
951    private function endTagNameAlternativeReturn(TokenizerState $sourceState): void
952    {
953        $c = $this->consume();
954        $tag = $this->currentTokenAsEnd();
955        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
956            if ($this->isAppropriateEndTag($tag)) {
957                $this->state = TokenizerState::BeforeAttributeName;
958                return;
959            }
960            $this->emitFakeOpeningChars($sourceState);
961            return;
962        }
963        if ($c === '/') {
964            if ($this->isAppropriateEndTag($tag)) {
965                $this->state = TokenizerState::SelfClosingStartTag;
966                return;
967            }
968            $this->emitFakeOpeningChars($sourceState);
969            return;
970        }
971        if ($c === '>') {
972            if ($this->isAppropriateEndTag($tag)) {
973                $this->finalizeAndEmitTag();
974                $this->state = TokenizerState::Data;
975                return;
976            }
977            $this->emitFakeOpeningChars($sourceState);
978            return;
979        }
980        if ($c !== null && self::isAsciiUpperAlpha($c)) {
981            $tag->tagName .= strtolower($c);
982            $this->tempBuffer .= $c;
983            return;
984        }
985        if ($c !== null && self::isAsciiLowerAlpha($c)) {
986            $tag->tagName .= $c;
987            $this->tempBuffer .= $c;
988            return;
989        }
990        $this->emitFakeOpeningChars($sourceState);
991    }
992
993    private function emitFakeOpeningChars(TokenizerState $sourceState): void
994    {
995        $this->emitChar('<');
996        $this->emitChar('/');
997        if ($this->tempBuffer !== '') {
998            $this->emitChar($this->tempBuffer);
999        }
1000        $this->reconsumeIn($sourceState);
1001    }
1002
1003    // ============================================================
1004    // 13.2.5.32 Before attribute name state
1005    // ============================================================
1006    private function stateBeforeAttributeName(): void
1007    {
1008        $c = $this->consume();
1009        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1010            return;
1011        }
1012        if ($c === '/' || $c === '>' || $c === null) {
1013            $this->reconsumeIn(TokenizerState::AfterAttributeName);
1014            return;
1015        }
1016        if ($c === '=') {
1017            $this->error(ParseErrorCode::UnexpectedEqualsSignBeforeAttributeName);
1018            $tag = $this->currentTokenAsTag();
1019            $this->startNewAttribute($tag);
1020            $this->appendToCurrentAttributeName($tag, '=');
1021            $this->state = TokenizerState::AttributeName;
1022            return;
1023        }
1024        $tag = $this->currentTokenAsTag();
1025        $this->startNewAttribute($tag);
1026        $this->reconsumeIn(TokenizerState::AttributeName);
1027    }
1028
1029    // ============================================================
1030    // 13.2.5.33 Attribute name state
1031    // ============================================================
1032    private function stateAttributeName(): void
1033    {
1034        $c = $this->consume();
1035        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' '
1036            || $c === '/' || $c === '>' || $c === null
1037        ) {
1038            $this->reconsumeIn(TokenizerState::AfterAttributeName);
1039            return;
1040        }
1041        if ($c === '=') {
1042            $this->state = TokenizerState::BeforeAttributeValue;
1043            return;
1044        }
1045        $tag = $this->currentTokenAsTag();
1046        if (self::isAsciiUpperAlpha($c)) {
1047            $this->appendToCurrentAttributeName($tag, strtolower($c));
1048            return;
1049        }
1050        if ($c === "\u{0000}") {
1051            $this->error(ParseErrorCode::UnexpectedNullCharacter);
1052            $this->appendToCurrentAttributeName($tag, "\u{FFFD}");
1053            return;
1054        }
1055        if ($c === '"' || $c === "'" || $c === '<') {
1056            $this->error(ParseErrorCode::UnexpectedCharacterInAttributeName);
1057        }
1058        $this->appendToCurrentAttributeName($tag, $c);
1059    }
1060
1061    // ============================================================
1062    // 13.2.5.34 After attribute name state
1063    // ============================================================
1064    private function stateAfterAttributeName(): void
1065    {
1066        $c = $this->consume();
1067        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1068            return;
1069        }
1070        if ($c === '/') {
1071            $this->state = TokenizerState::SelfClosingStartTag;
1072            return;
1073        }
1074        if ($c === '=') {
1075            $this->state = TokenizerState::BeforeAttributeValue;
1076            return;
1077        }
1078        if ($c === '>') {
1079            $this->finalizeAndEmitTag();
1080            $this->state = TokenizerState::Data;
1081            return;
1082        }
1083        if ($c === null) {
1084            $this->error(ParseErrorCode::EofInTag);
1085            $this->emit(new EofToken());
1086            return;
1087        }
1088        $tag = $this->currentTokenAsTag();
1089        $this->startNewAttribute($tag);
1090        $this->reconsumeIn(TokenizerState::AttributeName);
1091    }
1092
1093    // ============================================================
1094    // 13.2.5.35 Before attribute value state
1095    // ============================================================
1096    private function stateBeforeAttributeValue(): void
1097    {
1098        $c = $this->consume();
1099        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1100            return;
1101        }
1102        if ($c === '"') {
1103            $this->state = TokenizerState::AttributeValueDoubleQuoted;
1104            return;
1105        }
1106        if ($c === "'") {
1107            $this->state = TokenizerState::AttributeValueSingleQuoted;
1108            return;
1109        }
1110        if ($c === '>') {
1111            $this->error(ParseErrorCode::MissingAttributeValue);
1112            $this->finalizeAndEmitTag();
1113            $this->state = TokenizerState::Data;
1114            return;
1115        }
1116        $this->reconsumeIn(TokenizerState::AttributeValueUnquoted);
1117    }
1118
1119    // ============================================================
1120    // 13.2.5.36 Attribute value (double-quoted) state
1121    // ============================================================
1122    private function stateAttributeValueDoubleQuoted(): void
1123    {
1124        $c = $this->consume();
1125        if ($c === '"') {
1126            $this->state = TokenizerState::AfterAttributeValueQuoted;
1127            return;
1128        }
1129        if ($c === '&') {
1130            $this->returnState = TokenizerState::AttributeValueDoubleQuoted;
1131            $this->state = TokenizerState::CharacterReference;
1132            return;
1133        }
1134        if ($c === "\u{0000}") {
1135            $this->error(ParseErrorCode::UnexpectedNullCharacter);
1136            $this->appendToCurrentAttributeValue($this->currentTokenAsTag(), "\u{FFFD}");
1137            return;
1138        }
1139        if ($c === null) {
1140            $this->error(ParseErrorCode::EofInTag);
1141            $this->emit(new EofToken());
1142            return;
1143        }
1144        $this->appendToCurrentAttributeValue($this->currentTokenAsTag(), $c);
1145    }
1146
1147    // ============================================================
1148    // 13.2.5.37 Attribute value (single-quoted) state
1149    // ============================================================
1150    private function stateAttributeValueSingleQuoted(): void
1151    {
1152        $c = $this->consume();
1153        if ($c === "'") {
1154            $this->state = TokenizerState::AfterAttributeValueQuoted;
1155            return;
1156        }
1157        if ($c === '&') {
1158            $this->returnState = TokenizerState::AttributeValueSingleQuoted;
1159            $this->state = TokenizerState::CharacterReference;
1160            return;
1161        }
1162        if ($c === "\u{0000}") {
1163            $this->error(ParseErrorCode::UnexpectedNullCharacter);
1164            $this->appendToCurrentAttributeValue($this->currentTokenAsTag(), "\u{FFFD}");
1165            return;
1166        }
1167        if ($c === null) {
1168            $this->error(ParseErrorCode::EofInTag);
1169            $this->emit(new EofToken());
1170            return;
1171        }
1172        $this->appendToCurrentAttributeValue($this->currentTokenAsTag(), $c);
1173    }
1174
1175    // ============================================================
1176    // 13.2.5.38 Attribute value (unquoted) state
1177    // ============================================================
1178    private function stateAttributeValueUnquoted(): void
1179    {
1180        $c = $this->consume();
1181        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1182            $this->state = TokenizerState::BeforeAttributeName;
1183            return;
1184        }
1185        if ($c === '&') {
1186            $this->returnState = TokenizerState::AttributeValueUnquoted;
1187            $this->state = TokenizerState::CharacterReference;
1188            return;
1189        }
1190        if ($c === '>') {
1191            $this->finalizeAndEmitTag();
1192            $this->state = TokenizerState::Data;
1193            return;
1194        }
1195        if ($c === "\u{0000}") {
1196            $this->error(ParseErrorCode::UnexpectedNullCharacter);
1197            $this->appendToCurrentAttributeValue($this->currentTokenAsTag(), "\u{FFFD}");
1198            return;
1199        }
1200        if ($c === null) {
1201            $this->error(ParseErrorCode::EofInTag);
1202            $this->emit(new EofToken());
1203            return;
1204        }
1205        if ($c === '"' || $c === "'" || $c === '<' || $c === '=' || $c === '`') {
1206            $this->error(ParseErrorCode::UnexpectedCharacterInUnquotedAttributeValue);
1207        }
1208        $this->appendToCurrentAttributeValue($this->currentTokenAsTag(), $c);
1209    }
1210
1211    // ============================================================
1212    // 13.2.5.39 After attribute value (quoted) state
1213    // ============================================================
1214    private function stateAfterAttributeValueQuoted(): void
1215    {
1216        $c = $this->consume();
1217        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1218            $this->state = TokenizerState::BeforeAttributeName;
1219            return;
1220        }
1221        if ($c === '/') {
1222            $this->state = TokenizerState::SelfClosingStartTag;
1223            return;
1224        }
1225        if ($c === '>') {
1226            $this->finalizeAndEmitTag();
1227            $this->state = TokenizerState::Data;
1228            return;
1229        }
1230        if ($c === null) {
1231            $this->error(ParseErrorCode::EofInTag);
1232            $this->emit(new EofToken());
1233            return;
1234        }
1235        $this->error(ParseErrorCode::MissingWhitespaceBetweenAttributes);
1236        $this->reconsumeIn(TokenizerState::BeforeAttributeName);
1237    }
1238
1239    // ============================================================
1240    // 13.2.5.40 Self-closing start tag state
1241    // ============================================================
1242    private function stateSelfClosingStartTag(): void
1243    {
1244        $c = $this->consume();
1245        if ($c === '>') {
1246            $tag = $this->currentTokenAsTag();
1247            $tag->selfClosing = true;
1248            if ($tag instanceof EndTagToken) {
1249                $this->error(ParseErrorCode::EndTagWithTrailingSolidus);
1250            }
1251            $this->finalizeAndEmitTag();
1252            $this->state = TokenizerState::Data;
1253            return;
1254        }
1255        if ($c === null) {
1256            $this->error(ParseErrorCode::EofInTag);
1257            $this->emit(new EofToken());
1258            return;
1259        }
1260        $this->error(ParseErrorCode::UnexpectedSolidusInTag);
1261        $this->reconsumeIn(TokenizerState::BeforeAttributeName);
1262    }
1263
1264    // ============================================================
1265    // 13.2.5.41 Bogus comment state
1266    // ============================================================
1267    private function stateBogusComment(): void
1268    {
1269        $c = $this->consume();
1270        $comment = $this->currentTokenAsComment();
1271        if ($c === '>') {
1272            $this->emit($comment);
1273            $this->state = TokenizerState::Data;
1274            return;
1275        }
1276        if ($c === null) {
1277            $this->emit($comment);
1278            $this->emit(new EofToken());
1279            return;
1280        }
1281        if ($c === "\u{0000}") {
1282            $this->error(ParseErrorCode::UnexpectedNullCharacter);
1283            $comment->append("\u{FFFD}");
1284            return;
1285        }
1286        $comment->append($c);
1287    }
1288
1289    // ============================================================
1290    // 13.2.5.42 Markup declaration open state
1291    // ============================================================
1292    private function stateMarkupDeclarationOpen(): void
1293    {
1294        // Peek ahead at the next characters. Two cases at Phase 1B.2:
1295        //   "--"   â†’ comment start
1296        //   "doctype" (case-insensitive) â†’ doctype
1297        // CDATA section ([CDATA[) is deferred to 1B.2-bis.
1298        $rest = $this->peekRemaining(7);
1299        if (str_starts_with($rest, '--')) {
1300            $this->advance(2);
1301            $this->currentToken = new CommentToken();
1302            $this->state = TokenizerState::CommentStart;
1303            return;
1304        }
1305        if (strcasecmp(substr($rest, 0, 7), 'doctype') === 0) {
1306            $this->advance(7);
1307            $this->state = TokenizerState::Doctype;
1308            return;
1309        }
1310        if (str_starts_with($rest, '[CDATA[')) {
1311            $this->advance(7);
1312            if ($this->inForeignContent) {
1313                $this->state = TokenizerState::CdataSection;
1314                return;
1315            }
1316            $this->error(ParseErrorCode::CdataInHtmlContent);
1317            $this->currentToken = new CommentToken('[CDATA[');
1318            $this->state = TokenizerState::BogusComment;
1319            return;
1320        }
1321        $this->error(ParseErrorCode::IncorrectlyOpenedComment);
1322        $this->currentToken = new CommentToken();
1323        $this->state = TokenizerState::BogusComment;
1324    }
1325
1326    // ============================================================
1327    // 13.2.5.43–48 Comment states
1328    // ============================================================
1329    private function stateCommentStart(): void
1330    {
1331        $c = $this->consume();
1332        if ($c === '-') {
1333            $this->state = TokenizerState::CommentStartDash;
1334            return;
1335        }
1336        if ($c === '>') {
1337            $this->error(ParseErrorCode::AbruptClosingOfEmptyComment);
1338            $this->emit($this->currentTokenAsComment());
1339            $this->state = TokenizerState::Data;
1340            return;
1341        }
1342        $this->reconsumeIn(TokenizerState::Comment);
1343    }
1344
1345    private function stateCommentStartDash(): void
1346    {
1347        $c = $this->consume();
1348        if ($c === '-') {
1349            $this->state = TokenizerState::CommentEnd;
1350            return;
1351        }
1352        if ($c === '>') {
1353            $this->error(ParseErrorCode::AbruptClosingOfEmptyComment);
1354            $this->emit($this->currentTokenAsComment());
1355            $this->state = TokenizerState::Data;
1356            return;
1357        }
1358        if ($c === null) {
1359            $this->error(ParseErrorCode::EofInComment);
1360            $this->emit($this->currentTokenAsComment());
1361            $this->emit(new EofToken());
1362            return;
1363        }
1364        $this->currentTokenAsComment()->append('-');
1365        $this->reconsumeIn(TokenizerState::Comment);
1366    }
1367
1368    private function stateComment(): void
1369    {
1370        $c = $this->consume();
1371        $comment = $this->currentTokenAsComment();
1372        if ($c === '<') {
1373            $comment->append('<');
1374            $this->state = TokenizerState::CommentLessThanSign;
1375            return;
1376        }
1377        if ($c === '-') {
1378            $this->state = TokenizerState::CommentEndDash;
1379            return;
1380        }
1381        if ($c === "\u{0000}") {
1382            $this->error(ParseErrorCode::UnexpectedNullCharacter);
1383            $comment->append("\u{FFFD}");
1384            return;
1385        }
1386        if ($c === null) {
1387            $this->error(ParseErrorCode::EofInComment);
1388            $this->emit($comment);
1389            $this->emit(new EofToken());
1390            return;
1391        }
1392        $comment->append($c);
1393    }
1394
1395    // ============================================================
    // 13.2.5.46–49 Comment less-than-sign / bang recovery states
1397    // ============================================================
1398    private function stateCommentLessThanSign(): void
1399    {
1400        $c = $this->consume();
1401        $comment = $this->currentTokenAsComment();
1402        if ($c === '!') {
1403            $comment->append('!');
1404            $this->state = TokenizerState::CommentLessThanSignBang;
1405            return;
1406        }
1407        if ($c === '<') {
1408            $comment->append('<');
1409            return;
1410        }
1411        $this->reconsumeIn(TokenizerState::Comment);
1412    }
1413
1414    private function stateCommentLessThanSignBang(): void
1415    {
1416        $c = $this->consume();
1417        if ($c === '-') {
1418            $this->state = TokenizerState::CommentLessThanSignBangDash;
1419            return;
1420        }
1421        $this->reconsumeIn(TokenizerState::Comment);
1422    }
1423
1424    private function stateCommentLessThanSignBangDash(): void
1425    {
1426        $c = $this->consume();
1427        if ($c === '-') {
1428            $this->state = TokenizerState::CommentLessThanSignBangDashDash;
1429            return;
1430        }
1431        $this->reconsumeIn(TokenizerState::CommentEndDash);
1432    }
1433
1434    private function stateCommentLessThanSignBangDashDash(): void
1435    {
1436        $c = $this->consume();
1437        if ($c === '>' || $c === null) {
1438            $this->reconsumeIn(TokenizerState::CommentEnd);
1439            return;
1440        }
1441        $this->error(ParseErrorCode::NestedComment);
1442        $this->reconsumeIn(TokenizerState::CommentEnd);
1443    }
1444
1445    // ============================================================
1446    // 13.2.5.69–71 CDATA section states (only valid in foreign content)
1447    // ============================================================
1448    private function stateCdataSection(): void
1449    {
1450        $c = $this->consume();
1451        if ($c === ']') {
1452            $this->state = TokenizerState::CdataSectionBracket;
1453            return;
1454        }
1455        if ($c === null) {
1456            $this->error(ParseErrorCode::EofInCdata);
1457            $this->emit(new EofToken());
1458            return;
1459        }
1460        // NULL inside CDATA is emitted verbatim per spec â€” no replacement.
1461        $this->emitChar($c);
1462    }
1463
1464    private function stateCdataSectionBracket(): void
1465    {
1466        $c = $this->consume();
1467        if ($c === ']') {
1468            $this->state = TokenizerState::CdataSectionEnd;
1469            return;
1470        }
1471        $this->emitChar(']');
1472        $this->reconsumeIn(TokenizerState::CdataSection);
1473    }
1474
1475    private function stateCdataSectionEnd(): void
1476    {
1477        $c = $this->consume();
1478        if ($c === ']') {
1479            $this->emitChar(']');
1480            return;
1481        }
1482        if ($c === '>') {
1483            $this->state = TokenizerState::Data;
1484            return;
1485        }
1486        $this->emitChar(']');
1487        $this->emitChar(']');
1488        $this->reconsumeIn(TokenizerState::CdataSection);
1489    }
1490
1491    private function stateCommentEndDash(): void
1492    {
1493        $c = $this->consume();
1494        if ($c === '-') {
1495            $this->state = TokenizerState::CommentEnd;
1496            return;
1497        }
1498        if ($c === null) {
1499            $this->error(ParseErrorCode::EofInComment);
1500            $this->emit($this->currentTokenAsComment());
1501            $this->emit(new EofToken());
1502            return;
1503        }
1504        $this->currentTokenAsComment()->append('-');
1505        $this->reconsumeIn(TokenizerState::Comment);
1506    }
1507
1508    private function stateCommentEnd(): void
1509    {
1510        $c = $this->consume();
1511        $comment = $this->currentTokenAsComment();
1512        if ($c === '>') {
1513            $this->emit($comment);
1514            $this->state = TokenizerState::Data;
1515            return;
1516        }
1517        if ($c === '!') {
1518            $this->state = TokenizerState::CommentEndBang;
1519            return;
1520        }
1521        if ($c === '-') {
1522            $comment->append('-');
1523            return;
1524        }
1525        if ($c === null) {
1526            $this->error(ParseErrorCode::EofInComment);
1527            $this->emit($comment);
1528            $this->emit(new EofToken());
1529            return;
1530        }
1531        $comment->append('--');
1532        $this->reconsumeIn(TokenizerState::Comment);
1533    }
1534
1535    private function stateCommentEndBang(): void
1536    {
1537        $c = $this->consume();
1538        $comment = $this->currentTokenAsComment();
1539        if ($c === '-') {
1540            $comment->append('--!');
1541            $this->state = TokenizerState::CommentEndDash;
1542            return;
1543        }
1544        if ($c === '>') {
1545            $this->error(ParseErrorCode::IncorrectlyClosedComment);
1546            $this->emit($comment);
1547            $this->state = TokenizerState::Data;
1548            return;
1549        }
1550        if ($c === null) {
1551            $this->error(ParseErrorCode::EofInComment);
1552            $this->emit($comment);
1553            $this->emit(new EofToken());
1554            return;
1555        }
1556        $comment->append('--!');
1557        $this->reconsumeIn(TokenizerState::Comment);
1558    }
1559
1560    // ============================================================
1561    // 13.2.5.53 DOCTYPE state (and friends)
1562    // ============================================================
1563    private function stateDoctype(): void
1564    {
1565        $c = $this->consume();
1566        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1567            $this->state = TokenizerState::BeforeDoctypeName;
1568            return;
1569        }
1570        if ($c === '>') {
1571            $this->reconsumeIn(TokenizerState::BeforeDoctypeName);
1572            return;
1573        }
1574        if ($c === null) {
1575            $this->error(ParseErrorCode::EofInDoctype);
1576            $token = new DoctypeToken();
1577            $token->forceQuirks = true;
1578            $this->emit($token);
1579            $this->emit(new EofToken());
1580            return;
1581        }
1582        $this->error(ParseErrorCode::MissingWhitespaceBeforeDoctypeName);
1583        $this->reconsumeIn(TokenizerState::BeforeDoctypeName);
1584    }
1585
1586    private function stateBeforeDoctypeName(): void
1587    {
1588        $c = $this->consume();
1589        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1590            return;
1591        }
1592        $token = new DoctypeToken();
1593        $this->currentToken = $token;
1594        if ($c !== null && self::isAsciiUpperAlpha($c)) {
1595            $token->name = strtolower($c);
1596            $this->state = TokenizerState::DoctypeName;
1597            return;
1598        }
1599        if ($c === "\u{0000}") {
1600            $this->error(ParseErrorCode::UnexpectedNullCharacter);
1601            $token->name = "\u{FFFD}";
1602            $this->state = TokenizerState::DoctypeName;
1603            return;
1604        }
1605        if ($c === '>') {
1606            $this->error(ParseErrorCode::MissingDoctypeName);
1607            $token->forceQuirks = true;
1608            $this->emit($token);
1609            $this->state = TokenizerState::Data;
1610            return;
1611        }
1612        if ($c === null) {
1613            $this->error(ParseErrorCode::EofInDoctype);
1614            $token->forceQuirks = true;
1615            $this->emit($token);
1616            $this->emit(new EofToken());
1617            return;
1618        }
1619        $token->name = $c;
1620        $this->state = TokenizerState::DoctypeName;
1621    }
1622
1623    private function stateDoctypeName(): void
1624    {
1625        $c = $this->consume();
1626        $token = $this->currentTokenAsDoctype();
1627        assert($token->name !== null);
1628        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1629            $this->state = TokenizerState::AfterDoctypeName;
1630            return;
1631        }
1632        if ($c === '>') {
1633            $this->emit($token);
1634            $this->state = TokenizerState::Data;
1635            return;
1636        }
1637        if ($c !== null && self::isAsciiUpperAlpha($c)) {
1638            $token->name .= strtolower($c);
1639            return;
1640        }
1641        if ($c === "\u{0000}") {
1642            $this->error(ParseErrorCode::UnexpectedNullCharacter);
1643            $token->name .= "\u{FFFD}";
1644            return;
1645        }
1646        if ($c === null) {
1647            $this->error(ParseErrorCode::EofInDoctype);
1648            $token->forceQuirks = true;
1649            $this->emit($token);
1650            $this->emit(new EofToken());
1651            return;
1652        }
1653        $token->name .= $c;
1654    }
1655
1656    private function stateAfterDoctypeName(): void
1657    {
1658        $c = $this->consume();
1659        $token = $this->currentTokenAsDoctype();
1660        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1661            return;
1662        }
1663        if ($c === '>') {
1664            $this->emit($token);
1665            $this->state = TokenizerState::Data;
1666            return;
1667        }
1668        if ($c === null) {
1669            $this->error(ParseErrorCode::EofInDoctype);
1670            $token->forceQuirks = true;
1671            $this->emit($token);
1672            $this->emit(new EofToken());
1673            return;
1674        }
1675        // Look ahead for PUBLIC or SYSTEM (case-insensitive, including the
1676        // current char). pos points one past the current char; -1 to include it.
1677        $effectivePos = $this->reconsume ? $this->pos - 1 : $this->pos - 1;
1678        $window = implode('', array_slice($this->chars, $effectivePos, 6));
1679        if (strcasecmp($window, 'PUBLIC') === 0) {
1680            $this->pos = $effectivePos + 6;
1681            $this->reconsume = false;
1682            $this->state = TokenizerState::AfterDoctypePublicKeyword;
1683            return;
1684        }
1685        if (strcasecmp($window, 'SYSTEM') === 0) {
1686            $this->pos = $effectivePos + 6;
1687            $this->reconsume = false;
1688            $this->state = TokenizerState::AfterDoctypeSystemKeyword;
1689            return;
1690        }
1691        $this->error(ParseErrorCode::InvalidCharacterSequenceAfterDoctypeName);
1692        $token->forceQuirks = true;
1693        $this->reconsumeIn(TokenizerState::BogusDoctype);
1694    }
1695
1696    // ============================================================
1697    // 13.2.5.57–67 DOCTYPE PUBLIC / SYSTEM identifier states
1698    // ============================================================
1699    private function stateAfterDoctypePublicKeyword(): void
1700    {
1701        $c = $this->consume();
1702        $token = $this->currentTokenAsDoctype();
1703        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1704            $this->state = TokenizerState::BeforeDoctypePublicIdentifier;
1705            return;
1706        }
1707        if ($c === '"') {
1708            $this->error(ParseErrorCode::MissingWhitespaceAfterDoctypePublicKeyword);
1709            $token->publicId = '';
1710            $this->state = TokenizerState::DoctypePublicIdentifierDoubleQuoted;
1711            return;
1712        }
1713        if ($c === "'") {
1714            $this->error(ParseErrorCode::MissingWhitespaceAfterDoctypePublicKeyword);
1715            $token->publicId = '';
1716            $this->state = TokenizerState::DoctypePublicIdentifierSingleQuoted;
1717            return;
1718        }
1719        if ($c === '>') {
1720            $this->error(ParseErrorCode::MissingDoctypePublicIdentifier);
1721            $token->forceQuirks = true;
1722            $this->emit($token);
1723            $this->state = TokenizerState::Data;
1724            return;
1725        }
1726        if ($c === null) {
1727            $this->error(ParseErrorCode::EofInDoctype);
1728            $token->forceQuirks = true;
1729            $this->emit($token);
1730            $this->emit(new EofToken());
1731            return;
1732        }
1733        $this->error(ParseErrorCode::MissingQuoteBeforeDoctypePublicIdentifier);
1734        $token->forceQuirks = true;
1735        $this->reconsumeIn(TokenizerState::BogusDoctype);
1736    }
1737
1738    private function stateBeforeDoctypePublicIdentifier(): void
1739    {
1740        $c = $this->consume();
1741        $token = $this->currentTokenAsDoctype();
1742        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1743            return;
1744        }
1745        if ($c === '"') {
1746            $token->publicId = '';
1747            $this->state = TokenizerState::DoctypePublicIdentifierDoubleQuoted;
1748            return;
1749        }
1750        if ($c === "'") {
1751            $token->publicId = '';
1752            $this->state = TokenizerState::DoctypePublicIdentifierSingleQuoted;
1753            return;
1754        }
1755        if ($c === '>') {
1756            $this->error(ParseErrorCode::MissingDoctypePublicIdentifier);
1757            $token->forceQuirks = true;
1758            $this->emit($token);
1759            $this->state = TokenizerState::Data;
1760            return;
1761        }
1762        if ($c === null) {
1763            $this->error(ParseErrorCode::EofInDoctype);
1764            $token->forceQuirks = true;
1765            $this->emit($token);
1766            $this->emit(new EofToken());
1767            return;
1768        }
1769        $this->error(ParseErrorCode::MissingQuoteBeforeDoctypePublicIdentifier);
1770        $token->forceQuirks = true;
1771        $this->reconsumeIn(TokenizerState::BogusDoctype);
1772    }
1773
1774    private function stateDoctypePublicIdentifierDoubleQuoted(): void
1775    {
1776        $this->doctypeQuotedIdentifier(true, '"');
1777    }
1778
1779    private function stateDoctypePublicIdentifierSingleQuoted(): void
1780    {
1781        $this->doctypeQuotedIdentifier(true, "'");
1782    }
1783
1784    private function stateAfterDoctypePublicIdentifier(): void
1785    {
1786        $c = $this->consume();
1787        $token = $this->currentTokenAsDoctype();
1788        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1789            $this->state = TokenizerState::BetweenDoctypePublicAndSystemIdentifiers;
1790            return;
1791        }
1792        if ($c === '>') {
1793            $this->emit($token);
1794            $this->state = TokenizerState::Data;
1795            return;
1796        }
1797        if ($c === '"') {
1798            $this->error(ParseErrorCode::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
1799            $token->systemId = '';
1800            $this->state = TokenizerState::DoctypeSystemIdentifierDoubleQuoted;
1801            return;
1802        }
1803        if ($c === "'") {
1804            $this->error(ParseErrorCode::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
1805            $token->systemId = '';
1806            $this->state = TokenizerState::DoctypeSystemIdentifierSingleQuoted;
1807            return;
1808        }
1809        if ($c === null) {
1810            $this->error(ParseErrorCode::EofInDoctype);
1811            $token->forceQuirks = true;
1812            $this->emit($token);
1813            $this->emit(new EofToken());
1814            return;
1815        }
1816        $this->error(ParseErrorCode::MissingQuoteBeforeDoctypeSystemIdentifier);
1817        $token->forceQuirks = true;
1818        $this->reconsumeIn(TokenizerState::BogusDoctype);
1819    }
1820
1821    private function stateBetweenDoctypePublicAndSystemIdentifiers(): void
1822    {
1823        $c = $this->consume();
1824        $token = $this->currentTokenAsDoctype();
1825        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1826            return;
1827        }
1828        if ($c === '>') {
1829            $this->emit($token);
1830            $this->state = TokenizerState::Data;
1831            return;
1832        }
1833        if ($c === '"') {
1834            $token->systemId = '';
1835            $this->state = TokenizerState::DoctypeSystemIdentifierDoubleQuoted;
1836            return;
1837        }
1838        if ($c === "'") {
1839            $token->systemId = '';
1840            $this->state = TokenizerState::DoctypeSystemIdentifierSingleQuoted;
1841            return;
1842        }
1843        if ($c === null) {
1844            $this->error(ParseErrorCode::EofInDoctype);
1845            $token->forceQuirks = true;
1846            $this->emit($token);
1847            $this->emit(new EofToken());
1848            return;
1849        }
1850        $this->error(ParseErrorCode::MissingQuoteBeforeDoctypeSystemIdentifier);
1851        $token->forceQuirks = true;
1852        $this->reconsumeIn(TokenizerState::BogusDoctype);
1853    }
1854
1855    private function stateAfterDoctypeSystemKeyword(): void
1856    {
1857        $c = $this->consume();
1858        $token = $this->currentTokenAsDoctype();
1859        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1860            $this->state = TokenizerState::BeforeDoctypeSystemIdentifier;
1861            return;
1862        }
1863        if ($c === '"') {
1864            $this->error(ParseErrorCode::MissingWhitespaceAfterDoctypeSystemKeyword);
1865            $token->systemId = '';
1866            $this->state = TokenizerState::DoctypeSystemIdentifierDoubleQuoted;
1867            return;
1868        }
1869        if ($c === "'") {
1870            $this->error(ParseErrorCode::MissingWhitespaceAfterDoctypeSystemKeyword);
1871            $token->systemId = '';
1872            $this->state = TokenizerState::DoctypeSystemIdentifierSingleQuoted;
1873            return;
1874        }
1875        if ($c === '>') {
1876            $this->error(ParseErrorCode::MissingDoctypeSystemIdentifier);
1877            $token->forceQuirks = true;
1878            $this->emit($token);
1879            $this->state = TokenizerState::Data;
1880            return;
1881        }
1882        if ($c === null) {
1883            $this->error(ParseErrorCode::EofInDoctype);
1884            $token->forceQuirks = true;
1885            $this->emit($token);
1886            $this->emit(new EofToken());
1887            return;
1888        }
1889        $this->error(ParseErrorCode::MissingQuoteBeforeDoctypeSystemIdentifier);
1890        $token->forceQuirks = true;
1891        $this->reconsumeIn(TokenizerState::BogusDoctype);
1892    }
1893
1894    private function stateBeforeDoctypeSystemIdentifier(): void
1895    {
1896        $c = $this->consume();
1897        $token = $this->currentTokenAsDoctype();
1898        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1899            return;
1900        }
1901        if ($c === '"') {
1902            $token->systemId = '';
1903            $this->state = TokenizerState::DoctypeSystemIdentifierDoubleQuoted;
1904            return;
1905        }
1906        if ($c === "'") {
1907            $token->systemId = '';
1908            $this->state = TokenizerState::DoctypeSystemIdentifierSingleQuoted;
1909            return;
1910        }
1911        if ($c === '>') {
1912            $this->error(ParseErrorCode::MissingDoctypeSystemIdentifier);
1913            $token->forceQuirks = true;
1914            $this->emit($token);
1915            $this->state = TokenizerState::Data;
1916            return;
1917        }
1918        if ($c === null) {
1919            $this->error(ParseErrorCode::EofInDoctype);
1920            $token->forceQuirks = true;
1921            $this->emit($token);
1922            $this->emit(new EofToken());
1923            return;
1924        }
1925        $this->error(ParseErrorCode::MissingQuoteBeforeDoctypeSystemIdentifier);
1926        $token->forceQuirks = true;
1927        $this->reconsumeIn(TokenizerState::BogusDoctype);
1928    }
1929
1930    private function stateDoctypeSystemIdentifierDoubleQuoted(): void
1931    {
1932        $this->doctypeQuotedIdentifier(false, '"');
1933    }
1934
1935    private function stateDoctypeSystemIdentifierSingleQuoted(): void
1936    {
1937        $this->doctypeQuotedIdentifier(false, "'");
1938    }
1939
1940    private function stateAfterDoctypeSystemIdentifier(): void
1941    {
1942        $c = $this->consume();
1943        $token = $this->currentTokenAsDoctype();
1944        if ($c === "\t" || $c === "\n" || $c === "\f" || $c === ' ') {
1945            return;
1946        }
1947        if ($c === '>') {
1948            $this->emit($token);
1949            $this->state = TokenizerState::Data;
1950            return;
1951        }
1952        if ($c === null) {
1953            $this->error(ParseErrorCode::EofInDoctype);
1954            $token->forceQuirks = true;
1955            $this->emit($token);
1956            $this->emit(new EofToken());
1957            return;
1958        }
1959        // Per spec: do NOT set force-quirks here. The DOCTYPE is otherwise
1960        // syntactically valid; trailing garbage just goes to a bogus state.
1961        $this->error(ParseErrorCode::UnexpectedCharacterAfterDoctypeSystemIdentifier);
1962        $this->reconsumeIn(TokenizerState::BogusDoctype);
1963    }
1964
1965    /**
1966     * Shared logic for the four quoted-identifier states. $isPublic selects
1967     * which identifier field to append to; $terminator is " or '.
1968     */
1969    private function doctypeQuotedIdentifier(bool $isPublic, string $terminator): void
1970    {
1971        $c = $this->consume();
1972        $token = $this->currentTokenAsDoctype();
1973        $field = $isPublic ? 'publicId' : 'systemId';
1974        if ($c === $terminator) {
1975            $this->state = $isPublic
1976                ? TokenizerState::AfterDoctypePublicIdentifier
1977                : TokenizerState::AfterDoctypeSystemIdentifier;
1978            return;
1979        }
1980        if ($c === "\u{0000}") {
1981            $this->error(ParseErrorCode::UnexpectedNullCharacter);
1982            assert($token->{$field} !== null);
1983            $token->{$field} .= "\u{FFFD}";
1984            return;
1985        }
1986        if ($c === '>') {
1987            $this->error($isPublic
1988                ? ParseErrorCode::AbruptDoctypePublicIdentifier
1989                : ParseErrorCode::AbruptDoctypeSystemIdentifier);
1990            $token->forceQuirks = true;
1991            $this->emit($token);
1992            $this->state = TokenizerState::Data;
1993            return;
1994        }
1995        if ($c === null) {
1996            $this->error(ParseErrorCode::EofInDoctype);
1997            $token->forceQuirks = true;
1998            $this->emit($token);
1999            $this->emit(new EofToken());
2000            return;
2001        }
2002        assert($token->{$field} !== null);
2003        $token->{$field} .= $c;
2004    }
2005
2006    private function stateBogusDoctype(): void
2007    {
2008        $c = $this->consume();
2009        if ($c === '>') {
2010            $this->emit($this->currentTokenAsDoctype());
2011            $this->state = TokenizerState::Data;
2012            return;
2013        }
2014        if ($c === null) {
2015            $this->emit($this->currentTokenAsDoctype());
2016            $this->emit(new EofToken());
2017            return;
2018        }
2019        if ($c === "\u{0000}") {
2020            $this->error(ParseErrorCode::UnexpectedNullCharacter);
2021        }
2022        // Otherwise, ignore (no append per spec).
2023    }
2024
2025    // ============================================================
2026    // 13.2.5.72–80 Character reference states
2027    // ============================================================
2028    private function stateCharacterReference(): void
2029    {
2030        $this->tempBuffer = '&';
2031        $c = $this->consume();
2032        if ($c === null) {
2033            $this->flushTempBufferToCharOrAttribute();
2034            $this->reconsumeIn($this->returnState);
2035            return;
2036        }
2037        if (self::isAsciiAlphanumeric($c)) {
2038            $this->reconsumeIn(TokenizerState::NamedCharacterReference);
2039            return;
2040        }
2041        if ($c === '#') {
2042            $this->tempBuffer .= '#';
2043            $this->state = TokenizerState::NumericCharacterReference;
2044            return;
2045        }
2046        $this->flushTempBufferToCharOrAttribute();
2047        $this->reconsumeIn($this->returnState);
2048    }
2049
2050    private function stateNamedCharacterReference(): void
2051    {
2052        // Greedy longest-match against the in-memory table. WHATWG defines
2053        // matching against the spec's full ~2200-entry trie; our table is the
2054        // high-frequency subset (see NamedCharacterReferences).
2055        $start = $this->reconsume ? $this->pos - 1 : $this->pos;
2056        $bestMatch = null;
2057        $bestLen = 0;
2058        for ($len = 1; $len <= 32 && $start + $len <= $this->length; $len++) {
2059            $candidate = implode('', array_slice($this->chars, $start, $len));
2060            if (isset(NamedCharacterReferences::TABLE[$candidate])) {
2061                $bestMatch = $candidate;
2062                $bestLen = $len;
2063            }
2064        }
2065
2066        if ($bestMatch !== null) {
2067            $hasSemicolon = str_ends_with($bestMatch, ';');
2068            $nextChar = $start + $bestLen < $this->length ? $this->chars[$start + $bestLen] : null;
2069            $inAttribute = $this->returnState === TokenizerState::AttributeValueDoubleQuoted
2070                || $this->returnState === TokenizerState::AttributeValueSingleQuoted
2071                || $this->returnState === TokenizerState::AttributeValueUnquoted;
2072
2073            // Special case for attribute values + legacy entries: if the next
2074            // char is "=" or alphanumeric, don't decode (preserves
2075            // backward-compat with old URLs like ?foo=bar&copy=true).
2076            if (!$hasSemicolon && $inAttribute && $nextChar !== null
2077                && ($nextChar === '=' || self::isAsciiAlphanumeric($nextChar))
2078            ) {
2079                $this->tempBuffer = '&' . $bestMatch;
2080                $this->advance($bestLen);
2081                $this->reconsume = false;
2082                $this->flushTempBufferToCharOrAttribute();
2083                $this->state = $this->returnState;
2084                return;
2085            }
2086
2087            if (!$hasSemicolon) {
2088                $this->error(ParseErrorCode::MissingSemicolonAfterCharacterReference);
2089            }
2090            $this->tempBuffer = NamedCharacterReferences::TABLE[$bestMatch];
2091            $this->advance($bestLen);
2092            $this->reconsume = false;
2093            $this->flushTempBufferToCharOrAttribute();
2094            $this->state = $this->returnState;
2095            return;
2096        }
2097
2098        // No match — flush "&" and fall through to the ambiguous-ampersand state
2099        // to consume any remaining ASCII alphanumerics + ";" without decoding.
2100        $this->flushTempBufferToCharOrAttribute();
2101        $this->state = TokenizerState::AmbiguousAmpersand;
2102    }
2103
2104    private function stateAmbiguousAmpersand(): void
2105    {
2106        $c = $this->consume();
2107        if ($c !== null && self::isAsciiAlphanumeric($c)) {
2108            $inAttribute = $this->returnState === TokenizerState::AttributeValueDoubleQuoted
2109                || $this->returnState === TokenizerState::AttributeValueSingleQuoted
2110                || $this->returnState === TokenizerState::AttributeValueUnquoted;
2111            if ($inAttribute) {
2112                $this->appendToCurrentAttributeValue($this->currentTokenAsTag(), $c);
2113            } else {
2114                $this->emitChar($c);
2115            }
2116            return;
2117        }
2118        if ($c === ';') {
2119            $this->error(ParseErrorCode::UnknownNamedCharacterReference);
2120        }
2121        $this->reconsumeIn($this->returnState);
2122    }
2123
2124    private function stateNumericCharacterReference(): void
2125    {
2126        $this->characterReferenceCode = 0;
2127        $c = $this->consume();
2128        if ($c === 'x' || $c === 'X') {
2129            $this->tempBuffer .= $c;
2130            $this->state = TokenizerState::HexadecimalCharacterReferenceStart;
2131            return;
2132        }
2133        $this->reconsumeIn(TokenizerState::DecimalCharacterReferenceStart);
2134    }
2135
2136    private function stateHexadecimalCharacterReferenceStart(): void
2137    {
2138        $c = $this->consume();
2139        if ($c !== null && self::isAsciiHexDigit($c)) {
2140            $this->reconsumeIn(TokenizerState::HexadecimalCharacterReference);
2141            return;
2142        }
2143        $this->error(ParseErrorCode::AbsenceOfDigitsInNumericCharacterReference);
2144        $this->flushTempBufferToCharOrAttribute();
2145        $this->reconsumeIn($this->returnState);
2146    }
2147
2148    private function stateDecimalCharacterReferenceStart(): void
2149    {
2150        $c = $this->consume();
2151        if ($c !== null && ctype_digit($c)) {
2152            $this->reconsumeIn(TokenizerState::DecimalCharacterReference);
2153            return;
2154        }
2155        $this->error(ParseErrorCode::AbsenceOfDigitsInNumericCharacterReference);
2156        $this->flushTempBufferToCharOrAttribute();
2157        $this->reconsumeIn($this->returnState);
2158    }
2159
2160    private function stateHexadecimalCharacterReference(): void
2161    {
2162        $c = $this->consume();
2163        if ($c === null) {
2164            $this->error(ParseErrorCode::MissingSemicolonAfterCharacterReference);
2165            $this->state = TokenizerState::NumericCharacterReferenceEnd;
2166            $this->reconsume = true;
2167            return;
2168        }
2169        if (ctype_digit($c)) {
2170            $this->characterReferenceCode = $this->characterReferenceCode * 16 + (ord($c) - 0x30);
2171            return;
2172        }
2173        if ($c >= 'A' && $c <= 'F') {
2174            $this->characterReferenceCode = $this->characterReferenceCode * 16 + (ord($c) - 0x37);
2175            return;
2176        }
2177        if ($c >= 'a' && $c <= 'f') {
2178            $this->characterReferenceCode = $this->characterReferenceCode * 16 + (ord($c) - 0x57);
2179            return;
2180        }
2181        if ($c === ';') {
2182            $this->state = TokenizerState::NumericCharacterReferenceEnd;
2183            return;
2184        }
2185        $this->error(ParseErrorCode::MissingSemicolonAfterCharacterReference);
2186        $this->reconsumeIn(TokenizerState::NumericCharacterReferenceEnd);
2187    }
2188
2189    private function stateDecimalCharacterReference(): void
2190    {
2191        $c = $this->consume();
2192        if ($c === null) {
2193            $this->error(ParseErrorCode::MissingSemicolonAfterCharacterReference);
2194            $this->state = TokenizerState::NumericCharacterReferenceEnd;
2195            $this->reconsume = true;
2196            return;
2197        }
2198        if (ctype_digit($c)) {
2199            $this->characterReferenceCode = $this->characterReferenceCode * 10 + (ord($c) - 0x30);
2200            return;
2201        }
2202        if ($c === ';') {
2203            $this->state = TokenizerState::NumericCharacterReferenceEnd;
2204            return;
2205        }
2206        $this->error(ParseErrorCode::MissingSemicolonAfterCharacterReference);
2207        $this->reconsumeIn(TokenizerState::NumericCharacterReferenceEnd);
2208    }
2209
2210    private function stateNumericCharacterReferenceEnd(): void
2211    {
2212        $code = $this->characterReferenceCode;
2213        if ($code === 0) {
2214            $this->error(ParseErrorCode::NullCharacterReference);
2215            $code = 0xFFFD;
2216        } elseif ($code > 0x10FFFF) {
2217            $this->error(ParseErrorCode::CharacterReferenceOutsideUnicodeRange);
2218            $code = 0xFFFD;
2219        } elseif ($code >= 0xD800 && $code <= 0xDFFF) {
2220            $this->error(ParseErrorCode::SurrogateCharacterReference);
2221            $code = 0xFFFD;
2222        } elseif (isset(NamedCharacterReferences::NUMERIC_REPLACEMENTS[$code])) {
2223            $this->error(ParseErrorCode::ControlCharacterReference);
2224            $code = NamedCharacterReferences::NUMERIC_REPLACEMENTS[$code];
2225        }
2226        $this->tempBuffer = mb_chr($code, 'UTF-8');
2227        $this->flushTempBufferToCharOrAttribute();
2228        $this->state = $this->returnState;
2229    }
2230
2231    private function flushTempBufferToCharOrAttribute(): void
2232    {
2233        $inAttribute = $this->returnState === TokenizerState::AttributeValueDoubleQuoted
2234            || $this->returnState === TokenizerState::AttributeValueSingleQuoted
2235            || $this->returnState === TokenizerState::AttributeValueUnquoted;
2236        if ($inAttribute) {
2237            $this->appendToCurrentAttributeValue($this->currentTokenAsTag(), $this->tempBuffer);
2238        } else {
2239            if ($this->tempBuffer !== '') {
2240                $this->emitChar($this->tempBuffer);
2241            }
2242        }
2243        $this->tempBuffer = '';
2244    }
2245
2246    // ============================================================
2247    // Tag finalisation
2248    // ============================================================
2249    private function finalizeAndEmitTag(): void
2250    {
2251        $tag = $this->currentTokenAsTag();
2252        $this->dedupAttributes($tag);
2253        if ($tag instanceof EndTagToken && count($tag->attributes) > 0) {
2254            $this->error(ParseErrorCode::EndTagWithAttributes);
2255        }
2256        $this->emit($tag);
2257    }
2258
2259    // ============================================================
2260    // Character classification helpers
2261    // ============================================================
2262    private static function isAsciiAlpha(string $c): bool
2263    {
2264        return ($c >= 'a' && $c <= 'z') || ($c >= 'A' && $c <= 'Z');
2265    }
2266
2267    private static function isAsciiUpperAlpha(string $c): bool
2268    {
2269        return $c >= 'A' && $c <= 'Z';
2270    }
2271
2272    private static function isAsciiLowerAlpha(string $c): bool
2273    {
2274        return $c >= 'a' && $c <= 'z';
2275    }
2276
2277    private static function isAsciiAlphanumeric(string $c): bool
2278    {
2279        return self::isAsciiAlpha($c) || ctype_digit($c);
2280    }
2281
2282    private static function isAsciiHexDigit(string $c): bool
2283    {
2284        return ctype_digit($c) || ($c >= 'A' && $c <= 'F') || ($c >= 'a' && $c <= 'f');
2285    }
2286}