Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
73.68% |
14 / 19 |
|
66.67% |
2 / 3 |
CRAP | |
0.00% |
0 / 1 |
| Parser | |
73.68% |
14 / 19 |
|
66.67% |
2 / 3 |
13.21 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| parseDocument | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| parseFragment | |
66.67% |
10 / 15 |
|
0.00% |
0 / 1 |
12.00 | |||
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\Html; |
| 6 | |
| 7 | use Phpdftk\Html\Dom\Document; |
| 8 | use Phpdftk\Html\Dom\DocumentFragment; |
| 9 | use Phpdftk\Html\Dom\Element; |
| 10 | use Phpdftk\Html\Tokenizer\Tokenizer; |
| 11 | use Phpdftk\Html\TreeConstruction\TreeBuilder; |
| 12 | |
| 13 | /** |
| 14 | * WHATWG HTML5 parser entry point. Hand-rolls the tokenizer (§13.2.5) and |
| 15 | * tree-construction state machine (§13.2.6) — no `libxml`, no DOM extension. |
| 16 | * |
| 17 | * The public surface is intentionally tiny: parseDocument() for a full HTML |
| 18 | * document and parseFragment() for innerHTML-style operations and HTML |
| 19 | * embedded in SVG <foreignObject>. |
| 20 | * |
| 21 | * Implementation is staged across Phase 1B sub-phases: |
| 22 | * - 1B.1: public DOM types and parser shell (this file). |
| 23 | * - 1B.2: tokenizer state machine. |
| 24 | * - 1B.3: tree-construction insertion modes. |
| 25 | * - 1B.4: declarative-shadow-DOM tree-construction integration. |
| 26 | * - 1B.5: html5lib-tests integration to 100%. |
| 27 | */ |
final class Parser
{
    /**
     * @param ParserOptions $options Parser configuration. It is handed to every
     *                               TreeBuilder this parser creates, and its
     *                               scriptingEnabled flag is consulted when a
     *                               fragment is parsed inside <noscript>.
     */
    public function __construct(
        public readonly ParserOptions $options = new ParserOptions(),
    ) {}

    /**
     * Parse a complete HTML document.
     *
     * A Tokenizer is created over the source and driven by a TreeBuilder
     * configured with this parser's options.
     *
     * @param string      $html     The HTML source.
     * @param string|null $encoding Optional override for encoding sniffing.
     *                              NOTE(review): this method accepts the value
     *                              but never reads it — confirm whether that is
     *                              intentional at this implementation stage.
     *
     * @return Document The document returned by TreeBuilder::build().
     */
    public function parseDocument(string $html, ?string $encoding = null): Document
    {
        $lexer = new Tokenizer($html);

        return (new TreeBuilder($this->options))->build($lexer);
    }

    /**
     * Parse an HTML fragment in the context of a host element per WHATWG
     * §13.4.
     *
     * The fragment gets its own Document whose mode is copied from the
     * context element's owner document. For a context element in the HTML
     * namespace the tokenizer's initial state is chosen from the element's
     * local name: RCDATA for <title>/<textarea>, RAWTEXT for
     * <style>/<xmp>/<iframe>/<noembed>/<noframes>, script data for <script>,
     * PLAINTEXT for <plaintext>, and for <noscript> either RAWTEXT or data
     * depending on the scriptingEnabled option. Everything else is left to
     * TreeBuilder::buildFragment().
     *
     * @param string  $html    The fragment source.
     * @param Element $context The host element the fragment is parsed within.
     *
     * @return DocumentFragment The result of TreeBuilder::buildFragment().
     */
    public function parseFragment(string $html, Element $context): DocumentFragment
    {
        // Fresh document for the fragment; it takes over the mode of the
        // document that owns the context element.
        $fragmentDocument = new Document();
        $fragmentDocument->mode = $context->ownerDocument->mode;

        // The tokenizer may need to start in a non-default state, depending
        // on which element hosts the fragment.
        $lexer = new Tokenizer($html);
        $this->applyContextTokenizerState($lexer, $context);

        // The remaining setup (html root, form pointer, template stack,
        // insertion-mode reset based on the context) and the parse itself
        // are handled by TreeBuilder::buildFragment().
        $treeBuilder = new TreeBuilder($this->options, $fragmentDocument);

        return $treeBuilder->buildFragment($lexer, $context);
    }

    /**
     * Set the tokenizer's initial state from the fragment's context element.
     *
     * Only context elements in the HTML namespace change the state; for any
     * other namespace the tokenizer is left exactly as it was constructed.
     */
    private function applyContextTokenizerState(Tokenizer $tokenizer, Element $context): void
    {
        if ($context->namespaceURI !== Document::HTML_NS) {
            return;
        }

        $tokenizer->state = match ($context->localName) {
            'title', 'textarea' => \Phpdftk\Html\Tokenizer\TokenizerState::Rcdata,
            'style', 'xmp', 'iframe', 'noembed', 'noframes' => \Phpdftk\Html\Tokenizer\TokenizerState::Rawtext,
            'script' => \Phpdftk\Html\Tokenizer\TokenizerState::ScriptData,
            // <noscript> content is raw text only when scripting is enabled.
            'noscript' => $this->options->scriptingEnabled
                ? \Phpdftk\Html\Tokenizer\TokenizerState::Rawtext
                : \Phpdftk\Html\Tokenizer\TokenizerState::Data,
            'plaintext' => \Phpdftk\Html\Tokenizer\TokenizerState::Plaintext,
            default => \Phpdftk\Html\Tokenizer\TokenizerState::Data,
        };
    }
}