Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
81.23% covered (warning)
81.23%
277 / 341
55.56% covered (warning)
55.56%
15 / 27
CRAP
0.00% covered (danger)
0.00%
0 / 1
Tokenizer
81.23% covered (warning)
81.23%
277 / 341
55.56% covered (warning)
55.56%
15 / 27
394.20
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
3
 tokenize
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 nextToken
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
4
 step
90.00% covered (success)
90.00%
108 / 120
0.00% covered (danger)
0.00%
0 / 1
40.52
 consumeComments
91.67% covered (success)
91.67%
11 / 12
0.00% covered (danger)
0.00%
0 / 1
7.03
 consumeString
50.00% covered (danger)
50.00%
11 / 22
0.00% covered (danger)
0.00%
0 / 1
16.00
 consumeIdentLikeToken
92.31% covered (success)
92.31%
12 / 13
0.00% covered (danger)
0.00%
0 / 1
8.03
 consumeUrlToken
41.38% covered (danger)
41.38%
12 / 29
0.00% covered (danger)
0.00%
0 / 1
60.32
 consumeRemnantsOfBadUrl
0.00% covered (danger)
0.00%
0 / 10
0.00% covered (danger)
0.00%
0 / 1
56
 consumeIdentSequence
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
5
 consumeNumericToken
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
3
 consumeNumber
93.94% covered (success)
93.94%
31 / 33
0.00% covered (danger)
0.00%
0 / 1
16.06
 consumeEscape
80.95% covered (warning)
80.95%
17 / 21
0.00% covered (danger)
0.00%
0 / 1
14.17
 isValidEscape
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 wouldStartIdentSequence
90.91% covered (success)
90.91%
10 / 11
0.00% covered (danger)
0.00%
0 / 1
8.05
 wouldStartNumber
75.00% covered (warning)
75.00%
9 / 12
0.00% covered (danger)
0.00%
0 / 1
7.77
 preprocess
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 peek
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 advance
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 emit
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 isWhitespace
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
3
 isDigit
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
2
 isHexDigit
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
5
 isLetter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
4
 isIdentStartCodePoint
80.00% covered (warning)
80.00%
4 / 5
0.00% covered (danger)
0.00%
0 / 1
4.13
 isIdentCodePoint
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
3
 isNonPrintable
85.71% covered (warning)
85.71%
6 / 7
0.00% covered (danger)
0.00%
0 / 1
7.14
1<?php
2
3declare(strict_types=1);
4
5namespace Phpdftk\Css;
6
7use Phpdftk\Css\Token\AtKeywordToken;
8use Phpdftk\Css\Token\BadStringToken;
9use Phpdftk\Css\Token\BadUrlToken;
10use Phpdftk\Css\Token\CdcToken;
11use Phpdftk\Css\Token\CdoToken;
12use Phpdftk\Css\Token\ColonToken;
13use Phpdftk\Css\Token\CommaToken;
14use Phpdftk\Css\Token\DelimToken;
15use Phpdftk\Css\Token\DimensionToken;
16use Phpdftk\Css\Token\EofToken;
17use Phpdftk\Css\Token\FunctionToken;
18use Phpdftk\Css\Token\HashToken;
19use Phpdftk\Css\Token\HashTokenType;
20use Phpdftk\Css\Token\IdentToken;
21use Phpdftk\Css\Token\LeftBraceToken;
22use Phpdftk\Css\Token\LeftBracketToken;
23use Phpdftk\Css\Token\LeftParenToken;
24use Phpdftk\Css\Token\NumberToken;
25use Phpdftk\Css\Token\NumberTokenType;
26use Phpdftk\Css\Token\PercentageToken;
27use Phpdftk\Css\Token\RightBraceToken;
28use Phpdftk\Css\Token\RightBracketToken;
29use Phpdftk\Css\Token\RightParenToken;
30use Phpdftk\Css\Token\SemicolonToken;
31use Phpdftk\Css\Token\StringToken;
32use Phpdftk\Css\Token\Token;
33use Phpdftk\Css\Token\UrlToken;
34use Phpdftk\Css\Token\WhitespaceToken;
35
/**
 * CSS Syntax Module 3 tokenizer (§4). Walks the preprocessed input
 * code point by code point, dispatching on the next code point into one of
 * the "consume X" sub-procedures.
 *
 * Preprocessing per §3.3 is applied once, at construction: CR / CRLF / FF
 * become LF and NULL becomes U+FFFD. Comments are stripped (not emitted as
 * tokens).
 *
 * Both `tokenize()` (returns full array) and `nextToken()` (streaming) are
 * provided, mirroring the html tokenizer's contract so downstream parsers
 * can drive either way.
 */
final class Tokenizer
{
    /** @var list<string> input as an array of UTF-8 single-codepoint strings */
    private array $chars;
    /** Number of entries in $chars. */
    private int $length;
    /** Index into $chars of the next unconsumed code point. */
    private int $pos = 0;
    /** @var list<Token> every token produced so far, in emission order */
    private array $emitted = [];
    /** Index into $emitted of the next token nextToken() hands out. */
    private int $emittedCursor = 0;
    /** True once the EOF token has been emitted. */
    private bool $done = false;

    /**
     * @param string $input raw stylesheet text; it is preprocessed and split
     *                      into UTF-8 code points up front
     */
    public function __construct(string $input)
    {
        $normalised = $this->preprocess($input);
        $this->chars = $normalised === '' ? [] : (mb_str_split($normalised, 1, 'UTF-8') ?: []);
        $this->length = count($this->chars);
    }

    /**
     * Runs the tokenizer to the end of input.
     *
     * @return list<Token> every token emitted so far (including any already
     *                     handed out by nextToken()), ending with the EOF token
     */
    public function tokenize(): array
    {
        while (!$this->done) {
            $this->step();
        }
        return $this->emitted;
    }

    /**
     * Streaming access: returns the next not-yet-delivered token, producing
     * more on demand. Returns null once the EOF token has been delivered.
     */
    public function nextToken(): ?Token
    {
        while ($this->emittedCursor >= count($this->emitted) && !$this->done) {
            $this->step();
        }
        if ($this->emittedCursor < count($this->emitted)) {
            return $this->emitted[$this->emittedCursor++];
        }
        return null;
    }

    /**
     * Consumes and emits exactly one token (§4.3.1). Emits the EOF token and
     * marks the tokenizer done when the input is exhausted.
     */
    private function step(): void
    {
        // Comments are stripped here so the dispatch below doesn't see them.
        $this->consumeComments();
        $c = $this->peek(0);
        if ($c === null) {
            $this->emit(new EofToken());
            $this->done = true;
            return;
        }
        if (self::isWhitespace($c)) {
            // A whole run of whitespace collapses into a single token.
            $this->skipWhitespace();
            $this->emit(new WhitespaceToken());
            return;
        }

        // Code points that always map to one fixed token.
        $punctuation = match ($c) {
            '(' => new LeftParenToken(),
            ')' => new RightParenToken(),
            ',' => new CommaToken(),
            ':' => new ColonToken(),
            ';' => new SemicolonToken(),
            '[' => new LeftBracketToken(),
            ']' => new RightBracketToken(),
            '{' => new LeftBraceToken(),
            '}' => new RightBraceToken(),
            default => null,
        };
        if ($punctuation !== null) {
            $this->advance();
            $this->emit($punctuation);
            return;
        }

        if ($c === '"' || $c === "'") {
            $this->advance(); // consume the opening quote
            $this->emit($this->consumeString($c));
            return;
        }
        if (self::isDigit($c)) {
            $this->emit($this->consumeNumericToken());
            return;
        }
        if (self::isIdentStartCodePoint($c)) {
            $this->emit($this->consumeIdentLikeToken());
            return;
        }

        // Code points whose meaning depends on lookahead. A null result means
        // "nothing special follows": the code point becomes a plain delimiter.
        $token = match ($c) {
            '#' => $this->tryConsumeHash(),
            '+', '.' => $this->wouldStartNumber(0) ? $this->consumeNumericToken() : null,
            '-' => $this->tryConsumeAfterHyphen(),
            '<' => $this->tryConsumeCdo(),
            '@' => $this->tryConsumeAtKeyword(),
            '\\' => $this->isValidEscape(0) ? $this->consumeIdentLikeToken() : null,
            default => null,
        };
        if ($token === null) {
            $this->advance();
            $token = new DelimToken($c);
        }
        $this->emit($token);
    }

    /**
     * Positioned on '#': consumes a hash token when an ident code point or a
     * valid escape follows; otherwise consumes nothing and returns null.
     */
    private function tryConsumeHash(): ?Token
    {
        $next = $this->peek(1);
        if ($next === null || !(self::isIdentCodePoint($next) || $this->isValidEscape(1))) {
            return null;
        }
        $this->advance(); // consume '#'
        $type = $this->wouldStartIdentSequence(0) ? HashTokenType::Id : HashTokenType::Unrestricted;
        return new HashToken($this->consumeIdentSequence(), $type);
    }

    /**
     * Positioned on '-': tries, in order, a number, the CDC marker `-->`, and
     * an ident-like token. Returns null (nothing consumed) if none applies.
     */
    private function tryConsumeAfterHyphen(): ?Token
    {
        if ($this->wouldStartNumber(0)) {
            return $this->consumeNumericToken();
        }
        if ($this->peek(1) === '-' && $this->peek(2) === '>') {
            $this->pos += 3;
            return new CdcToken();
        }
        if ($this->wouldStartIdentSequence(0)) {
            return $this->consumeIdentLikeToken();
        }
        return null;
    }

    /**
     * Positioned on '<': consumes the CDO marker `<!--` if present, otherwise
     * consumes nothing and returns null.
     */
    private function tryConsumeCdo(): ?Token
    {
        if ($this->peek(1) === '!' && $this->peek(2) === '-' && $this->peek(3) === '-') {
            $this->pos += 4;
            return new CdoToken();
        }
        return null;
    }

    /**
     * Positioned on '@': consumes an at-keyword when an ident sequence
     * follows, otherwise consumes nothing and returns null.
     */
    private function tryConsumeAtKeyword(): ?Token
    {
        if (!$this->wouldStartIdentSequence(1)) {
            return null;
        }
        $this->advance(); // consume '@'
        return new AtKeywordToken($this->consumeIdentSequence());
    }

    // ============================================================
    // Sub-procedures
    // ============================================================

    /**
     * Skips any number of consecutive comments (§4.3.2). An unterminated
     * comment swallows the rest of the input.
     */
    private function consumeComments(): void
    {
        while ($this->peek(0) === '/' && $this->peek(1) === '*') {
            $this->advance();
            $this->advance();
            while (true) {
                $c = $this->peek(0);
                if ($c === null) {
                    return; // EOF inside comment is a parse error per spec; we just stop.
                }
                if ($c === '*' && $this->peek(1) === '/') {
                    $this->advance();
                    $this->advance();
                    break;
                }
                $this->advance();
            }
        }
    }

    /**
     * Consumes a string body (§4.3.5); the opening quote is already consumed.
     *
     * @param string $terminator the quote character that closes the string
     * @return Token a StringToken, or a BadStringToken when an unescaped
     *               newline is hit (the newline itself is left unconsumed)
     */
    private function consumeString(string $terminator): Token
    {
        $buf = '';
        while (true) {
            $c = $this->peek(0);
            if ($c === null) {
                return new StringToken($buf); // EOF — parse error per spec; return what we have.
            }
            if ($c === $terminator) {
                $this->advance();
                return new StringToken($buf);
            }
            if ($c === "\n") {
                return new BadStringToken();
            }
            if ($c === '\\') {
                if ($this->peek(1) === null) {
                    // Backslash at EOF contributes nothing.
                    $this->advance();
                    continue;
                }
                if ($this->peek(1) === "\n") {
                    // Escaped newline is a line continuation: drop both.
                    $this->advance();
                    $this->advance();
                    continue;
                }
                $buf .= $this->consumeEscape();
                continue;
            }
            $buf .= $c;
            $this->advance();
        }
    }

    /**
     * Consumes an ident, function, url or bad-url token (§4.3.4).
     *
     * For `url(` the spec keeps ONE whitespace code point in the stream when
     * a quoted argument follows, so `url( "x")` tokenizes as function,
     * whitespace, string. Only whitespace beyond that one is discarded here.
     */
    private function consumeIdentLikeToken(): Token
    {
        $name = $this->consumeIdentSequence();
        if ($this->peek(0) !== '(') {
            return new IdentToken($name);
        }
        $this->advance(); // consume '('
        if (strcasecmp($name, 'url') !== 0) {
            return new FunctionToken($name);
        }

        // "While the next two input code points are whitespace, consume the
        // next input code point."
        while (self::isWhitespace($this->peek(0) ?? '') && self::isWhitespace($this->peek(1) ?? '')) {
            $this->advance();
        }
        $first = $this->peek(0);
        $second = $this->peek(1);
        $quoteFollows = $first === '"' || $first === "'"
            || (self::isWhitespace($first ?? '') && ($second === '"' || $second === "'"));
        if ($quoteFollows) {
            // url() called with a string argument is an ordinary function.
            return new FunctionToken($name);
        }
        return $this->consumeUrlToken();
    }

    /**
     * Consumes the body of an unquoted `url(` (§4.3.6); the `url(` prefix is
     * already consumed.
     *
     * @return Token a UrlToken, or a BadUrlToken after discarding the remnants
     */
    private function consumeUrlToken(): Token
    {
        $buf = '';
        $this->skipWhitespace(); // leading whitespace is not part of the URL
        while (true) {
            $c = $this->peek(0);
            if ($c === null) {
                return new UrlToken($buf);
            }
            if ($c === ')') {
                $this->advance();
                return new UrlToken($buf);
            }
            if (self::isWhitespace($c)) {
                // Whitespace is only allowed right before ')' or EOF.
                $this->skipWhitespace();
                $after = $this->peek(0);
                if ($after === ')') {
                    $this->advance();
                    return new UrlToken($buf);
                }
                if ($after === null) {
                    return new UrlToken($buf);
                }
                $this->consumeRemnantsOfBadUrl();
                return new BadUrlToken();
            }
            if ($c === '"' || $c === "'" || $c === '(' || self::isNonPrintable($c)) {
                $this->consumeRemnantsOfBadUrl();
                return new BadUrlToken();
            }
            if ($c === '\\') {
                if ($this->isValidEscape(0)) {
                    $buf .= $this->consumeEscape();
                    continue;
                }
                $this->consumeRemnantsOfBadUrl();
                return new BadUrlToken();
            }
            $buf .= $c;
            $this->advance();
        }
    }

    /**
     * Discards input up to and including the next unescaped ')' or up to EOF
     * (§4.3.14), so tokenizing can resume after a malformed url.
     */
    private function consumeRemnantsOfBadUrl(): void
    {
        while (true) {
            $c = $this->peek(0);
            if ($c === null) {
                return;
            }
            if ($c === ')') {
                $this->advance();
                return;
            }
            if ($this->isValidEscape(0)) {
                // Skip the escape as a unit so an escaped ')' does not end the url.
                $this->consumeEscape();
                continue;
            }
            $this->advance();
        }
    }

    /**
     * Consumes the longest run of ident code points and escapes (§4.3.11).
     *
     * @return string the decoded sequence (escapes resolved)
     */
    private function consumeIdentSequence(): string
    {
        $out = '';
        while (true) {
            $c = $this->peek(0);
            if ($c === null) {
                return $out;
            }
            if (self::isIdentCodePoint($c)) {
                $out .= $c;
                $this->advance();
                continue;
            }
            if ($this->isValidEscape(0)) {
                $out .= $this->consumeEscape();
                continue;
            }
            return $out;
        }
    }

    /**
     * Consumes a number and classifies it by what follows (§4.3.3): an ident
     * sequence makes a dimension, '%' a percentage, anything else a number.
     */
    private function consumeNumericToken(): Token
    {
        [$value, $type] = $this->consumeNumber();
        if ($this->wouldStartIdentSequence(0)) {
            $unit = $this->consumeIdentSequence();
            return new DimensionToken($value, $unit, $type);
        }
        if ($this->peek(0) === '%') {
            $this->advance();
            return new PercentageToken($value);
        }
        return new NumberToken($value, $type);
    }

    /**
     * Consumes an optionally signed number with optional fraction and
     * exponent (§4.3.12). The caller guarantees a number starts here.
     *
     * @return array{0: float, 1: NumberTokenType} the value and whether it
     *         was written as an integer or as a number (fraction/exponent)
     */
    private function consumeNumber(): array
    {
        $type = NumberTokenType::Integer;
        $buf = '';

        $sign = $this->peek(0);
        if ($sign === '+' || $sign === '-') {
            $buf .= $sign;
            $this->advance();
        }
        $buf .= $this->consumeDigits();

        // Fraction: only when the '.' is directly followed by a digit.
        if ($this->peek(0) === '.' && self::isDigit($this->peek(1) ?? '')) {
            $this->advance();
            $buf .= '.' . $this->consumeDigits();
            $type = NumberTokenType::Number;
        }

        // Exponent: e/E, optional sign, then at least one digit.
        $marker = $this->peek(0);
        if ($marker === 'e' || $marker === 'E') {
            $next = $this->peek(1);
            $signed = $next === '+' || $next === '-';
            $firstDigit = $signed ? $this->peek(2) : $next;
            if (self::isDigit($firstDigit ?? '')) {
                $buf .= $marker;
                $this->advance();
                if ($signed) {
                    $buf .= $next;
                    $this->advance();
                }
                $buf .= $this->consumeDigits();
                $type = NumberTokenType::Number;
            }
        }
        return [(float) $buf, $type];
    }

    /** Consumes a (possibly empty) run of ASCII digits and returns it. */
    private function consumeDigits(): string
    {
        $digits = '';
        while (($c = $this->peek(0)) !== null && self::isDigit($c)) {
            $digits .= $c;
            $this->advance();
        }
        return $digits;
    }

    /**
     * Consumes an escape (§4.3.7) and returns the code point it denotes.
     * Must be called while positioned on the backslash.
     *
     * @return string the escaped code point; U+FFFD for EOF, zero,
     *                surrogates and values above U+10FFFF
     */
    private function consumeEscape(): string
    {
        // Caller has positioned us on '\\'; advance past it.
        $this->advance();
        $c = $this->peek(0);
        if ($c === null) {
            return "\u{FFFD}";
        }
        if (!self::isHexDigit($c)) {
            // Any other code point stands for itself.
            $this->advance();
            return $c;
        }

        // Up to six hex digits, optionally terminated by one whitespace.
        $hex = '';
        for ($i = 0; $i < 6; $i++) {
            $n = $this->peek(0);
            if ($n === null || !self::isHexDigit($n)) {
                break;
            }
            $hex .= $n;
            $this->advance();
        }
        $next = $this->peek(0);
        if ($next !== null && self::isWhitespace($next)) {
            $this->advance();
        }
        $cp = (int) hexdec($hex);
        if ($cp === 0 || $cp > 0x10FFFF || ($cp >= 0xD800 && $cp <= 0xDFFF)) {
            return "\u{FFFD}";
        }
        return mb_chr($cp, 'UTF-8') ?: "\u{FFFD}";
    }

    // ============================================================
    // Lookahead helpers (CSS Syntax 3 §4.3.8 / §4.3.9)
    // ============================================================

    /**
     * True when the code points at $offset and $offset + 1 form a valid
     * escape: a backslash not followed by a newline. A backslash at EOF
     * counts as valid; consumeEscape() turns it into U+FFFD.
     */
    private function isValidEscape(int $offset): bool
    {
        if ($this->peek($offset) !== '\\') {
            return false;
        }
        return $this->peek($offset + 1) !== "\n";
    }

    /** True when the code points starting at $offset would start an ident sequence. */
    private function wouldStartIdentSequence(int $offset): bool
    {
        $c1 = $this->peek($offset);
        if ($c1 === '-') {
            $c2 = $this->peek($offset + 1);
            if ($c2 !== null && (self::isIdentStartCodePoint($c2) || $c2 === '-')) {
                return true;
            }
            return $this->isValidEscape($offset + 1);
        }
        if ($c1 !== null && self::isIdentStartCodePoint($c1)) {
            return true;
        }
        // isValidEscape() itself checks for the leading backslash.
        return $this->isValidEscape($offset);
    }

    /** True when the code points starting at $offset would start a number. */
    private function wouldStartNumber(int $offset): bool
    {
        $c1 = $this->peek($offset);
        if ($c1 === '+' || $c1 === '-') {
            $c2 = $this->peek($offset + 1);
            if (self::isDigit($c2 ?? '')) {
                return true;
            }
            if ($c2 === '.') {
                $c3 = $this->peek($offset + 2);
                return self::isDigit($c3 ?? '');
            }
            return false;
        }
        if ($c1 === '.') {
            return self::isDigit($this->peek($offset + 1) ?? '');
        }
        return $c1 !== null && self::isDigit($c1);
    }

    // ============================================================
    // I/O helpers
    // ============================================================

    /** Normalises CRLF / CR / FF to LF and replaces NULL with U+FFFD. */
    private function preprocess(string $input): string
    {
        // "\r\n" is listed first so a CRLF pair becomes one LF, not two.
        $input = str_replace(["\r\n", "\r", "\f"], "\n", $input);
        return str_replace("\0", "\u{FFFD}", $input);
    }

    /** Returns the code point $offset positions ahead, or null past the end. */
    private function peek(int $offset): ?string
    {
        $i = $this->pos + $offset;
        return $i < $this->length ? $this->chars[$i] : null;
    }

    /** Consumes one code point. */
    private function advance(): void
    {
        $this->pos++;
    }

    /** Consumes a (possibly empty) run of whitespace. */
    private function skipWhitespace(): void
    {
        while (($next = $this->peek(0)) !== null && self::isWhitespace($next)) {
            $this->advance();
        }
    }

    /** Appends a token to the output list. */
    private function emit(Token $t): void
    {
        $this->emitted[] = $t;
    }

    // ============================================================
    // Character classification
    // ============================================================
    // All helpers expect a single code point (or '' standing in for EOF).

    /** Space, tab or LF; CR and FF were already folded into LF by preprocess(). */
    private static function isWhitespace(string $c): bool
    {
        return $c === ' ' || $c === "\t" || $c === "\n";
    }

    /** ASCII digit 0-9. */
    private static function isDigit(string $c): bool
    {
        return $c >= '0' && $c <= '9';
    }

    /** ASCII hex digit 0-9, A-F or a-f. */
    private static function isHexDigit(string $c): bool
    {
        return self::isDigit($c) || ($c >= 'A' && $c <= 'F') || ($c >= 'a' && $c <= 'f');
    }

    /** ASCII letter A-Z or a-z. */
    private static function isLetter(string $c): bool
    {
        return ($c >= 'a' && $c <= 'z') || ($c >= 'A' && $c <= 'Z');
    }

    /** ASCII letter, underscore, or any code point at or above U+0080. */
    private static function isIdentStartCodePoint(string $c): bool
    {
        if ($c === '') {
            return false; // mb_ord() rejects the empty string
        }
        if (self::isLetter($c) || $c === '_') {
            return true;
        }
        // mb_ord() yields false for a malformed sequence; treat that as "no".
        $cp = mb_ord($c, 'UTF-8');
        return $cp !== false && $cp >= 0x80;
    }

    /** Ident-start code point, ASCII digit, or hyphen. */
    private static function isIdentCodePoint(string $c): bool
    {
        return self::isIdentStartCodePoint($c) || self::isDigit($c) || $c === '-';
    }

    /** U+0000-U+0008, U+000B, U+000E-U+001F or U+007F. */
    private static function isNonPrintable(string $c): bool
    {
        $cp = mb_ord($c, 'UTF-8');
        if ($cp === false) {
            return false;
        }
        return ($cp >= 0x00 && $cp <= 0x08)
            || $cp === 0x0B
            || ($cp >= 0x0E && $cp <= 0x1F)
            || $cp === 0x7F;
    }
}