Code Coverage for /home/runner/work/phpdftk/phpdftk/packages/text/src/Bidi.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	82.98% covered (warning)	82.98%	78 / 94	50.00% covered (danger)	50.00%	2 / 4	CRAP	0.00% covered (danger)	0.00%	0 / 1
Bidi	82.98% covered (warning)	82.98%	78 / 94	50.00% covered (danger)	50.00%	2 / 4	47.89	0.00% covered (danger)	0.00%	0 / 1
analyze	100.00% covered (success)	100.00%	43 / 43	100.00% covered (success)	100.00%	1 / 1	17
classify	77.78% covered (warning)	77.78%	7 / 9	0.00% covered (danger)	0.00%	0 / 1	7.54
neutralLevel	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	10
decodeUtf8	50.00% covered (danger)	50.00%	14 / 28	0.00% covered (danger)	0.00%	0 / 1	10.50

1	<?php
2
3	declare(strict_types=1);
4
5	namespace Phpdftk\Text;
6
/**
 * UAX #9 bidi analyser — Phase-1 implementation.
 *
 * The algorithm here is the "common path" subset sufficient for the MVP's
 * Latin / CJK / Cyrillic / Greek scope. It implements:
 *
 *  - P2 / P3: base direction detection (first strong char wins,
 *    LTR fallback) and base level assignment.
 *  - L → 0 / R, AL → 1 strong-type level assignment.
 *  - N1 (partial): neutrals between same-direction chars take that
 *    direction; neutrals between LTR and RTL adopt the base direction.
 *  - Run consolidation: contiguous same-level characters fuse into
 *    one `BidiRun`.
 *
 * Notably NOT implemented yet (Phase 2):
 *
 *  - Explicit embedding controls (RLE / LRE / RLO / LRO / PDF) and the
 *    isolate controls (RLI / LRI / FSI / PDI). They are treated as
 *    neutrals here.
 *  - Weak types W1–W7 (Arabic shaping interactions, European number
 *    separators, common number separators around digits). Digits are
 *    currently resolved exactly like neutrals.
 *  - I1 / I2 implicit level resolution (separate odd/even-level rules
 *    for AN/EN/L).
 *  - Bracket pair handling (BD16 / N0).
 *  - L1 separator reset.
 *
 * For pure-LTR documents (the MVP target), all of these are no-ops, so
 * the simplified algorithm produces spec-correct output. RTL documents
 * with embedded numbers / parentheses / quotes will produce
 * approximations that improve over time as the Phase-2 work lands.
 *
 * Byte offsets follow UTF-8 semantics throughout: the input is walked with
 * the class's own UTF-8 decoder (`decodeUtf8()`), and run boundaries are
 * expressed as byte positions into the original string.
 */
final class Bidi
{
    /**
     * Split `$text` into runs of constant embedding level.
     *
     * An empty string yields a result with no runs; in that case
     * `BidiBase::Auto` resolves to `BidiBase::Ltr`.
     *
     * @param string   $text UTF-8 text; malformed bytes are analysed as U+FFFD.
     * @param BidiBase $base Paragraph direction, or `Auto` to take it from the
     *                       first strong character (LTR when there is none).
     */
    public function analyze(string $text, BidiBase $base = BidiBase::Auto): BidiResult
    {
        if ($text === '') {
            return new BidiResult(
                $base === BidiBase::Auto ? BidiBase::Ltr : $base,
                [],
            );
        }

        // Per-codepoint direction, indexed in parallel with $chars.
        $chars = self::decodeUtf8($text);
        $directions = [];
        foreach ($chars as $i => $entry) {
            $directions[$i] = $this->classify($entry['codepoint']);
        }

        // P2 / P3: the first strong character decides an automatic base.
        $resolvedBase = $base;
        if ($resolvedBase === BidiBase::Auto) {
            $resolvedBase = BidiBase::Ltr;
            foreach ($directions as $d) {
                if ($d === 'L') {
                    break;
                }
                if ($d === 'R') {
                    $resolvedBase = BidiBase::Rtl;
                    break;
                }
            }
        }
        $baseLevel = $resolvedBase === BidiBase::Rtl ? 1 : 0;

        $levels = self::resolveLevels($directions, $baseLevel);

        // Consolidate contiguous same-level characters into runs. The loop
        // runs one step past the end so the final run is flushed as well.
        $runs = [];
        $count = count($chars);
        $startIdx = 0;
        for ($i = 1; $i <= $count; $i++) {
            if ($i === $count || $levels[$i] !== $levels[$startIdx]) {
                $startByte = $chars[$startIdx]['byteOffset'];
                $endByte = $i === $count
                    ? strlen($text)
                    : $chars[$i]['byteOffset'];
                $runs[] = new BidiRun(
                    offset: $startByte,
                    length: $endByte - $startByte,
                    level: $levels[$startIdx],
                );
                $startIdx = $i;
            }
        }

        return new BidiResult($resolvedBase, $runs);
    }

    /**
     * Classify a codepoint into the direction categories this phase
     * distinguishes. Returns one of `'L'`, `'R'`, `'EN'`, `'AN'`, `'WS'`,
     * `'ON'`.
     *
     * Arabic letters (AL) are merged into `'R'` because both yield level 1.
     * Every other ICU category — including the explicit embedding / isolate
     * controls and boundary neutrals — collapses to `'ON'`.
     */
    private function classify(int $cp): string
    {
        $direction = \IntlChar::charDirection($cp);

        return match ($direction) {
            \IntlChar::CHAR_DIRECTION_LEFT_TO_RIGHT => 'L',
            \IntlChar::CHAR_DIRECTION_RIGHT_TO_LEFT,
            \IntlChar::CHAR_DIRECTION_RIGHT_TO_LEFT_ARABIC => 'R',
            \IntlChar::CHAR_DIRECTION_EUROPEAN_NUMBER => 'EN',
            \IntlChar::CHAR_DIRECTION_ARABIC_NUMBER => 'AN',
            \IntlChar::CHAR_DIRECTION_WHITE_SPACE_NEUTRAL => 'WS',
            default => 'ON',
        };
    }

    /**
     * Assign an embedding level to every character in one linear pass.
     *
     * Strong characters take their natural level (`'L'` → 0, `'R'` → 1).
     * Every maximal run of non-strong characters is resolved once (N1
     * partial): when the nearest strong character before the run and the
     * nearest one after it agree, the run adopts that direction; otherwise,
     * or when either side has no strong character, it takes the base level.
     *
     * @param list<string> $directions Categories as produced by `classify()`.
     * @param int          $baseLevel  0 for an LTR paragraph, 1 for RTL.
     *
     * @return list<int> Levels indexed in parallel with `$directions`.
     */
    private static function resolveLevels(array $directions, int $baseLevel): array
    {
        $levels = [];
        $count = count($directions);
        $prevStrong = null;
        $i = 0;
        while ($i < $count) {
            $d = $directions[$i];
            if ($d === 'L' || $d === 'R') {
                $levels[] = $d === 'R' ? 1 : 0;
                $prevStrong = $d;
                $i++;
                continue;
            }

            // Locate the end of this neutral run; $end lands on the next
            // strong character, or on $count when the run reaches the end.
            $end = $i + 1;
            while ($end < $count && $directions[$end] !== 'L' && $directions[$end] !== 'R') {
                $end++;
            }
            $nextStrong = $end < $count ? $directions[$end] : null;

            $level = $baseLevel;
            if ($prevStrong !== null && $prevStrong === $nextStrong) {
                $level = $prevStrong === 'R' ? 1 : 0;
            }
            for (; $i < $end; $i++) {
                $levels[] = $level;
            }
        }

        return $levels;
    }

    /**
     * Decode a UTF-8 string into per-codepoint metadata: the codepoint
     * value plus its starting byte offset.
     *
     * Malformed input never consumes bytes that belong to a following
     * character. A stray continuation byte, a lead byte whose continuation
     * bytes are missing or invalid, an overlong encoding, an encoded
     * surrogate, or a value above U+10FFFF each produce a single U+FFFD
     * entry for the offending byte, and decoding resumes at the very next
     * byte. Every input byte therefore belongs to exactly one entry and
     * offsets are strictly increasing.
     *
     * @return list<array{codepoint: int, byteOffset: int}>
     */
    private static function decodeUtf8(string $text): array
    {
        $out = [];
        $bytes = strlen($text);
        $i = 0;
        while ($i < $bytes) {
            $lead = ord($text[$i]);

            if ($lead < 0x80) {
                $out[] = ['codepoint' => $lead, 'byteOffset' => $i];
                $i++;
                continue;
            }

            // Sequence length implied by the lead byte, plus the smallest
            // codepoint that legitimately needs that many bytes (anything
            // lower is an overlong encoding). 0x80–0xC1 and 0xF5–0xFF can
            // never start a well-formed sequence.
            [$length, $minimum] = match (true) {
                $lead >= 0xC2 && $lead <= 0xDF => [2, 0x80],
                $lead >= 0xE0 && $lead <= 0xEF => [3, 0x800],
                $lead >= 0xF0 && $lead <= 0xF4 => [4, 0x10000],
                default => [0, 0],
            };

            $cp = null;
            if ($length > 0 && $i + $length <= $bytes) {
                // Payload bits of the lead byte: 5, 4 or 3 of them.
                $cp = $lead & (0xFF >> ($length + 1));
                for ($k = 1; $k < $length; $k++) {
                    $continuation = ord($text[$i + $k]);
                    if (($continuation & 0xC0) !== 0x80) {
                        $cp = null;
                        break;
                    }
                    $cp = ($cp << 6) | ($continuation & 0x3F);
                }
                if (
                    $cp !== null
                    && ($cp < $minimum || $cp > 0x10FFFF || ($cp >= 0xD800 && $cp <= 0xDFFF))
                ) {
                    $cp = null;
                }
            }

            if ($cp === null) {
                // Replace only the offending byte so the decoder
                // resynchronises on whatever follows it.
                $out[] = ['codepoint' => 0xFFFD, 'byteOffset' => $i];
                $i++;
                continue;
            }

            $out[] = ['codepoint' => $cp, 'byteOffset' => $i];
            $i += $length;
        }

        return $out;
    }
}