Code Coverage for /home/runner/work/phpdftk/phpdftk/packages/crypt/src/SaslPrep.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	96.48% covered (success)	96.48%	137 / 142	66.67% covered (warning)	66.67%	6 / 9	CRAP	0.00% covered (danger)	0.00%	0 / 1
SaslPrep	96.48% covered (success)	96.48%	137 / 142	66.67% covered (warning)	66.67%	6 / 9	106	0.00% covered (danger)	0.00%	0 / 1
prepare	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	2
map	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
normalize	83.33% covered (warning)	83.33%	5 / 6	0.00% covered (danger)	0.00%	0 / 1	3.04
checkProhibited	100.00% covered (success)	100.00%	53 / 53	100.00% covered (success)	100.00%	1 / 1	41
checkBidi	91.30% covered (success)	91.30%	21 / 23	0.00% covered (danger)	0.00%	0 / 1	10.07
isRandALCat	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	12
isLCat	100.00% covered (success)	100.00%	15 / 15	100.00% covered (success)	100.00%	1 / 1	30
readCodepoint	90.48% covered (success)	90.48%	19 / 21	0.00% covered (danger)	0.00%	0 / 1	5.02
toCodepoints	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	2

1	<?php
2
3	declare(strict_types=1);
4
5	namespace Phpdftk\Crypt;
6
/**
 * SASLprep password normalization — RFC 4013.
 *
 * Prepares Unicode strings for use as passwords in PDF 2.0 encryption
 * (ISO 32000-2 §7.6.4.3.2). Uses the Stringprep framework (RFC 3454)
 * with the SASLprep profile.
 *
 * The bidirectional category checks use simplified block ranges rather than
 * the full RFC 3454 D.1 / D.2 tables, and the "unassigned code point" check
 * (RFC 3454 Table A.1) is not performed.
 */
final class SaslPrep
{
    /**
     * Non-ASCII space characters mapped to U+0020 (RFC 3454 Table C.1.2).
     */
    private const SPACE_MAP = [
        "\xC2\xA0",     // U+00A0 NO-BREAK SPACE
        "\xE1\x9A\x80", // U+1680 OGHAM SPACE MARK
        "\xE2\x80\x80", // U+2000 EN QUAD
        "\xE2\x80\x81", // U+2001 EM QUAD
        "\xE2\x80\x82", // U+2002 EN SPACE
        "\xE2\x80\x83", // U+2003 EM SPACE
        "\xE2\x80\x84", // U+2004 THREE-PER-EM SPACE
        "\xE2\x80\x85", // U+2005 FOUR-PER-EM SPACE
        "\xE2\x80\x86", // U+2006 SIX-PER-EM SPACE
        "\xE2\x80\x87", // U+2007 FIGURE SPACE
        "\xE2\x80\x88", // U+2008 PUNCTUATION SPACE
        "\xE2\x80\x89", // U+2009 THIN SPACE
        "\xE2\x80\x8A", // U+200A HAIR SPACE
        "\xE2\x80\x8B", // U+200B ZERO WIDTH SPACE
        "\xE2\x80\xAF", // U+202F NARROW NO-BREAK SPACE
        "\xE2\x81\x9F", // U+205F MEDIUM MATHEMATICAL SPACE
        "\xE3\x80\x80", // U+3000 IDEOGRAPHIC SPACE
    ];

    /**
     * "Commonly mapped to nothing" characters (RFC 3454 Table B.1).
     *
     * U+200B is listed in both Table B.1 and Table C.1.2. Because map() applies
     * SPACE_MAP first, U+200B becomes a space and its entry here never matches;
     * it is kept so the list mirrors the RFC table.
     */
    private const MAP_TO_NOTHING = [
        "\xC2\xAD",     // U+00AD SOFT HYPHEN
        "\xE1\xA0\x86", // U+1806 MONGOLIAN TODO SOFT HYPHEN
        "\xE2\x80\x8B", // U+200B ZERO WIDTH SPACE
        "\xE2\x80\x8C", // U+200C ZERO WIDTH NON-JOINER
        "\xE2\x80\x8D", // U+200D ZERO WIDTH JOINER
        "\xE2\x81\xA0", // U+2060 WORD JOINER
        "\xEF\xBB\xBF", // U+FEFF ZERO WIDTH NO-BREAK SPACE
        "\xCD\x8F",     // U+034F COMBINING GRAPHEME JOINER
        "\xE1\xA0\x8B", // U+180B MONGOLIAN FREE VARIATION SELECTOR ONE
        "\xE1\xA0\x8C", // U+180C MONGOLIAN FREE VARIATION SELECTOR TWO
        "\xE1\xA0\x8D", // U+180D MONGOLIAN FREE VARIATION SELECTOR THREE
        "\xEF\xB8\x80", // U+FE00 VARIATION SELECTOR-1
        "\xEF\xB8\x81", // U+FE01 VARIATION SELECTOR-2
        "\xEF\xB8\x82", // U+FE02 VARIATION SELECTOR-3
        "\xEF\xB8\x83", // U+FE03 VARIATION SELECTOR-4
        "\xEF\xB8\x84", // U+FE04 VARIATION SELECTOR-5
        "\xEF\xB8\x85", // U+FE05 VARIATION SELECTOR-6
        "\xEF\xB8\x86", // U+FE06 VARIATION SELECTOR-7
        "\xEF\xB8\x87", // U+FE07 VARIATION SELECTOR-8
        "\xEF\xB8\x88", // U+FE08 VARIATION SELECTOR-9
        "\xEF\xB8\x89", // U+FE09 VARIATION SELECTOR-10
        "\xEF\xB8\x8A", // U+FE0A VARIATION SELECTOR-11
        "\xEF\xB8\x8B", // U+FE0B VARIATION SELECTOR-12
        "\xEF\xB8\x8C", // U+FE0C VARIATION SELECTOR-13
        "\xEF\xB8\x8D", // U+FE0D VARIATION SELECTOR-14
        "\xEF\xB8\x8E", // U+FE0E VARIATION SELECTOR-15
        "\xEF\xB8\x8F", // U+FE0F VARIATION SELECTOR-16
    ];

    /**
     * Prepare a password string per SASLprep.
     *
     * Steps:
     * 1. Map: replace non-ASCII spaces with U+0020, remove commonly-mapped-to-nothing chars
     * 2. Normalize: NFKC normalization
     * 3. Prohibit: reject strings with prohibited characters
     * 4. Check bidi: validate bidirectional text rules
     *
     * If the PHP intl extension is not available, mapping and prohibit/bidi
     * checks are still performed but NFKC normalization is skipped (most
     * passwords are ASCII and don't need normalization).
     *
     * @param string $input Password, expected to be UTF-8 encoded
     *
     * @return string The prepared password; the empty string is returned unchanged
     *
     * @throws \InvalidArgumentException if the input contains a prohibited
     *                                   character or violates the bidi rules
     */
    public static function prepare(string $input): string
    {
        if ($input === '') {
            return '';
        }

        $prepared = self::normalize(self::map($input));

        self::checkProhibited($prepared);
        self::checkBidi($prepared);

        return $prepared;
    }

    /**
     * Step 1: Map non-ASCII spaces to U+0020 and remove mapped-to-nothing chars.
     */
    private static function map(string $input): string
    {
        // Order matters: the space mapping runs first, so characters present in
        // both tables (U+200B) end up as a space rather than being removed.
        $spaced = str_replace(self::SPACE_MAP, ' ', $input);

        return str_replace(self::MAP_TO_NOTHING, '', $spaced);
    }

    /**
     * Step 2: NFKC normalization via the intl extension.
     *
     * Best effort: the input is returned unchanged when the Normalizer class
     * is unavailable or when normalization fails (Normalizer::normalize()
     * returned false).
     */
    private static function normalize(string $input): string
    {
        if (!class_exists(\Normalizer::class)) {
            return $input;
        }

        $normalized = \Normalizer::normalize($input, \Normalizer::FORM_KC);

        return $normalized === false ? $input : $normalized;
    }

    /**
     * Step 3: Check for prohibited characters.
     *
     * Checks RFC 3454 Tables C.2.1, C.2.2 and C.3-C.9. Table C.1.2 (non-ASCII
     * spaces) is not re-checked here because map() has already replaced those
     * characters with U+0020.
     *
     * @throws \InvalidArgumentException if prohibited characters are found
     */
    private static function checkProhibited(string $input): void
    {
        foreach (self::toCodepoints($input) as $codepoint) {
            $category = self::prohibitedCategory($codepoint);

            if ($category !== null) {
                throw new \InvalidArgumentException(
                    sprintf('Prohibited character U+%04X (%s) in SASLprep input', $codepoint, $category),
                );
            }
        }
    }

    /**
     * Classify a codepoint against the RFC 3454 prohibition tables.
     *
     * @return string|null Label used in the exception message, or null when
     *                     the codepoint is allowed
     */
    private static function prohibitedCategory(int $codepoint): ?string
    {
        // C.2.1: ASCII control characters (U+0000-U+001F, U+007F)
        if ($codepoint <= 0x001F || $codepoint === 0x007F) {
            return 'ASCII control';
        }

        // C.2.2: Non-ASCII control characters. U+FFF9-U+FFFC are also in this
        // table but are reported under C.6 below.
        if (($codepoint >= 0x0080 && $codepoint <= 0x009F)
            || $codepoint === 0x06DD
            || $codepoint === 0x070F
            || $codepoint === 0x180E
            || ($codepoint >= 0x200C && $codepoint <= 0x200D)
            || ($codepoint >= 0x2028 && $codepoint <= 0x2029)
            || ($codepoint >= 0x2060 && $codepoint <= 0x2063)
            || ($codepoint >= 0x206A && $codepoint <= 0x206F)
            || $codepoint === 0xFEFF
            || ($codepoint >= 0x1D173 && $codepoint <= 0x1D17A)
        ) {
            return 'non-ASCII control';
        }

        // C.3: Private use (U+E000-U+F8FF, U+F0000-U+FFFFD, U+100000-U+10FFFD)
        if (($codepoint >= 0xE000 && $codepoint <= 0xF8FF)
            || ($codepoint >= 0xF0000 && $codepoint <= 0xFFFFD)
            || ($codepoint >= 0x100000 && $codepoint <= 0x10FFFD)
        ) {
            return 'private use';
        }

        // C.4: Non-characters (U+FDD0-U+FDEF and U+xFFFE / U+xFFFF in every plane;
        // the mask ignores the plane bits)
        if (($codepoint >= 0xFDD0 && $codepoint <= 0xFDEF)
            || ($codepoint & 0xFFFE) === 0xFFFE
        ) {
            return 'non-character';
        }

        // C.5: Surrogate codes (should not appear in valid UTF-8, but check anyway)
        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
            return 'surrogate';
        }

        // C.6: Inappropriate for plain text (U+FFF9-U+FFFD)
        if ($codepoint >= 0xFFF9 && $codepoint <= 0xFFFD) {
            return 'inappropriate for plain text';
        }

        // C.7: Inappropriate for canonical representation (U+2FF0-U+2FFB)
        if ($codepoint >= 0x2FF0 && $codepoint <= 0x2FFB) {
            return 'inappropriate for canonical representation';
        }

        // C.8: Change display properties / deprecated (U+206A-U+206F is already
        // rejected by the C.2.2 check above)
        if ($codepoint === 0x0340 || $codepoint === 0x0341
            || $codepoint === 0x200E || $codepoint === 0x200F
            || ($codepoint >= 0x202A && $codepoint <= 0x202E)
        ) {
            return 'change display / deprecated';
        }

        // C.9: Tagging characters
        if ($codepoint === 0xE0001 || ($codepoint >= 0xE0020 && $codepoint <= 0xE007F)) {
            return 'tagging character';
        }

        return null;
    }

    /**
     * Step 4: Bidirectional text check (RFC 3454 §6).
     *
     * If a string contains any RandALCat character, the first and last
     * characters must also be RandALCat, and the string must not contain
     * any LCat characters.
     *
     * @throws \InvalidArgumentException if bidi rules are violated
     */
    private static function checkBidi(string $input): void
    {
        $codepoints = self::toCodepoints($input);
        if ($codepoints === []) {
            return;
        }

        $hasRandAL = false;
        $hasL = false;

        foreach ($codepoints as $cp) {
            $hasRandAL = $hasRandAL || self::isRandALCat($cp);
            $hasL = $hasL || self::isLCat($cp);
        }

        if (!$hasRandAL) {
            // The rules only apply to strings containing right-to-left text.
            return;
        }

        if ($hasL) {
            throw new \InvalidArgumentException(
                'SASLprep bidi violation: string with RandALCat characters must not contain LCat characters',
            );
        }

        $first = $codepoints[0];
        $last = $codepoints[count($codepoints) - 1];

        if (!self::isRandALCat($first) || !self::isRandALCat($last)) {
            throw new \InvalidArgumentException(
                'SASLprep bidi violation: first and last characters must be RandALCat',
            );
        }
    }

    /**
     * Simplified RandALCat check — covers Arabic, Hebrew, and related blocks.
     *
     * Whole blocks are matched, so this is an approximation of RFC 3454
     * Table D.1 rather than an exact implementation.
     */
    private static function isRandALCat(int $codepoint): bool
    {
        return ($codepoint >= 0x0590 && $codepoint <= 0x05FF) // Hebrew
            || ($codepoint >= 0x0600 && $codepoint <= 0x06FF) // Arabic
            || ($codepoint >= 0x0700 && $codepoint <= 0x074F) // Syriac
            || ($codepoint >= 0x0780 && $codepoint <= 0x07BF) // Thaana
            || ($codepoint >= 0xFB50 && $codepoint <= 0xFDFF) // Arabic Presentation Forms-A
            || ($codepoint >= 0xFE70 && $codepoint <= 0xFEFF); // Arabic Presentation Forms-B
    }

    /**
     * Simplified LCat check — covers Latin, Greek, Cyrillic, CJK, etc.
     *
     * Only the ranges listed below are treated as LCat; this is an
     * approximation of RFC 3454 Table D.2.
     */
    private static function isLCat(int $codepoint): bool
    {
        return ($codepoint >= 0x0041 && $codepoint <= 0x005A) // A-Z
            || ($codepoint >= 0x0061 && $codepoint <= 0x007A) // a-z
            || ($codepoint >= 0x00C0 && $codepoint <= 0x00D6) // Latin Extended
            || ($codepoint >= 0x00D8 && $codepoint <= 0x00F6)
            || ($codepoint >= 0x00F8 && $codepoint <= 0x024F) // Latin Extended Additional
            || ($codepoint >= 0x0370 && $codepoint <= 0x0373) // Greek
            || ($codepoint >= 0x0376 && $codepoint <= 0x0377)
            || ($codepoint >= 0x037A && $codepoint <= 0x037D)
            || ($codepoint >= 0x0386 && $codepoint <= 0x03FF)
            || ($codepoint >= 0x0400 && $codepoint <= 0x04FF) // Cyrillic
            || ($codepoint >= 0x1E00 && $codepoint <= 0x1EFF) // Latin Extended Additional
            || ($codepoint >= 0x1F00 && $codepoint <= 0x1FFF) // Greek Extended
            || ($codepoint >= 0x4E00 && $codepoint <= 0x9FFF) // CJK Unified Ideographs
            || ($codepoint >= 0x3040 && $codepoint <= 0x309F) // Hiragana
            || ($codepoint >= 0x30A0 && $codepoint <= 0x30FF); // Katakana
    }

    /**
     * Read a single UTF-8 codepoint from a string at the given offset.
     *
     * Malformed input is handled on a best-effort basis: when the byte at
     * $offset is not a valid lead byte, when the sequence is cut off by the end
     * of the string, or when a trailing byte is not a continuation byte
     * (10xxxxxx), only the byte at $offset is consumed and its raw value is
     * returned. The following bytes are then examined on their own by the
     * caller, so a stray lead byte cannot hide the byte after it.
     *
     * @param string $str           String to read from
     * @param int    $offset        Byte offset of the first byte; must be less than strlen($str)
     * @param int    $bytesConsumed Set to the number of bytes read (1-4)
     *
     * @return int The decoded codepoint, or the raw byte value for malformed input
     */
    private static function readCodepoint(string $str, int $offset, int &$bytesConsumed): int
    {
        $lead = ord($str[$offset]);

        if ($lead < 0x80) {
            $bytesConsumed = 1;
            return $lead;
        }

        if (($lead & 0xE0) === 0xC0) {
            $length = 2;
            $codepoint = $lead & 0x1F;
        } elseif (($lead & 0xF0) === 0xE0) {
            $length = 3;
            $codepoint = $lead & 0x0F;
        } elseif (($lead & 0xF8) === 0xF0) {
            $length = 4;
            $codepoint = $lead & 0x07;
        } else {
            // Invalid UTF-8 byte — treat as single byte
            $bytesConsumed = 1;
            return $lead;
        }

        // Truncated sequence: never index past the end of the string.
        if ($offset + $length > strlen($str)) {
            $bytesConsumed = 1;
            return $lead;
        }

        for ($k = 1; $k < $length; $k++) {
            $next = ord($str[$offset + $k]);

            if (($next & 0xC0) !== 0x80) {
                // Not a continuation byte: consume the lead byte only.
                $bytesConsumed = 1;
                return $lead;
            }

            $codepoint = ($codepoint << 6) | ($next & 0x3F);
        }

        $bytesConsumed = $length;

        return $codepoint;
    }

    /**
     * Convert a UTF-8 string to an array of codepoints.
     *
     * @return int[] Codepoints in string order; empty for the empty string
     */
    private static function toCodepoints(string $str): array
    {
        $codepoints = [];
        $len = strlen($str);
        $i = 0;
        $bytesConsumed = 0;

        while ($i < $len) {
            $codepoints[] = self::readCodepoint($str, $i, $bytesConsumed);
            $i += $bytesConsumed;
        }

        return $codepoints;
    }
}