Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
96.48% covered (success)
96.48%
137 / 142
66.67% covered (warning)
66.67%
6 / 9
CRAP
0.00% covered (danger)
0.00%
0 / 1
SaslPrep
96.48% covered (success)
96.48%
137 / 142
66.67% covered (warning)
66.67%
6 / 9
106
0.00% covered (danger)
0.00%
0 / 1
 prepare
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
2
 map
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 normalize
83.33% covered (warning)
83.33%
5 / 6
0.00% covered (danger)
0.00%
0 / 1
3.04
 checkProhibited
100.00% covered (success)
100.00%
53 / 53
100.00% covered (success)
100.00%
1 / 1
41
 checkBidi
91.30% covered (success)
91.30%
21 / 23
0.00% covered (danger)
0.00%
0 / 1
10.07
 isRandALCat
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
12
 isLCat
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
30
 readCodepoint
90.48% covered (success)
90.48%
19 / 21
0.00% covered (danger)
0.00%
0 / 1
5.02
 toCodepoints
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
2
1<?php
2
3declare(strict_types=1);
4
5namespace Phpdftk\Crypt;
6
/**
 * SASLprep password normalization — RFC 4013.
 *
 * Prepares Unicode strings for use as passwords in PDF 2.0 encryption
 * (ISO 32000-2 §7.6.4.3.2). Uses the Stringprep framework (RFC 3454)
 * with the SASLprep profile.
 *
 * Input is expected to be UTF-8. Malformed byte sequences are handled
 * leniently: every byte that does not start a well-formed sequence is
 * examined as if it were a code point of its own, so no byte can escape
 * the prohibited-character check.
 */
final class SaslPrep
{
    /**
     * Non-ASCII space characters mapped to U+0020 (RFC 3454 Table C.1.2).
     */
    private const SPACE_MAP = [
        "\xC2\xA0",             // U+00A0 NO-BREAK SPACE
        "\xE1\x9A\x80",         // U+1680 OGHAM SPACE MARK
        "\xE2\x80\x80",         // U+2000 EN QUAD
        "\xE2\x80\x81",         // U+2001 EM QUAD
        "\xE2\x80\x82",         // U+2002 EN SPACE
        "\xE2\x80\x83",         // U+2003 EM SPACE
        "\xE2\x80\x84",         // U+2004 THREE-PER-EM SPACE
        "\xE2\x80\x85",         // U+2005 FOUR-PER-EM SPACE
        "\xE2\x80\x86",         // U+2006 SIX-PER-EM SPACE
        "\xE2\x80\x87",         // U+2007 FIGURE SPACE
        "\xE2\x80\x88",         // U+2008 PUNCTUATION SPACE
        "\xE2\x80\x89",         // U+2009 THIN SPACE
        "\xE2\x80\x8A",         // U+200A HAIR SPACE
        "\xE2\x80\x8B",         // U+200B ZERO WIDTH SPACE
        "\xE2\x80\xAF",         // U+202F NARROW NO-BREAK SPACE
        "\xE2\x81\x9F",         // U+205F MEDIUM MATHEMATICAL SPACE
        "\xE3\x80\x80",         // U+3000 IDEOGRAPHIC SPACE
    ];

    /**
     * "Commonly mapped to nothing" characters (RFC 3454 Table B.1).
     *
     * U+200B is listed in both tables; because the space mapping runs first
     * in map(), it is turned into U+0020 and its entry here never matches.
     */
    private const MAP_TO_NOTHING = [
        "\xC2\xAD",             // U+00AD SOFT HYPHEN
        "\xCD\x8F",             // U+034F COMBINING GRAPHEME JOINER
        "\xE1\xA0\x86",         // U+1806 MONGOLIAN TODO SOFT HYPHEN
        "\xE1\xA0\x8B",         // U+180B MONGOLIAN FREE VARIATION SELECTOR ONE
        "\xE1\xA0\x8C",         // U+180C MONGOLIAN FREE VARIATION SELECTOR TWO
        "\xE1\xA0\x8D",         // U+180D MONGOLIAN FREE VARIATION SELECTOR THREE
        "\xE2\x80\x8B",         // U+200B ZERO WIDTH SPACE
        "\xE2\x80\x8C",         // U+200C ZERO WIDTH NON-JOINER
        "\xE2\x80\x8D",         // U+200D ZERO WIDTH JOINER
        "\xE2\x81\xA0",         // U+2060 WORD JOINER
        "\xEF\xB8\x80",         // U+FE00 VARIATION SELECTOR-1
        "\xEF\xB8\x81",         // U+FE01 VARIATION SELECTOR-2
        "\xEF\xB8\x82",         // U+FE02 VARIATION SELECTOR-3
        "\xEF\xB8\x83",         // U+FE03 VARIATION SELECTOR-4
        "\xEF\xB8\x84",         // U+FE04 VARIATION SELECTOR-5
        "\xEF\xB8\x85",         // U+FE05 VARIATION SELECTOR-6
        "\xEF\xB8\x86",         // U+FE06 VARIATION SELECTOR-7
        "\xEF\xB8\x87",         // U+FE07 VARIATION SELECTOR-8
        "\xEF\xB8\x88",         // U+FE08 VARIATION SELECTOR-9
        "\xEF\xB8\x89",         // U+FE09 VARIATION SELECTOR-10
        "\xEF\xB8\x8A",         // U+FE0A VARIATION SELECTOR-11
        "\xEF\xB8\x8B",         // U+FE0B VARIATION SELECTOR-12
        "\xEF\xB8\x8C",         // U+FE0C VARIATION SELECTOR-13
        "\xEF\xB8\x8D",         // U+FE0D VARIATION SELECTOR-14
        "\xEF\xB8\x8E",         // U+FE0E VARIATION SELECTOR-15
        "\xEF\xB8\x8F",         // U+FE0F VARIATION SELECTOR-16
        "\xEF\xBB\xBF",         // U+FEFF ZERO WIDTH NO-BREAK SPACE
    ];

    /**
     * Prepare a password string per SASLprep.
     *
     * Steps:
     * 1. Map: replace non-ASCII spaces with U+0020, remove commonly-mapped-to-nothing chars
     * 2. Normalize: NFKC normalization
     * 3. Prohibit: reject strings with prohibited characters
     * 4. Check bidi: validate bidirectional text rules
     *
     * If the PHP intl extension is not available, mapping and prohibit/bidi
     * checks are still performed but NFKC normalization is skipped (most
     * passwords are ASCII and don't need normalization).
     *
     * @param string $input Password, expected to be UTF-8 encoded
     *
     * @return string The prepared password; an empty input is returned as ''
     *
     * @throws \InvalidArgumentException if the string contains a prohibited
     *                                   character or violates the bidi rules
     */
    public static function prepare(string $input): string
    {
        if ($input === '') {
            return '';
        }

        // Step 1: Mapping
        $str = self::map($input);

        // Step 2: NFKC normalization
        $str = self::normalize($str);

        // Step 3: Prohibit
        self::checkProhibited($str);

        // Step 4: Bidi check
        self::checkBidi($str);

        return $str;
    }

    /**
     * Step 1: Map non-ASCII spaces to U+0020 and remove mapped-to-nothing chars.
     *
     * The space mapping is applied before the removal, so a character present
     * in both tables (U+200B) becomes a space rather than disappearing.
     */
    private static function map(string $input): string
    {
        // Replace non-ASCII spaces with regular space
        $result = str_replace(self::SPACE_MAP, ' ', $input);

        // Remove commonly mapped to nothing characters
        return str_replace(self::MAP_TO_NOTHING, '', $result);
    }

    /**
     * Step 2: NFKC normalization via the intl extension.
     *
     * Returns the input unchanged when the Normalizer class is missing or
     * when normalization reports failure by returning false.
     */
    private static function normalize(string $input): string
    {
        if (!class_exists(\Normalizer::class)) {
            return $input;
        }

        $normalized = \Normalizer::normalize($input, \Normalizer::FORM_KC);

        if ($normalized === false) {
            return $input;
        }

        return $normalized;
    }

    /**
     * Step 3: Check for prohibited characters.
     *
     * Checks RFC 3454 Tables C.2.1, C.2.2 and C.3-C.9. Table C.1.2 (non-ASCII
     * spaces) is not checked here: those characters are replaced by U+0020 in
     * the mapping step.
     *
     * @throws \InvalidArgumentException if prohibited characters are found
     */
    private static function checkProhibited(string $input): void
    {
        foreach (self::toCodepoints($input) as $codepoint) {
            $category = self::prohibitedCategory($codepoint);

            if ($category !== null) {
                throw new \InvalidArgumentException(
                    sprintf('Prohibited character U+%04X (%s) in SASLprep input', $codepoint, $category),
                );
            }
        }
    }

    /**
     * Classify a code point against the SASLprep prohibited-output tables.
     *
     * @return string|null Label of the first matching table (used in the
     *                     exception message), or null if the code point is allowed
     */
    private static function prohibitedCategory(int $codepoint): ?string
    {
        // C.2.1: ASCII control characters (U+0000-U+001F, U+007F)
        if ($codepoint <= 0x001F || $codepoint === 0x007F) {
            return 'ASCII control';
        }

        // C.2.2: Non-ASCII control characters
        if (($codepoint >= 0x0080 && $codepoint <= 0x009F)
            || $codepoint === 0x06DD || $codepoint === 0x070F
            || $codepoint === 0x180E
            || ($codepoint >= 0x200C && $codepoint <= 0x200D)
            || ($codepoint >= 0x2028 && $codepoint <= 0x2029)
            || ($codepoint >= 0x2060 && $codepoint <= 0x2063)
            || ($codepoint >= 0x206A && $codepoint <= 0x206F)
            || $codepoint === 0xFEFF
            || ($codepoint >= 0x1D173 && $codepoint <= 0x1D17A)
        ) {
            return 'non-ASCII control';
        }

        // C.3: Private use (U+E000-U+F8FF, U+F0000-U+FFFFD, U+100000-U+10FFFD)
        if (($codepoint >= 0xE000 && $codepoint <= 0xF8FF)
            || ($codepoint >= 0xF0000 && $codepoint <= 0xFFFFD)
            || ($codepoint >= 0x100000 && $codepoint <= 0x10FFFD)
        ) {
            return 'private use';
        }

        // C.4: Non-characters (U+FDD0-U+FDEF, U+FFFE-U+FFFF, and plane-end non-characters)
        if (($codepoint >= 0xFDD0 && $codepoint <= 0xFDEF)
            || ($codepoint & 0xFFFE) === 0xFFFE // catches U+xFFFE and U+xFFFF for all planes
        ) {
            return 'non-character';
        }

        // C.5: Surrogate codes (should not appear in valid UTF-8, but check anyway)
        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
            return 'surrogate';
        }

        // C.6: Inappropriate for plain text (U+FFF9-U+FFFD)
        if ($codepoint >= 0xFFF9 && $codepoint <= 0xFFFD) {
            return 'inappropriate for plain text';
        }

        // C.7: Inappropriate for canonical representation (U+2FF0-U+2FFB)
        if ($codepoint >= 0x2FF0 && $codepoint <= 0x2FFB) {
            return 'inappropriate for canonical representation';
        }

        // C.8: Change display properties / deprecated
        // (U+206A-U+206F of this table are already matched by the C.2.2 test above)
        if ($codepoint === 0x0340 || $codepoint === 0x0341
            || $codepoint === 0x200E || $codepoint === 0x200F
            || ($codepoint >= 0x202A && $codepoint <= 0x202E)
        ) {
            return 'change display / deprecated';
        }

        // C.9: Tagging characters
        if ($codepoint === 0xE0001 || ($codepoint >= 0xE0020 && $codepoint <= 0xE007F)) {
            return 'tagging character';
        }

        return null;
    }

    /**
     * Step 4: Bidirectional text check (RFC 3454 §6).
     *
     * If a string contains any RandALCat character, the first and last
     * characters must also be RandALCat, and the string must not contain
     * any LCat characters.
     *
     * @throws \InvalidArgumentException if bidi rules are violated
     */
    private static function checkBidi(string $input): void
    {
        $codepoints = self::toCodepoints($input);
        if ($codepoints === []) {
            return;
        }

        $hasRandAL = false;
        $hasL = false;

        foreach ($codepoints as $cp) {
            if (self::isRandALCat($cp)) {
                $hasRandAL = true;
            }
            if (self::isLCat($cp)) {
                $hasL = true;
            }
        }

        // Strings without any right-to-left character are never constrained.
        if (!$hasRandAL) {
            return;
        }

        if ($hasL) {
            throw new \InvalidArgumentException(
                'SASLprep bidi violation: string with RandALCat characters must not contain LCat characters',
            );
        }

        $first = $codepoints[0];
        $last = $codepoints[count($codepoints) - 1];

        if (!self::isRandALCat($first) || !self::isRandALCat($last)) {
            throw new \InvalidArgumentException(
                'SASLprep bidi violation: first and last characters must be RandALCat',
            );
        }
    }

    /**
     * Simplified RandALCat check — covers Arabic, Hebrew, and related blocks.
     *
     * This tests whole Unicode blocks rather than the exact ranges of
     * RFC 3454 Table D.1, so it is an approximation of that table.
     */
    private static function isRandALCat(int $codepoint): bool
    {
        return ($codepoint >= 0x0590 && $codepoint <= 0x05FF)   // Hebrew
            || ($codepoint >= 0x0600 && $codepoint <= 0x06FF)   // Arabic
            || ($codepoint >= 0x0700 && $codepoint <= 0x074F)   // Syriac
            || ($codepoint >= 0x0780 && $codepoint <= 0x07BF)   // Thaana
            || ($codepoint >= 0xFB50 && $codepoint <= 0xFDFF)   // Arabic Presentation Forms-A
            || ($codepoint >= 0xFE70 && $codepoint <= 0xFEFF);  // Arabic Presentation Forms-B
    }

    /**
     * Simplified LCat check — covers Latin, Greek, Cyrillic, CJK, etc.
     *
     * Only the ranges listed below are recognised; this is an approximation
     * of RFC 3454 Table D.2, not the full table.
     */
    private static function isLCat(int $codepoint): bool
    {
        return ($codepoint >= 0x0041 && $codepoint <= 0x005A)   // A-Z
            || ($codepoint >= 0x0061 && $codepoint <= 0x007A)   // a-z
            || ($codepoint >= 0x00C0 && $codepoint <= 0x00D6)   // Latin-1 letters
            || ($codepoint >= 0x00D8 && $codepoint <= 0x00F6)
            || ($codepoint >= 0x00F8 && $codepoint <= 0x024F)   // Latin Extended-A/B
            || ($codepoint >= 0x0370 && $codepoint <= 0x0373)   // Greek
            || ($codepoint >= 0x0376 && $codepoint <= 0x0377)
            || ($codepoint >= 0x037A && $codepoint <= 0x037D)
            || ($codepoint >= 0x0386 && $codepoint <= 0x03FF)
            || ($codepoint >= 0x0400 && $codepoint <= 0x04FF)   // Cyrillic
            || ($codepoint >= 0x1E00 && $codepoint <= 0x1EFF)   // Latin Extended Additional
            || ($codepoint >= 0x1F00 && $codepoint <= 0x1FFF)   // Greek Extended
            || ($codepoint >= 0x3040 && $codepoint <= 0x309F)   // Hiragana
            || ($codepoint >= 0x30A0 && $codepoint <= 0x30FF)   // Katakana
            || ($codepoint >= 0x4E00 && $codepoint <= 0x9FFF);  // CJK Unified Ideographs
    }

    /**
     * Read a single UTF-8 codepoint from a string at the given offset.
     *
     * A multi-byte sequence is decoded only when all of its trailing bytes
     * exist and are continuation bytes (10xxxxxx). Otherwise just the byte at
     * $offset is consumed and returned as-is, so the bytes that follow a
     * malformed lead byte are still examined individually by the caller.
     *
     * @param string $str           Byte string being decoded
     * @param int    $offset        Byte offset to read from; must be inside $str
     * @param int    $bytesConsumed Set to the number of bytes read (1-4)
     *
     * @return int The decoded code point, or the raw byte value for an
     *             invalid or truncated sequence
     */
    private static function readCodepoint(string $str, int $offset, int &$bytesConsumed): int
    {
        $lead = ord($str[$offset]);
        $bytesConsumed = 1;

        if ($lead < 0x80) {
            return $lead;
        }

        if (($lead & 0xE0) === 0xC0) {
            $length = 2;
            $value = $lead & 0x1F;
        } elseif (($lead & 0xF0) === 0xE0) {
            $length = 3;
            $value = $lead & 0x0F;
        } elseif (($lead & 0xF8) === 0xF0) {
            $length = 4;
            $value = $lead & 0x07;
        } else {
            // Invalid UTF-8 byte — treat as single byte
            return $lead;
        }

        // Truncated sequence at the end of the string: never read past the end.
        if ($offset + $length > strlen($str)) {
            return $lead;
        }

        for ($k = 1; $k < $length; $k++) {
            $next = ord($str[$offset + $k]);

            // A non-continuation byte must not be swallowed by the lead byte,
            // otherwise it would bypass the prohibited-character check.
            if (($next & 0xC0) !== 0x80) {
                return $lead;
            }

            $value = ($value << 6) | ($next & 0x3F);
        }

        $bytesConsumed = $length;

        return $value;
    }

    /**
     * Convert a UTF-8 string to an array of codepoints.
     *
     * @return int[]
     */
    private static function toCodepoints(string $str): array
    {
        $codepoints = [];
        $len = strlen($str);
        $i = 0;
        $bytesConsumed = 0;

        while ($i < $len) {
            $codepoints[] = self::readCodepoint($str, $i, $bytesConsumed);
            $i += $bytesConsumed;
        }

        return $codepoints;
    }
}