Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
96.48% |
137 / 142 |
|
66.67% |
6 / 9 |
CRAP | |
0.00% |
0 / 1 |
| SaslPrep | |
96.48% |
137 / 142 |
|
66.67% |
6 / 9 |
106 | |
0.00% |
0 / 1 |
| prepare | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 | |||
| map | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| normalize | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
3.04 | |||
| checkProhibited | |
100.00% |
53 / 53 |
|
100.00% |
1 / 1 |
41 | |||
| checkBidi | |
91.30% |
21 / 23 |
|
0.00% |
0 / 1 |
10.07 | |||
| isRandALCat | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
12 | |||
| isLCat | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
30 | |||
| readCodepoint | |
90.48% |
19 / 21 |
|
0.00% |
0 / 1 |
5.02 | |||
| toCodepoints | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | declare(strict_types=1); |
| 4 | |
| 5 | namespace Phpdftk\Crypt; |
| 6 | |
/**
 * SASLprep password normalization — RFC 4013.
 *
 * Prepares Unicode strings for use as passwords in PDF 2.0 encryption
 * (ISO 32000-2 §7.6.4.3.2). Uses the Stringprep framework (RFC 3454)
 * with the SASLprep profile.
 *
 * The only public entry point is {@see SaslPrep::prepare()}; every other
 * member is an implementation detail of one of the four Stringprep steps
 * (map, normalize, prohibit, bidi check).
 */
final class SaslPrep
{
    /**
     * Non-ASCII space characters mapped to U+0020 (RFC 3454 Table C.1.2).
     */
    private const SPACE_MAP = [
        "\xC2\xA0",     // U+00A0 NO-BREAK SPACE
        "\xE1\x9A\x80", // U+1680 OGHAM SPACE MARK
        "\xE2\x80\x80", // U+2000 EN QUAD
        "\xE2\x80\x81", // U+2001 EM QUAD
        "\xE2\x80\x82", // U+2002 EN SPACE
        "\xE2\x80\x83", // U+2003 EM SPACE
        "\xE2\x80\x84", // U+2004 THREE-PER-EM SPACE
        "\xE2\x80\x85", // U+2005 FOUR-PER-EM SPACE
        "\xE2\x80\x86", // U+2006 SIX-PER-EM SPACE
        "\xE2\x80\x87", // U+2007 FIGURE SPACE
        "\xE2\x80\x88", // U+2008 PUNCTUATION SPACE
        "\xE2\x80\x89", // U+2009 THIN SPACE
        "\xE2\x80\x8A", // U+200A HAIR SPACE
        "\xE2\x80\x8B", // U+200B ZERO WIDTH SPACE
        "\xE2\x80\xAF", // U+202F NARROW NO-BREAK SPACE
        "\xE2\x81\x9F", // U+205F MEDIUM MATHEMATICAL SPACE
        "\xE3\x80\x80", // U+3000 IDEOGRAPHIC SPACE
    ];

    /**
     * "Commonly mapped to nothing" characters (RFC 3454 Table B.1).
     */
    private const MAP_TO_NOTHING = [
        "\xC2\xAD",     // U+00AD SOFT HYPHEN
        "\xE1\xA0\x86", // U+1806 MONGOLIAN TODO SOFT HYPHEN
        // U+200B is listed in both C.1.2 and B.1. map() applies SPACE_MAP
        // first, so this entry never matches; it is kept so the table
        // mirrors B.1.
        "\xE2\x80\x8B", // U+200B ZERO WIDTH SPACE
        "\xE2\x80\x8C", // U+200C ZERO WIDTH NON-JOINER
        "\xE2\x80\x8D", // U+200D ZERO WIDTH JOINER
        "\xE2\x81\xA0", // U+2060 WORD JOINER
        "\xEF\xBB\xBF", // U+FEFF ZERO WIDTH NO-BREAK SPACE
        "\xCD\x8F",     // U+034F COMBINING GRAPHEME JOINER
        "\xE1\xA0\x8B", // U+180B MONGOLIAN FREE VARIATION SELECTOR ONE
        "\xE1\xA0\x8C", // U+180C MONGOLIAN FREE VARIATION SELECTOR TWO
        "\xE1\xA0\x8D", // U+180D MONGOLIAN FREE VARIATION SELECTOR THREE
        "\xEF\xB8\x80", // U+FE00 VARIATION SELECTOR-1
        "\xEF\xB8\x81", // U+FE01 VARIATION SELECTOR-2
        "\xEF\xB8\x82", // U+FE02 VARIATION SELECTOR-3
        "\xEF\xB8\x83", // U+FE03 VARIATION SELECTOR-4
        "\xEF\xB8\x84", // U+FE04 VARIATION SELECTOR-5
        "\xEF\xB8\x85", // U+FE05 VARIATION SELECTOR-6
        "\xEF\xB8\x86", // U+FE06 VARIATION SELECTOR-7
        "\xEF\xB8\x87", // U+FE07 VARIATION SELECTOR-8
        "\xEF\xB8\x88", // U+FE08 VARIATION SELECTOR-9
        "\xEF\xB8\x89", // U+FE09 VARIATION SELECTOR-10
        "\xEF\xB8\x8A", // U+FE0A VARIATION SELECTOR-11
        "\xEF\xB8\x8B", // U+FE0B VARIATION SELECTOR-12
        "\xEF\xB8\x8C", // U+FE0C VARIATION SELECTOR-13
        "\xEF\xB8\x8D", // U+FE0D VARIATION SELECTOR-14
        "\xEF\xB8\x8E", // U+FE0E VARIATION SELECTOR-15
        "\xEF\xB8\x8F", // U+FE0F VARIATION SELECTOR-16
    ];

    /**
     * Prepare a password string per SASLprep.
     *
     * Steps:
     *   1. Map: replace non-ASCII spaces with U+0020, remove commonly-mapped-to-nothing chars
     *   2. Normalize: NFKC normalization
     *   3. Prohibit: reject strings with prohibited characters
     *   4. Check bidi: validate bidirectional text rules
     *
     * If the PHP intl extension is not available, mapping and prohibit/bidi
     * checks are still performed but NFKC normalization is skipped (most
     * passwords are ASCII and don't need normalization).
     *
     * @param string $input Password, expected to be UTF-8 encoded.
     *
     * @return string The mapped (and, when intl is available, normalized) string.
     *                An empty input is returned unchanged.
     *
     * @throws \InvalidArgumentException if the string contains a prohibited
     *                                   character or violates the bidi rules
     */
    public static function prepare(string $input): string
    {
        if ($input === '') {
            return '';
        }

        // Step 1: Mapping
        $str = self::map($input);

        // Step 2: NFKC normalization
        $str = self::normalize($str);

        // Step 3: Prohibit
        self::checkProhibited($str);

        // Step 4: Bidi check
        self::checkBidi($str);

        return $str;
    }

    /**
     * Step 1: Map non-ASCII spaces to U+0020 and remove mapped-to-nothing chars.
     */
    private static function map(string $input): string
    {
        // Spaces are replaced first, so a character present in both tables
        // (U+200B) becomes a space rather than being removed.
        $result = str_replace(self::SPACE_MAP, ' ', $input);

        return str_replace(self::MAP_TO_NOTHING, '', $result);
    }

    /**
     * Step 2: NFKC normalization via the intl extension.
     *
     * Returns the input unchanged when the Normalizer class is unavailable
     * or when Normalizer::normalize() reports failure by returning false.
     */
    private static function normalize(string $input): string
    {
        if (!class_exists(\Normalizer::class)) {
            return $input;
        }

        $normalized = \Normalizer::normalize($input, \Normalizer::FORM_KC);

        return $normalized === false ? $input : $normalized;
    }

    /**
     * Step 3: Check for prohibited characters.
     *
     * Checks RFC 3454 Tables C.2.1, C.2.2 and C.3-C.9. Table C.1.2
     * (non-ASCII spaces) needs no check here because map() has already
     * replaced every one of those characters with U+0020.
     *
     * @throws \InvalidArgumentException if prohibited characters are found
     */
    private static function checkProhibited(string $input): void
    {
        foreach (self::toCodepoints($input) as $codepoint) {
            $reason = self::prohibitedReason($codepoint);

            if ($reason !== null) {
                throw new \InvalidArgumentException(
                    sprintf('Prohibited character U+%04X (%s) in SASLprep input', $codepoint, $reason),
                );
            }
        }
    }

    /**
     * Classify a codepoint against the prohibited-output tables.
     *
     * @return string|null Short label of the table the codepoint falls in
     *                     (used in the exception message), or null when the
     *                     codepoint is allowed.
     */
    private static function prohibitedReason(int $codepoint): ?string
    {
        // C.2.1: ASCII control characters (U+0000-U+001F, U+007F)
        if ($codepoint <= 0x001F || $codepoint === 0x007F) {
            return 'ASCII control';
        }

        // C.2.2: Non-ASCII control characters
        if (($codepoint >= 0x0080 && $codepoint <= 0x009F)
            || $codepoint === 0x06DD || $codepoint === 0x070F
            || $codepoint === 0x180E
            || ($codepoint >= 0x200C && $codepoint <= 0x200D)
            || ($codepoint >= 0x2028 && $codepoint <= 0x2029)
            || ($codepoint >= 0x2060 && $codepoint <= 0x2063)
            || ($codepoint >= 0x206A && $codepoint <= 0x206F)
            || $codepoint === 0xFEFF
            || ($codepoint >= 0x1D173 && $codepoint <= 0x1D17A) // musical formatting controls
        ) {
            return 'non-ASCII control';
        }

        // C.3: Private use (U+E000-U+F8FF, U+F0000-U+FFFFD, U+100000-U+10FFFD)
        if (($codepoint >= 0xE000 && $codepoint <= 0xF8FF)
            || ($codepoint >= 0xF0000 && $codepoint <= 0xFFFFD)
            || ($codepoint >= 0x100000 && $codepoint <= 0x10FFFD)
        ) {
            return 'private use';
        }

        // C.4: Non-characters (U+FDD0-U+FDEF, plus U+xFFFE and U+xFFFF in every plane)
        if (($codepoint >= 0xFDD0 && $codepoint <= 0xFDEF)
            || ($codepoint & 0xFFFE) === 0xFFFE
        ) {
            return 'non-character';
        }

        // C.5: Surrogate codes (should not appear in valid UTF-8, but check anyway)
        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
            return 'surrogate';
        }

        // C.6: Inappropriate for plain text (U+FFF9-U+FFFD)
        if ($codepoint >= 0xFFF9 && $codepoint <= 0xFFFD) {
            return 'inappropriate for plain text';
        }

        // C.7: Inappropriate for canonical representation (U+2FF0-U+2FFB)
        if ($codepoint >= 0x2FF0 && $codepoint <= 0x2FFB) {
            return 'inappropriate for canonical representation';
        }

        // C.8: Change display properties / deprecated
        // (U+206A-U+206F from this table is already covered by the C.2.2 test above.)
        if ($codepoint === 0x0340 || $codepoint === 0x0341
            || $codepoint === 0x200E || $codepoint === 0x200F
            || ($codepoint >= 0x202A && $codepoint <= 0x202E)
        ) {
            return 'change display / deprecated';
        }

        // C.9: Tagging characters
        if ($codepoint === 0xE0001 || ($codepoint >= 0xE0020 && $codepoint <= 0xE007F)) {
            return 'tagging character';
        }

        return null;
    }

    /**
     * Step 4: Bidirectional text check (RFC 3454 §6).
     *
     * If a string contains any RandALCat character, the first and last
     * characters must also be RandALCat, and the string must not contain
     * any LCat characters.
     *
     * @throws \InvalidArgumentException if bidi rules are violated
     */
    private static function checkBidi(string $input): void
    {
        $codepoints = self::toCodepoints($input);
        if ($codepoints === []) {
            // Mapping can reduce a non-empty input to an empty string.
            return;
        }

        $hasRandAL = false;
        $hasL = false;

        foreach ($codepoints as $cp) {
            if (self::isRandALCat($cp)) {
                $hasRandAL = true;
            }
            if (self::isLCat($cp)) {
                $hasL = true;
            }
        }

        if (!$hasRandAL) {
            // The rules only constrain strings that contain RandALCat characters.
            return;
        }

        if ($hasL) {
            throw new \InvalidArgumentException(
                'SASLprep bidi violation: string with RandALCat characters must not contain LCat characters',
            );
        }

        $first = $codepoints[0];
        $last = $codepoints[count($codepoints) - 1];

        if (!self::isRandALCat($first) || !self::isRandALCat($last)) {
            throw new \InvalidArgumentException(
                'SASLprep bidi violation: first and last characters must be RandALCat',
            );
        }
    }

    /**
     * Simplified RandALCat check — covers Arabic, Hebrew, and related blocks.
     *
     * This tests whole blocks rather than the exact code point list, so it
     * is an approximation.
     */
    private static function isRandALCat(int $codepoint): bool
    {
        return ($codepoint >= 0x0590 && $codepoint <= 0x05FF)     // Hebrew
            || ($codepoint >= 0x0600 && $codepoint <= 0x06FF)     // Arabic
            || ($codepoint >= 0x0700 && $codepoint <= 0x074F)     // Syriac
            || ($codepoint >= 0x0780 && $codepoint <= 0x07BF)     // Thaana
            || ($codepoint >= 0xFB50 && $codepoint <= 0xFDFF)     // Arabic Presentation Forms-A
            || ($codepoint >= 0xFE70 && $codepoint <= 0xFEFF);    // Arabic Presentation Forms-B
    }

    /**
     * Simplified LCat check — covers Latin, Greek, Cyrillic, CJK, etc.
     *
     * This tests a fixed set of ranges rather than the exact code point
     * list, so it is an approximation.
     */
    private static function isLCat(int $codepoint): bool
    {
        return ($codepoint >= 0x0041 && $codepoint <= 0x005A)     // A-Z
            || ($codepoint >= 0x0061 && $codepoint <= 0x007A)     // a-z
            || ($codepoint >= 0x00C0 && $codepoint <= 0x00D6)     // Latin-1 letters (skips U+00D7)
            || ($codepoint >= 0x00D8 && $codepoint <= 0x00F6)     // Latin-1 letters (skips U+00F7)
            || ($codepoint >= 0x00F8 && $codepoint <= 0x024F)     // Latin-1 letters, Latin Extended-A/B
            || ($codepoint >= 0x0370 && $codepoint <= 0x0373)     // Greek
            || ($codepoint >= 0x0376 && $codepoint <= 0x0377)
            || ($codepoint >= 0x037A && $codepoint <= 0x037D)
            || ($codepoint >= 0x0386 && $codepoint <= 0x03FF)
            || ($codepoint >= 0x0400 && $codepoint <= 0x04FF)     // Cyrillic
            || ($codepoint >= 0x1E00 && $codepoint <= 0x1EFF)     // Latin Extended Additional
            || ($codepoint >= 0x1F00 && $codepoint <= 0x1FFF)     // Greek Extended
            || ($codepoint >= 0x4E00 && $codepoint <= 0x9FFF)     // CJK Unified Ideographs
            || ($codepoint >= 0x3040 && $codepoint <= 0x309F)     // Hiragana
            || ($codepoint >= 0x30A0 && $codepoint <= 0x30FF);    // Katakana
    }

    /**
     * Read a single UTF-8 codepoint from a string at the given offset.
     *
     * A byte that cannot start a sequence, a sequence that runs past the end
     * of the string, or a sequence whose trailing bytes are not continuation
     * bytes (10xxxxxx) is not decoded: the lead byte alone is returned as the
     * value and one byte is consumed. The following bytes are then examined
     * on their own by the caller's next read, so a malformed lead byte cannot
     * hide them from the prohibited-character check.
     *
     * @param string $str           Byte string being scanned.
     * @param int    $offset        Byte offset of the lead byte; must be < strlen($str).
     * @param int    $bytesConsumed Set to the number of bytes the caller must advance by.
     *
     * @return int The decoded codepoint, or the raw byte value for malformed input.
     */
    private static function readCodepoint(string $str, int $offset, int &$bytesConsumed): int
    {
        $byte = ord($str[$offset]);

        // Default for every single-byte outcome (ASCII or malformed input).
        $bytesConsumed = 1;

        if ($byte < 0x80) {
            return $byte;
        }

        if (($byte & 0xE0) === 0xC0) {
            $length = 2;
            $codepoint = $byte & 0x1F;
        } elseif (($byte & 0xF0) === 0xE0) {
            $length = 3;
            $codepoint = $byte & 0x0F;
        } elseif (($byte & 0xF8) === 0xF0) {
            $length = 4;
            $codepoint = $byte & 0x07;
        } else {
            // Invalid UTF-8 lead byte — treat as single byte
            return $byte;
        }

        // Truncated sequence: reading further would index past the string end.
        if ($offset + $length > strlen($str)) {
            return $byte;
        }

        for ($k = 1; $k < $length; $k++) {
            $next = ord($str[$offset + $k]);

            if (($next & 0xC0) !== 0x80) {
                // Not a continuation byte — leave it for the next read.
                return $byte;
            }

            $codepoint = ($codepoint << 6) | ($next & 0x3F);
        }

        $bytesConsumed = $length;

        return $codepoint;
    }

    /**
     * Convert a UTF-8 string to an array of codepoints.
     *
     * @return int[]
     */
    private static function toCodepoints(string $str): array
    {
        $codepoints = [];
        $len = strlen($str);
        $i = 0;
        $bytesConsumed = 0;

        while ($i < $len) {
            $codepoints[] = self::readCodepoint($str, $i, $bytesConsumed);
            $i += $bytesConsumed;
        }

        return $codepoints;
    }
}