/**
 * Functions related to UTF encoding.
 *
 * Copyright:   Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/root/utf.d, _utf.d)
 * Documentation:  https://dlang.org/phobos/dmd_root_utf.html
 * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/root/utf.d
 */

module dmd.root.utf;

@nogc nothrow pure @safe:

/**
 * Checks whether `c` lies in the Unicode code space.
 *
 * The Unicode code space is the range of code points [0x000000,0x10FFFF]
 * except the UTF-16 surrogates in the range [0xD800,0xDFFF].
 *
 * Params:
 *      c = code point to test
 * Returns:
 *      `true` if `c` is in [0, 0xD7FF] or [0xE000, 0x10FFFF], `false` otherwise
 */
bool utf_isValidDchar(dchar c)
{
    // TODO: Whether non-char code points should be rejected is pending review.
    // 0xFFFE and 0xFFFF are valid for internal use, like Phobos std.utf.isValidDChar
    // See also https://issues.dlang.org/show_bug.cgi?id=1357
    if (c < 0xD800) // Almost all characters in a typical document.
        return true;
    if (c > 0xDFFF && c <= 0x10FFFF)
        return true;
    return false;
}

/*******************************
 * Checks whether `c` is a Unicode alphabetic character,
 * using the table from C99 Appendix D.
 *
 * Params:
 *      c = code point to test
 * Returns:
 *      `true` if `c` falls inside one of the inclusive ranges of the table
 */
bool isUniAlpha(dchar c)
{
    // Sorted, non-overlapping, inclusive [first, last] ranges.
    static immutable wchar[2][] ALPHA_TABLE =
    [
        [0x00AA, 0x00AA],
        [0x00B5, 0x00B5],
        [0x00B7, 0x00B7],
        [0x00BA, 0x00BA],
        [0x00C0, 0x00D6],
        [0x00D8, 0x00F6],
        [0x00F8, 0x01F5],
        [0x01FA, 0x0217],
        [0x0250, 0x02A8],
        [0x02B0, 0x02B8],
        [0x02BB, 0x02BB],
        [0x02BD, 0x02C1],
        [0x02D0, 0x02D1],
        [0x02E0, 0x02E4],
        [0x037A, 0x037A],
        [0x0386, 0x0386],
        [0x0388, 0x038A],
        [0x038C, 0x038C],
        [0x038E, 0x03A1],
        [0x03A3, 0x03CE],
        [0x03D0, 0x03D6],
        [0x03DA, 0x03DA],
        [0x03DC, 0x03DC],
        [0x03DE, 0x03DE],
        [0x03E0, 0x03E0],
        [0x03E2, 0x03F3],
        [0x0401, 0x040C],
        [0x040E, 0x044F],
        [0x0451, 0x045C],
        [0x045E, 0x0481],
        [0x0490, 0x04C4],
        [0x04C7, 0x04C8],
        [0x04CB, 0x04CC],
        [0x04D0, 0x04EB],
        [0x04EE, 0x04F5],
        [0x04F8, 0x04F9],
        [0x0531, 0x0556],
        [0x0559, 0x0559],
        [0x0561, 0x0587],
        [0x05B0, 0x05B9],
        [0x05BB, 0x05BD],
        [0x05BF, 0x05BF],
        [0x05C1, 0x05C2],
        [0x05D0, 0x05EA],
        [0x05F0, 0x05F2],
        [0x0621, 0x063A],
        [0x0640, 0x0652],
        [0x0660, 0x0669],
        [0x0670, 0x06B7],
        [0x06BA, 0x06BE],
        [0x06C0, 0x06CE],
        [0x06D0, 0x06DC],
        [0x06E5, 0x06E8],
        [0x06EA, 0x06ED],
        [0x06F0, 0x06F9],
        [0x0901, 0x0903],
        [0x0905, 0x0939],
        [0x093D, 0x094D],
        [0x0950, 0x0952],
        [0x0958, 0x0963],
        [0x0966, 0x096F],
        [0x0981, 0x0983],
        [0x0985, 0x098C],
        [0x098F, 0x0990],
        [0x0993, 0x09A8],
        [0x09AA, 0x09B0],
        [0x09B2, 0x09B2],
        [0x09B6, 0x09B9],
        [0x09BE, 0x09C4],
        [0x09C7, 0x09C8],
        [0x09CB, 0x09CD],
        [0x09DC, 0x09DD],
        [0x09DF, 0x09E3],
        [0x09E6, 0x09F1],
        [0x0A02, 0x0A02],
        [0x0A05, 0x0A0A],
        [0x0A0F, 0x0A10],
        [0x0A13, 0x0A28],
        [0x0A2A, 0x0A30],
        [0x0A32, 0x0A33],
        [0x0A35, 0x0A36],
        [0x0A38, 0x0A39],
        [0x0A3E, 0x0A42],
        [0x0A47, 0x0A48],
        [0x0A4B, 0x0A4D],
        [0x0A59, 0x0A5C],
        [0x0A5E, 0x0A5E],
        [0x0A66, 0x0A6F],
        [0x0A74, 0x0A74],
        [0x0A81, 0x0A83],
        [0x0A85, 0x0A8B],
        [0x0A8D, 0x0A8D],
        [0x0A8F, 0x0A91],
        [0x0A93, 0x0AA8],
        [0x0AAA, 0x0AB0],
        [0x0AB2, 0x0AB3],
        [0x0AB5, 0x0AB9],
        [0x0ABD, 0x0AC5],
        [0x0AC7, 0x0AC9],
        [0x0ACB, 0x0ACD],
        [0x0AD0, 0x0AD0],
        [0x0AE0, 0x0AE0],
        [0x0AE6, 0x0AEF],
        [0x0B01, 0x0B03],
        [0x0B05, 0x0B0C],
        [0x0B0F, 0x0B10],
        [0x0B13, 0x0B28],
        [0x0B2A, 0x0B30],
        [0x0B32, 0x0B33],
        [0x0B36, 0x0B39],
        [0x0B3D, 0x0B43],
        [0x0B47, 0x0B48],
        [0x0B4B, 0x0B4D],
        [0x0B5C, 0x0B5D],
        [0x0B5F, 0x0B61],
        [0x0B66, 0x0B6F],
        [0x0B82, 0x0B83],
        [0x0B85, 0x0B8A],
        [0x0B8E, 0x0B90],
        [0x0B92, 0x0B95],
        [0x0B99, 0x0B9A],
        [0x0B9C, 0x0B9C],
        [0x0B9E, 0x0B9F],
        [0x0BA3, 0x0BA4],
        [0x0BA8, 0x0BAA],
        [0x0BAE, 0x0BB5],
        [0x0BB7, 0x0BB9],
        [0x0BBE, 0x0BC2],
        [0x0BC6, 0x0BC8],
        [0x0BCA, 0x0BCD],
        [0x0BE7, 0x0BEF],
        [0x0C01, 0x0C03],
        [0x0C05, 0x0C0C],
        [0x0C0E, 0x0C10],
        [0x0C12, 0x0C28],
        [0x0C2A, 0x0C33],
        [0x0C35, 0x0C39],
        [0x0C3E, 0x0C44],
        [0x0C46, 0x0C48],
        [0x0C4A, 0x0C4D],
        [0x0C60, 0x0C61],
        [0x0C66, 0x0C6F],
        [0x0C82, 0x0C83],
        [0x0C85, 0x0C8C],
        [0x0C8E, 0x0C90],
        [0x0C92, 0x0CA8],
        [0x0CAA, 0x0CB3],
        [0x0CB5, 0x0CB9],
        [0x0CBE, 0x0CC4],
        [0x0CC6, 0x0CC8],
        [0x0CCA, 0x0CCD],
        [0x0CDE, 0x0CDE],
        [0x0CE0, 0x0CE1],
        [0x0CE6, 0x0CEF],
        [0x0D02, 0x0D03],
        [0x0D05, 0x0D0C],
        [0x0D0E, 0x0D10],
        [0x0D12, 0x0D28],
        [0x0D2A, 0x0D39],
        [0x0D3E, 0x0D43],
        [0x0D46, 0x0D48],
        [0x0D4A, 0x0D4D],
        [0x0D60, 0x0D61],
        [0x0D66, 0x0D6F],
        [0x0E01, 0x0E3A],
        [0x0E40, 0x0E5B],
        [0x0E81, 0x0E82],
        [0x0E84, 0x0E84],
        [0x0E87, 0x0E88],
        [0x0E8A, 0x0E8A],
        [0x0E8D, 0x0E8D],
        [0x0E94, 0x0E97],
        [0x0E99, 0x0E9F],
        [0x0EA1, 0x0EA3],
        [0x0EA5, 0x0EA5],
        [0x0EA7, 0x0EA7],
        [0x0EAA, 0x0EAB],
        [0x0EAD, 0x0EAE],
        [0x0EB0, 0x0EB9],
        [0x0EBB, 0x0EBD],
        [0x0EC0, 0x0EC4],
        [0x0EC6, 0x0EC6],
        [0x0EC8, 0x0ECD],
        [0x0ED0, 0x0ED9],
        [0x0EDC, 0x0EDD],
        [0x0F00, 0x0F00],
        [0x0F18, 0x0F19],
        [0x0F20, 0x0F33],
        [0x0F35, 0x0F35],
        [0x0F37, 0x0F37],
        [0x0F39, 0x0F39],
        [0x0F3E, 0x0F47],
        [0x0F49, 0x0F69],
        [0x0F71, 0x0F84],
        [0x0F86, 0x0F8B],
        [0x0F90, 0x0F95],
        [0x0F97, 0x0F97],
        [0x0F99, 0x0FAD],
        [0x0FB1, 0x0FB7],
        [0x0FB9, 0x0FB9],
        [0x10A0, 0x10C5],
        [0x10D0, 0x10F6],
        [0x1E00, 0x1E9B],
        [0x1EA0, 0x1EF9],
        [0x1F00, 0x1F15],
        [0x1F18, 0x1F1D],
        [0x1F20, 0x1F45],
        [0x1F48, 0x1F4D],
        [0x1F50, 0x1F57],
        [0x1F59, 0x1F59],
        [0x1F5B, 0x1F5B],
        [0x1F5D, 0x1F5D],
        [0x1F5F, 0x1F7D],
        [0x1F80, 0x1FB4],
        [0x1FB6, 0x1FBC],
        [0x1FBE, 0x1FBE],
        [0x1FC2, 0x1FC4],
        [0x1FC6, 0x1FCC],
        [0x1FD0, 0x1FD3],
        [0x1FD6, 0x1FDB],
        [0x1FE0, 0x1FEC],
        [0x1FF2, 0x1FF4],
        [0x1FF6, 0x1FFC],
        [0x203F, 0x2040],
        [0x207F, 0x207F],
        [0x2102, 0x2102],
        [0x2107, 0x2107],
        [0x210A, 0x2113],
        [0x2115, 0x2115],
        [0x2118, 0x211D],
        [0x2124, 0x2124],
        [0x2126, 0x2126],
        [0x2128, 0x2128],
        [0x212A, 0x2131],
        [0x2133, 0x2138],
        [0x2160, 0x2182],
        [0x3005, 0x3007],
        [0x3021, 0x3029],
        [0x3041, 0x3093],
        [0x309B, 0x309C],
        [0x30A1, 0x30F6],
        [0x30FB, 0x30FC],
        [0x3105, 0x312C],
        [0x4E00, 0x9FA5],
        [0xAC00, 0xD7A3]
    ];

    size_t high = ALPHA_TABLE.length - 1;
    // Shortcut search if c is out of range: starting with low > high
    // makes the loop below fall straight through to `return false`.
    size_t low = (c < ALPHA_TABLE[0][0] || ALPHA_TABLE[high][1] < c) ? high + 1 : 0;
    // Binary search.
    // `mid - 1` cannot wrap around: the shortcut above guarantees
    // c >= ALPHA_TABLE[0][0], so the "go left" branch is never taken at mid == 0.
    while (low <= high)
    {
        const size_t mid = low + ((high - low) >> 1);
        if (c < ALPHA_TABLE[mid][0])
            high = mid - 1;
        else if (ALPHA_TABLE[mid][1] < c)
            low = mid + 1;
        else
        {
            assert(ALPHA_TABLE[mid][0] <= c && c <= ALPHA_TABLE[mid][1]);
            return true;
        }
    }
    return false;
}

/**
 * Returns the code length of c in code units.
 *
 * Params:
 *      c = code point; must not exceed 0x10FFFF (asserts otherwise)
 * Returns:
 *      number of UTF-8 code units (1 to 4) needed to encode `c`
 */
int utf_codeLengthChar(dchar c)
{
    if (c <= 0x7F)
        return 1;
    if (c <= 0x7FF)
        return 2;
    if (c <= 0xFFFF)
        return 3;
    if (c <= 0x10FFFF)
        return 4;
    assert(false);
}

/**
 * Returns the code length of c in UTF-16 code units.
 *
 * Params:
 *      c = code point
 * Returns:
 *      1 if `c` is at most 0xFFFF, otherwise 2 (a surrogate pair)
 */
int utf_codeLengthWchar(dchar c)
{
    return c <= 0xFFFF ? 1 : 2;
}

/**
 * Returns the code length of c in code units for the encoding.
 * sz is the encoding: 1 = utf8, 2 = utf16, 4 = utf32.
 *
 * Params:
 *      sz = size in bytes of one code unit; must be 1, 2 or 4 (asserts otherwise)
 *      c = code point
 * Returns:
 *      number of code units of the selected encoding needed for `c`
 */
int utf_codeLength(int sz, dchar c)
{
    if (sz == 1)
        return utf_codeLengthChar(c);
    if (sz == 2)
        return utf_codeLengthWchar(c);
    assert(sz == 4);
    return 1;
}

/**
 * Encodes `c` as UTF-8 into the buffer starting at `s`.
 *
 * Writes between 1 and 4 code units, using the same thresholds as
 * `utf_codeLengthChar`; the caller must provide a large enough buffer.
 * No terminator is written.
 *
 * Params:
 *      s = destination buffer; must not be null
 *      c = code point to encode; must satisfy `utf_isValidDchar`
 */
void utf_encodeChar(char* s, dchar c) @system
{
    assert(s !is null);
    assert(utf_isValidDchar(c));
    if (c <= 0x7F)
    {
        s[0] = cast(char)c;
    }
    else if (c <= 0x07FF)
    {
        s[0] = cast(char)(0xC0 | (c >> 6));
        s[1] = cast(char)(0x80 | (c & 0x3F));
    }
    else if (c <= 0xFFFF)
    {
        s[0] = cast(char)(0xE0 | (c >> 12));
        s[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        s[2] = cast(char)(0x80 | (c & 0x3F));
    }
    else if (c <= 0x10FFFF)
    {
        s[0] = cast(char)(0xF0 | (c >> 18));
        s[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        s[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        s[3] = cast(char)(0x80 | (c & 0x3F));
    }
    else
        assert(0);
}

/**
 * Encodes `c` as UTF-16 into the buffer starting at `s`.
 *
 * Writes one code unit when `c` is at most 0xFFFF, otherwise a
 * high/low surrogate pair (two code units). No terminator is written.
 *
 * Params:
 *      s = destination buffer; must not be null
 *      c = code point to encode; must satisfy `utf_isValidDchar`
 */
void utf_encodeWchar(wchar* s, dchar c) @system
{
    assert(s !is null);
    assert(utf_isValidDchar(c));
    if (c <= 0xFFFF)
    {
        s[0] = cast(wchar)c;
    }
    else
    {
        s[0] = cast(wchar)((((c - 0x010000) >> 10) & 0x03FF) + 0xD800);
        s[1] = cast(wchar)(((c - 0x010000) & 0x03FF) + 0xDC00);
    }
}

/**
 * Encodes `c` into the buffer at `s` using the encoding selected by `sz`.
 *
 * Params:
 *      sz = size in bytes of one code unit: 1 = utf8, 2 = utf16, 4 = utf32
 *           (asserts on any other value)
 *      s = destination buffer, reinterpreted as `char*`, `wchar*` or `dchar*`
 *      c = code point to encode
 */
void utf_encode(int sz, void* s, dchar c) @system
{
    if (sz == 1)
        utf_encodeChar(cast(char*)s, c);
    else if (sz == 2)
        utf_encodeWchar(cast(wchar*)s, c);
    else
    {
        assert(sz == 4);
        *(cast(dchar*)s) = c;
    }
}

/********************************************
 * Checks whether a Unicode code point is a bidirectional
 * control character.
 *
 * Params:
 *      c = code point to test
 * Returns:
 *      `true` for U+061C, U+200E, U+200F, U+202A..U+202E and U+2066..U+2069
 */
bool isBidiControl(dchar c)
{
    // Source: https://www.unicode.org/versions/Unicode15.0.0, table 23-3.
    switch(c)
    {
        case '\u061C':
        case '\u200E':
        case '\u200F':
        case '\u202A': .. case '\u202E':
        case '\u2066': .. case '\u2069':
            return true;
        default:
            return false;
    }
}

/********************************************
 * Decode a UTF-8 sequence as a single UTF-32 code point.
 *
 * On success `ridx` is advanced past the whole sequence. On error `ridx`
 * has been advanced by exactly one code unit and `rresult` holds the first
 * code unit of the sequence.
 *
 * Params:
 *      s = UTF-8 sequence
 *      ridx = starting index in s[], updated to reflect number of code units decoded
 *      rresult = set to character decoded
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeChar(const(char)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-8 decoding errors
    static immutable string UTF8_DECODE_OK = null; // no error
    static immutable string UTF8_DECODE_OUTSIDE_CODE_SPACE = "Outside Unicode code space";
    static immutable string UTF8_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-8 sequence";
    static immutable string UTF8_DECODE_OVERLONG = "Overlong UTF-8 sequence";
    static immutable string UTF8_DECODE_INVALID_TRAILER = "Invalid trailing code unit";
    static immutable string UTF8_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *      0xxxxxxx
     *      110xxxxx 10xxxxxx
     *      1110xxxx 10xxxxxx 10xxxxxx
     *      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *      1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */
    // Sequence length indexed by the first code unit; 0xFF marks a code
    // unit that cannot start a sequence (trailing bytes, 0xFE, 0xFF).
    static immutable ubyte[256] UTF8_STRIDE =
    [
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,

        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,

        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,

        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,

        3,3,3,3, 3,3,3,3,
        3,3,3,3, 3,3,3,3,

        4,4,4,4, 4,4,4,4,
        5,5,5,5, 6,6,0xFF,0xFF
    ];

    assert(s !is null);
    size_t i = ridx++;

    const char u = s[i];
    // Pre-stage results for ASCII and error cases
    rresult = u;
    //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len);
    // Get expected sequence length
    const size_t n = UTF8_STRIDE[u];
    switch (n)
    {
    case 1:
        // ASCII
        return UTF8_DECODE_OK;
    case 2:
    case 3:
    case 4:
        // multi-byte UTF-8
        break;
    default:
        // 5- or 6-byte sequence, or a code unit that cannot start a sequence (0xFF)
        return UTF8_DECODE_OUTSIDE_CODE_SPACE;
    }
    if (s.length < i + n) // source too short
        return UTF8_DECODE_TRUNCATED_SEQUENCE;
    // Pick off 7 - n low bits from first code unit
    dchar c = u & ((1 << (7 - n)) - 1);
    /* The following combinations are overlong, and illegal:
     *      1100000x (10xxxxxx)
     *      11100000 100xxxxx (10xxxxxx)
     *      11110000 1000xxxx (10xxxxxx 10xxxxxx)
     *      11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
     *      11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
     */
    const char u2 = s[++i];
    // overlong combination
    // (the 0xF8 and 0xFC clauses cannot match here, because 5- and 6-byte
    // lead units were already rejected by the switch above)
    if ((u & 0xFE) == 0xC0 ||
        (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
        (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
        (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
        (u == 0xFC && (u2 & 0xFC) == 0x80))
        return UTF8_DECODE_OVERLONG;
    // Decode remaining bits; i currently indexes the second code unit and
    // m is one past the last code unit of the sequence.
    for (const m = n + i - 1; i != m; ++i)
    {
        const u3 = s[i];
        if ((u3 & 0xC0) != 0x80) // trailing bytes are 10xxxxxx
            return UTF8_DECODE_INVALID_TRAILER;
        c = (c << 6) | (u3 & 0x3F);
    }
    if (!utf_isValidDchar(c))
        return UTF8_DECODE_INVALID_CODE_POINT;
    ridx = i;
    rresult = c;
    return UTF8_DECODE_OK;
}

/********************************************
 * Decode a UTF-16 sequence as a single UTF-32 code point.
 *
 * On success `ridx` is advanced by one code unit, or by two when a
 * surrogate pair was decoded. When a surrogate error is reported, `ridx`
 * has been advanced by exactly one code unit and `rresult` holds the first
 * code unit.
 *
 * Params:
 *      s = UTF-16 sequence
 *      ridx = starting index in s[], updated to reflect number of code units decoded
 *      rresult = set to character decoded
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeWchar(const(wchar)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-16 decoding errors
    static immutable string UTF16_DECODE_OK = null; // no error
    static immutable string UTF16_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-16 sequence";
    static immutable string UTF16_DECODE_INVALID_SURROGATE = "Invalid low surrogate";
    static immutable string UTF16_DECODE_UNPAIRED_SURROGATE = "Unpaired surrogate";
    static immutable string UTF16_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    assert(s !is null);
    size_t i = ridx++;

    // Pre-stage results for single wchar and error cases
    dchar u = rresult = s[i];
    if (u < 0xD800) // Single wchar codepoint
        return UTF16_DECODE_OK;
    if (0xD800 <= u && u <= 0xDBFF) // Surrogate pair
    {
        if (s.length <= i + 1)
            return UTF16_DECODE_TRUNCATED_SEQUENCE;
        wchar u2 = s[i + 1];
        // The second unit must be a low surrogate in [0xDC00, 0xDFFF].
        // The upper bound has to be tested against u2: u is the high
        // surrogate and is already known to be at most 0xDBFF.
        if (u2 < 0xDC00 || 0xDFFF < u2)
            return UTF16_DECODE_INVALID_SURROGATE;
        // 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the +0x10000 offset
        // into the high-surrogate subtraction.
        u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
        ++ridx;
    }
    else if (0xDC00 <= u && u <= 0xDFFF)
        return UTF16_DECODE_UNPAIRED_SURROGATE;
    if (!utf_isValidDchar(u))
        return UTF16_DECODE_INVALID_CODE_POINT;
    rresult = u;
    return UTF16_DECODE_OK;
}