/********************************************
 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
 *
 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
 * wchar type.
 * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
 * the D utf.dchar type.
 *
 * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
 *
 * See_Also:
 *        $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
 *        $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
 *        $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
 *
 * Copyright: Copyright Digital Mars 2003 - 2016.
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   Walter Bright, Sean Kelly
 * Source:    $(DRUNTIMESRC core/internal/_utf.d)
 */

module core.internal.utf;

// The unittests below trace with printf when built with -debug=utf; without
// this import such a build would not compile.
debug(utf) import core.stdc.stdio : printf;

extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ ) @safe pure;

/*******************************
 * Test if c is a valid UTF-32 character.
 *
 * \uFFFE and \uFFFF are considered valid by this function,
 * as they are permitted for internal use by an application,
 * but they are not allowed for interchange by the Unicode standard.
 *
 * Params:
 *      c = the code point to test
 *
 * Returns: true if c is below the surrogate range (< 0xD800) or lies in
 *      (0xDFFF, 0x10FFFF]; false otherwise.
 */

@safe @nogc pure nothrow
bool isValidDchar(dchar c)
{
    /* Note: FFFE and FFFF are specifically permitted by the
     * Unicode standard for application internal use, but are not
     * allowed for interchange.
     * (thanks to Arcane Jill)
     */

    return c < 0xD800 ||
        (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
}

unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");
    assert(isValidDchar(cast(dchar)'a') == true);
    assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
}



/**
 * Lookup table indexed by the first byte of a UTF-8 sequence.
 *
 * Each entry is the sequence length implied by that byte, or 0xFF when the
 * byte cannot start a sequence (0x80 .. 0xBF, 0xFE and 0xFF).
 * Lengths 5 and 6 are listed in the table even though decode() rejects
 * those encodings.
 */
static immutable UTF8stride =
[
    cast(ubyte)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];

/**
 * stride() returns the length of a UTF-8 sequence starting at index i
 * in string s.
 * Returns:
 *  The number of bytes in the UTF-8 sequence or
 *  0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
@safe @nogc pure nothrow
uint stride(const scope char[] s, size_t i)
{
    return UTF8stride[s[i]];
}

/**
 * stride() returns the length of a UTF-16 sequence starting at index i
 * in string s.
 * Returns:
 *  2 if s[i] is a high surrogate (0xD800 .. 0xDBFF), otherwise 1.
 */
@safe @nogc pure nothrow
uint stride(const scope wchar[] s, size_t i)
{   uint u = s[i];
    return 1 + (u >= 0xD800 && u <= 0xDBFF);
}

/**
 * stride() returns the length of a UTF-32 sequence starting at index i
 * in string s.
 * Returns: The return value will always be 1.
 */
@safe @nogc pure nothrow
uint stride(const scope dchar[] s, size_t i)
{
    return 1;
}

/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * Params:
 *      s = the UTF-8 text
 *      i = code unit index into s
 *
 * Returns: the number of characters that precede index i.
 *
 * The failure is reported through onUnicodeError() when a byte that cannot
 * start a sequence is met, or when i turns out to lie inside a sequence.
 */
@safe pure
size_t toUCSindex(const scope char[] s, size_t i)
{
    size_t n;
    size_t j;

    for (j = 0; j < i; )
    {
        immutable step = stride(s, j);

        // 0xFF is stride()'s "not a lead byte" marker, not a length; adding
        // it to j would skip over unrelated data and could yield a bogus
        // count or a misleading error index.
        if (step == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", j);
        j += step;
        n++;
    }
    if (j > i)
    {
        // i was not on a character boundary
        onUnicodeError("invalid UTF-8 sequence", j);
    }
    return n;
}

/** ditto */
@safe pure
size_t toUCSindex(const scope wchar[] s, size_t i)
{
    size_t n;
    size_t j;

    for (j = 0; j < i; )
    {
        j += stride(s, j);
        n++;
    }
    if (j > i)
    {
        // i pointed at the second half of a surrogate pair
        onUnicodeError("invalid UTF-16 sequence", j);
    }
    return n;
}

/** ditto */
@safe @nogc pure nothrow
size_t toUCSindex(const scope dchar[] s, size_t i)
{
    return i;
}

/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 *
 * Params:
 *      s = the text
 *      n = number of characters to skip from the start of s
 *
 * Returns: the code unit index of character number n.
 *
 * The UTF-8 overload reports a byte that cannot start a sequence through
 * onUnicodeError().
 */
@safe pure
size_t toUTFindex(const scope char[] s, size_t n)
{
    size_t i;

    while (n--)
    {
        uint j = UTF8stride[s[i]];
        if (j == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", i);
        i += j;
    }
    return i;
}

/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(const scope wchar[] s, size_t n)
{
    size_t i;

    while (n--)
    {   wchar u = s[i];

        // a high surrogate occupies two code units
        i += 1 + (u >= 0xD800 && u <= 0xDBFF);
    }
    return i;
}

/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(const scope dchar[] s, size_t n)
{
    return n;
}

unittest
{
    debug(utf) printf("utf.toUCSindex.unittest\n");

    assert(toUCSindex("a\u00A9b"c, 3) == 2);
    assert(toUTFindex("a\u00A9b"c, 2) == 3);
    assert(toUCSindex("a\U000BAAAAb"w, 3) == 2);
    assert(toUTFindex("a\U000BAAAAb"w, 2) == 3);

    // A stray continuation byte must be reported where it occurs instead of
    // being stepped over by the 0xFF marker.
    char[] bad = new char[](300);
    bad[] = 'a';
    bad[0] = cast(char) 0x80;

    bool threw = false;
    try
        toUCSindex(bad, 255);
    catch (Exception e)
        threw = true;
    assert(threw);
}

/* =================== Decode ======================= */

/***************
 * Decodes and returns character starting at s[idx]. idx is advanced past the
 * decoded character. If the character is not well formed, the failure is
 * reported through onUnicodeError() (which is expected to throw) and idx
 * remains unchanged.
 *
 * Params:
 *      s   = the text to decode from
 *      idx = index of the first code unit of the character; updated on success
 *
 * Returns: the decoded character.
 */
@safe pure
dchar decode(const scope char[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    do
    {
        size_t len = s.length;
        dchar V;
        size_t i = idx;
        char u = s[i];

        if (u & 0x80)
        {   uint n;
            char u2;

            /* The following encodings are valid, except for the 5 and 6 byte
             * combinations:
             *  0xxxxxxx
             *  110xxxxx 10xxxxxx
             *  1110xxxx 10xxxxxx 10xxxxxx
             *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
             *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
             */
            // Count the leading 1 bits of the first byte; that is the
            // sequence length n.
            for (n = 1; ; n++)
            {
                if (n > 4)
                    goto Lerr;          // only do the first 4 of 6 encodings
                if (((u << n) & 0x80) == 0)
                {
                    if (n == 1)
                        goto Lerr;      // 10xxxxxx cannot start a sequence
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            V = cast(dchar)(u & ((1 << (7 - n)) - 1));

            if (i + (n - 1) >= len)
                goto Lerr;                      // off end of string

            /* The following combinations are overlong, and illegal:
             *  1100000x (10xxxxxx)
             *  11100000 100xxxxx (10xxxxxx)
             *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
             *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
             *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
             */
            u2 = s[i + 1];
            if ((u & 0xFE) == 0xC0 ||
                (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
                (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
                (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
                (u == 0xFC && (u2 & 0xFC) == 0x80))
                goto Lerr;                      // overlong combination

            for (uint j = 1; j != n; j++)
            {
                u = s[i + j];
                if ((u & 0xC0) != 0x80)
                    goto Lerr;                  // trailing bytes are 10xxxxxx
                V = (V << 6) | (u & 0x3F);
            }
            // rejects surrogates and values above 0x10FFFF
            if (!isValidDchar(V))
                goto Lerr;
            i += n;
        }
        else
        {
            V = cast(dchar) u;
            i++;
        }

        idx = i;
        return V;

      Lerr:
        onUnicodeError("invalid UTF-8 sequence", i);
    return V; // dummy return
    }

unittest
{   size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    static s4 =
    [   "\xE2\x89"c[],          // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    foreach (malformed; s4)
    {
        // Catch Exception rather than Throwable: a Throwable handler would
        // also swallow an assertion failure and make this check vacuous.
        bool rejected = false;
        try
        {
            i = 0;
            c = decode(malformed, i);
        }
        catch (Exception e)
        {
            rejected = true;
        }
        assert(rejected);
        assert(i == 0);     // idx is left untouched on failure
    }
}

/** ditto */
@safe pure
dchar decode(const scope wchar[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    do
    {
        string msg;
        dchar V;
        size_t i = idx;
        uint u = s[i];

        if (u & ~0x7F)
        {   if (u >= 0xD800 && u <= 0xDBFF)
            {   uint u2;

                if (i + 1 == s.length)
                {   msg = "surrogate UTF-16 high value past end of string";
                    goto Lerr;
                }
                u2 = s[i + 1];
                if (u2 < 0xDC00 || u2 > 0xDFFF)
                {   msg = "surrogate UTF-16 low value out of range";
                    goto Lerr;
                }
                // 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the 0x10000
                // offset into the high surrogate before shifting
                u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
                i += 2;
            }
            else if (u >= 0xDC00 && u <= 0xDFFF)
            {   msg = "unpaired surrogate UTF-16 value";
                goto Lerr;
            }
            else if (u == 0xFFFE || u == 0xFFFF)
            {   msg = "illegal UTF-16 value";
                goto Lerr;
            }
            else
                i++;
        }
        else
        {
            i++;
        }

        idx = i;
        return cast(dchar)u;

      Lerr:
        onUnicodeError(msg, i);
        return cast(dchar)u; // dummy return
    }

/** ditto */
@safe pure
dchar decode(const scope dchar[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    do
    {
        size_t i = idx;
        dchar c = s[i];

        if (!isValidDchar(c))
            goto Lerr;
        idx = i + 1;
        return c;

      Lerr:
        onUnicodeError("invalid UTF-32 value", i);
        return c; // dummy return
    }


/* =================== Encode ======================= */

/*******************************
 * Encodes character c and appends it to array s[].
 *
 * Params:
 *      s = the array to append to; rebound to the extended array
 *      c = the character to encode; must satisfy isValidDchar()
 */
@safe pure nothrow
void encode(ref char[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        char[] r = s;

        if (c <= 0x7F)
        {
            r ~= cast(char) c;
        }
        else
        {
            char[4] buf = void;     // every used element is written below
            uint L;

            if (c <= 0x7FF)
            {
                buf[0] = cast(char)(0xC0 | (c >> 6));
                buf[1] = cast(char)(0x80 | (c & 0x3F));
                L = 2;
            }
            else if (c <= 0xFFFF)
            {
                buf[0] = cast(char)(0xE0 | (c >> 12));
                buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
                buf[2] = cast(char)(0x80 | (c & 0x3F));
                L = 3;
            }
            else if (c <= 0x10FFFF)
            {
                buf[0] = cast(char)(0xF0 | (c >> 18));
                buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
                buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
                buf[3] = cast(char)(0x80 | (c & 0x3F));
                L = 4;
            }
            else
            {
                assert(0);
            }
            r ~= buf[0 .. L];
        }
        s = r;
    }

unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] s = "abcd".dup;
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");       // BUG: fix compiler

    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
}

/** ditto */
@safe pure nothrow
void encode(ref wchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        wchar[] r = s;

        if (c <= 0xFFFF)
        {
            r ~= cast(wchar) c;
        }
        else
        {
            // split into a high/low surrogate pair
            wchar[2] buf = void;

            buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
            buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
            r ~= buf;
        }
        s = r;
    }

/** ditto */
@safe pure nothrow
void encode(ref dchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        s ~= c;
    }

/**
Returns the code length of $(D c) in the encoding using $(D C) as a
code unit. The code is returned in character count, not in bytes.

Params:
    c = the character to measure; for 1-byte $(D C) it must not exceed 0x10FFFF

Returns: the number of $(D C) elements needed to encode $(D c).
 */
@safe pure nothrow @nogc
ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        if (c <= 0x7F) return 1;
        if (c <= 0x7FF) return 2;
        if (c <= 0xFFFF) return 3;
        if (c <= 0x10FFFF) return 4;
        assert(false);
    }
    else static if (C.sizeof == 2)
    {
        return c <= 0xFFFF ? 1 : 2;
    }
    else
    {
        static assert(C.sizeof == 4);
        return 1;
    }
}

/* =================== Validation ======================= */

/***********************************
Checks to see if string is well formed or not. $(D S) can be an array
 of $(D char), $(D wchar), or $(D dchar). Returns $(D false) if it is not.
 Use to check all untrusted input for correctness.

 The string is walked with decode(); any Exception raised while decoding
 makes the result $(D false).
 */
@safe pure nothrow
bool isValidString(S)(const scope S s)
{
    auto len = s.length;
    for (size_t i = 0; i < len; )
    {
        try
            decode(s, i);       // advances i past one character
        catch (Exception e)
            return false;
    }

    return true;
}

/* =================== Conversion to UTF8 ======================= */

/*******************
 * Encodes character c as UTF-8 into the caller supplied buffer.
 *
 * Params:
 *      buf = destination; must have room for the encoding (up to 4 elements)
 *      c   = the character to encode; must satisfy isValidDchar()
 *
 * Returns: the slice of buf that holds the encoded character.
 */
@safe pure nothrow @nogc
char[] toUTF8(return scope char[] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        if (c <= 0x7F)
        {
            buf[0] = cast(char) c;
            return buf[0 .. 1];
        }
        else if (c <= 0x7FF)
        {
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 2];
        }
        else if (c <= 0xFFFF)
        {
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[2] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 3];
        }
        else if (c <= 0x10FFFF)
        {
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 4];
        }
        assert(0);
    }

/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * The string overload returns s itself; the wchar[] and dchar[] overloads
 * build a new array.
 */
@safe pure nothrow
string toUTF8(return scope string s)
    in
    {
        assert(isValidString(s));
    }
    do
    {
        return s;
    }

/** ditto */
@trusted pure
string toUTF8(const scope wchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {   wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;         // fast path for ascii
        else
        {
            // first non-ASCII unit: keep the ASCII prefix and encode the
            // remainder one decoded character at a time
            r.length = i;
            foreach (dchar ch; s[i .. slen])
            {
                encode(r, ch);
            }
            break;
        }
    }
    return cast(string)r;
}

/** ditto */
@trusted pure
string toUTF8(const scope dchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {   dchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;         // fast path for ascii
        else
        {
            // first non-ASCII character: keep the ASCII prefix and encode
            // the remainder
            r.length = i;
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return cast(string)r;
}

/* =================== Conversion to UTF16 ======================= */

/****************
 * Encodes character c as UTF-16 into the caller supplied buffer.
 *
 * Params:
 *      buf = destination; must have room for the encoding (up to 2 elements)
 *      c   = the character to encode; must satisfy isValidDchar()
 *
 * Returns: the slice of buf that holds the encoded character.
 */
@safe pure nothrow @nogc
wchar[] toUTF16(return scope wchar[] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        if (c <= 0xFFFF)
        {
            buf[0] = cast(wchar) c;
            return buf[0 .. 1];
        }
        else
        {
            buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
            buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
            return buf[0 .. 2];
        }
    }

/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 * an LPWSTR or LPCWSTR argument.
 */
@trusted pure
wstring toUTF16(const scope char[] s)
{
    wchar[] r;
    size_t slen = s.length;

    if (!__ctfe)
    {
        // Reserve still does a lot if slen is zero.
        // Return early for that case.
        if (0 == slen)
            return ""w;
        r.reserve(slen);
    }
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c <= 0x7F)
        {
            i++;
            r ~= cast(wchar)c;
        }
        else
        {
            c = decode(s, i);
            encode(r, c);
        }
    }
    return cast(wstring)r;
}

/// Pointer to a UTF-16 string, as returned by toUTF16z().
alias wptr = const(wchar)*;
/** ditto */
@safe pure
wptr toUTF16z(const scope char[] s)
{
    wchar[] r;
    size_t slen = s.length;

    if (!__ctfe)
    {
        // Reserve still does a lot if slen is zero.
        // Return early for that case.
        if (0 == slen)
            return &"\0"w[0];
        r.reserve(slen + 1);
    }
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c <= 0x7F)
        {
            i++;
            r ~= cast(wchar)c;
        }
        else
        {
            c = decode(s, i);
            encode(r, c);
        }
    }
    r ~= '\000';        // terminating zero
    return &r[0];
}

/** ditto */
@safe pure nothrow
wstring toUTF16(return scope wstring s)
    in
    {
        assert(isValidString(s));
    }
    do
    {
        return s;
    }

/** ditto */
@trusted pure nothrow
wstring toUTF16(const scope dchar[] s)
{
    wchar[] r;
    size_t slen = s.length;

    if (!__ctfe)
    {
        // Reserve still does a lot if slen is zero.
        // Return early for that case.
        if (0 == slen)
            return ""w;
        r.reserve(slen);
    }
    for (size_t i = 0; i < slen; i++)
    {
        encode(r, s[i]);
    }
    return cast(wstring)r;
}

/* =================== Conversion to UTF32 ======================= */

/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 */
@trusted pure
dstring toUTF32(const scope char[] s)
{
    dchar[] r;
    size_t slen = s.length;
    size_t j = 0;

    r.length = slen;        // r[] will never be longer than s[]
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c >= 0x80)
            c = decode(s, i);
        else
            i++;            // c is ascii, no need for decode
        r[j++] = c;
    }
    return cast(dstring)r[0 .. j];
}

/** ditto */
@trusted pure
dstring toUTF32(const scope wchar[] s)
{
    dchar[] r;
    size_t slen = s.length;
    size_t j = 0;

    r.length = slen;        // r[] will never be longer than s[]
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c >= 0x80)
            c = decode(s, i);
        else
            i++;            // c is ascii, no need for decode
        r[j++] = c;
    }
    return cast(dstring)r[0 .. j];
}

/** ditto */
@safe pure nothrow
dstring toUTF32(return scope dstring s)
    in
    {
        assert(isValidString(s));
    }
    do
    {
        return s;
    }

/* ================================ tests ================================== */

unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    auto c = "hello"c[];
    auto w = toUTF16(c);
    assert(w == "hello");
    auto d = toUTF32(c);
    assert(d == "hello");

    c = toUTF8(w);
    assert(c == "hello");
    d = toUTF32(w);
    assert(d == "hello");

    c = toUTF8(d);
    assert(c == "hello");
    w = toUTF16(d);
    assert(w == "hello");


    c = "hel\u1234o";
    w = toUTF16(c);
    assert(w == "hel\u1234o");
    d = toUTF32(c);
    assert(d == "hel\u1234o");

    c = toUTF8(w);
    assert(c == "hel\u1234o");
    d = toUTF32(w);
    assert(d == "hel\u1234o");

    c = toUTF8(d);
    assert(c == "hel\u1234o");
    w = toUTF16(d);
    assert(w == "hel\u1234o");


    c = "he\U000BAAAAllo";
    w = toUTF16(c);
    //foreach (wchar c; w) printf("c = x%x\n", c);
    //foreach (wchar c; cast(wstring)"he\U000BAAAAllo") printf("c = x%x\n", c);
    assert(w == "he\U000BAAAAllo");
    d = toUTF32(c);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(w);
    assert(c == "he\U000BAAAAllo");
    d = toUTF32(w);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(d);
    assert(c == "he\U000BAAAAllo");
    w = toUTF16(d);
    assert(w == "he\U000BAAAAllo");

    wchar[2] buf;
    auto ret = toUTF16(buf, '\U000BAAAA');
    assert(ret == "\U000BAAAA");
}