/**
 * Implements the lexical analyzer, which converts source code into lexical tokens.
 *
 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
 *
 * Copyright:   Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
 * Documentation:  https://dlang.org/phobos/dmd_lexer.html
 * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
 */

module dmd.lexer;

import core.stdc.ctype;
import core.stdc.stdio;
import core.stdc.string;

import dmd.entity;
import dmd.errorsink;
import dmd.id;
import dmd.identifier;
import dmd.location;
import dmd.root.array;
import dmd.root.ctfloat;
import dmd.common.outbuffer;
import dmd.root.port;
import dmd.root.rmem;
import dmd.root.utf;
import dmd.tokens;

nothrow:

// Building as a library (DMDLIB) also turns on the LocOffset version.
version (DMDLIB)
{
    version = LocOffset;
}

/***********************************************************
 * Values to use for various magic identifiers
 * (and a few switches that the lexer consults).
 */
struct CompileEnv
{
    uint versionNumber;      /// __VERSION__
    const(char)[] date;      /// __DATE__
    const(char)[] time;      /// __TIME__
    const(char)[] vendor;    /// __VENDOR__
    const(char)[] timestamp; /// __TIMESTAMP__

    bool previewIn;          /// `in` means `[ref] scope const`, accepts rvalues
    bool ddocOutput;         /// collect embedded documentation comments
    bool masm;               /// use MASM inline asm syntax
}

/***********************************************************
 * The lexer: walks a 0- or 0x1A-terminated character buffer and produces
 * `Token`s one at a time (`nextToken`/`scan`), with lookahead (`peek`).
 */
class Lexer
{
    // Scratch buffer shared by every Lexer instance (`__gshared`); the string
    // literal lexers clear it with `setsize(0)` before filling it.
    private __gshared OutBuffer stringbuffer;

    Loc scanloc;                // for error messages
    Loc prevloc;                // location of token before current

    const(char)* p;             // current character

    Token token;                // the current token; `token.next` chains tokens scanned ahead by peek()

    // For ImportC
    bool Ccompile;              /// true if compiling ImportC

    // The following are valid only if (Ccompile == true)
    ubyte boolsize;             /// size of a C _Bool, default 1
    ubyte shortsize;            /// size of a C short, default 2
    ubyte intsize;              /// size of a C int, default 4
    ubyte longsize;             /// size of C long, 4 or 8
    ubyte long_longsize;        /// size of a C long long, default 8
    ubyte long_doublesize;      /// size of C long double, 8 or D real.sizeof
    ubyte wchar_tsize;          /// size of C wchar_t, 2 or 4

    ErrorSink eSink;            /// send error messages through this interface
    CompileEnv compileEnv;      /// environment

    private
    {
        const(char)* base;      // pointer to start of buffer
        const(char)* end;       // pointer to last element of buffer (the terminating 0 or 0x1A)
        const(char)* line;      // start of current line

        bool doDocComment;      // collect doc comment information
        bool anyToken;          // seen at least one token
        bool commentToken;      // comments are TOK.comment's
        bool tokenizeNewlines;  // newlines are turned into TOK.endOfLine's

        bool whitespaceToken;   // tokenize whitespaces (only for DMDLIB)

        int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
        int lastDocLine;        // last line of previous doc comment

        Token* tokenFreelist;   // recycled tokens linked through `Token.next`; see allocateToken()/releaseToken()
    }

  nothrow:

    /*********************
     * Creates a Lexer for the source code base[begoffset..endoffset+1].
     * The last character, base[endoffset], must be null (0) or EOF (0x1A).
     *
     * If the text at `begoffset` starts with `#!`, that first line is skipped.
     * When `compileEnv` is null, `versionNumber` defaults to 1 and `vendor` to "DLF".
     *
     * Params:
     *  filename = used for error messages
     *  base = source code, must be terminated by a null (0) or EOF (0x1A) character
     *  begoffset = starting offset into base[]
     *  endoffset = the last offset to read into base[]
     *  doDocComment = handle documentation comments
     *  commentToken = comments become TOK.comment's
     *  errorSink = where error messages go, must not be null
     *  compileEnv = version, vendor, date, time, etc.
     */
    this(const(char)* filename, const(char)* base, size_t begoffset,
        size_t endoffset, bool doDocComment, bool commentToken,
        ErrorSink errorSink,
        const CompileEnv* compileEnv) scope
    {
        scanloc = Loc(filename, 1, 1);
        // debug printf("Lexer::Lexer(%p)\n", base);
        // debug printf("lexer.filename = %s\n", filename);
        token = Token.init;
        this.base = base;
        this.end = base + endoffset;
        p = base + begoffset;
        line = p;
        this.doDocComment = doDocComment;
        this.commentToken = commentToken;
        this.tokenizeNewlines = false;
        this.inTokenStringConstant = 0;
        this.lastDocLine = 0;
        this.eSink = errorSink;
        assert(errorSink);
        if (compileEnv)
            this.compileEnv = *compileEnv;
        else
        {
            // no environment supplied: fall back to fixed placeholder values
            this.compileEnv.versionNumber = 1;
            this.compileEnv.vendor = "DLF";
        }
        //initKeywords();
        /* If first line starts with '#!', ignore the line
         */
        if (p && p[0] == '#' && p[1] == '!')
        {
            p += 2;
            for (;;p++)
            {
                char c = *p;
                switch (c)
                {
                case '\n':
                    p++;        // step over the newline, then leave the loop like the EOF cases
                    goto case;
                case 0:
                case 0x1A:
                    break;      // leaves the switch; the `break` below leaves the loop

                default:
                    // Note: We do allow malformed UTF-8 on shebang line.
                    // It could have a meaning if the native system
                    // encoding is not Unicode. See test compilable/test13512.d
                    // for example encoded in KOI-8.
                    // We also allow bidirectional control characters.
                    // We do not execute the shebang line, so it can't be used
                    // to conceal code. It is up to the shell to sanitize it.
                    continue;
                }
                break;
            }
            endOfLine();
        }
    }

    /***********************
     * Alternative entry point for DMDLIB, adds `whitespaceToken`.
     * Forwards everything else to the primary constructor.
     */
    this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset,
        bool doDocComment, bool commentToken, bool whitespaceToken,
        ErrorSink errorSink, const CompileEnv* compileEnv = null
        )
    {
        this(filename, base, begoffset, endoffset, doDocComment, commentToken, errorSink, compileEnv);
        this.whitespaceToken = whitespaceToken;
    }

    /******************
     * Used for unittests for a mock Lexer.
     * Only `eSink` is set; no source buffer is attached.
     */
    this(ErrorSink errorSink) scope @safe { assert(errorSink); this.eSink = errorSink; }

    /**************************************
     * Reset lexer to lex #define's
     *
     * Params:
     *  slice = the text to lex; the character just past it (`slice.ptr[slice.length]`)
     *          must be 0, which is asserted
     */
    final void resetDefineLines(const(char)[] slice)
    {
        base = slice.ptr;
        end = base + slice.length;
        assert(*end == 0);
        p = base;
        line = p;
        tokenizeNewlines = true;        // the next newline is reported as a TOK.endOfLine token
        inTokenStringConstant = 0;
        lastDocLine = 0;
        scanloc = Loc("#defines", 1, 1);
    }

    /**********************************
     * Set up for next #define line.
     * p should be at start of next line.
     * Re-arms `tokenizeNewlines`, which scan() clears each time it emits TOK.endOfLine.
     */
    final void nextDefineLine()
    {
        tokenizeNewlines = true;
    }

    /***************
     * Range interface
     */

    /// Returns: true once the current token is TOK.endOfFile
    final bool empty() const pure @property @nogc @safe
    {
        return front() == TOK.endOfFile;
    }

    /// Returns: the kind (`TOK`) of the current token
    final TOK front() const pure @property @nogc @safe
    {
        return token.value;
    }

    /// Advances to the next token (same as `nextToken()`, discarding the result)
    final void popFront()
    {
        nextToken();
    }

    /// Returns: a `Token` with a null `next` link - taken from the freelist
    /// when one is available, otherwise newly allocated.
    Token* allocateToken() pure nothrow @safe
    {
        if (tokenFreelist)
        {
            // pop the head of the freelist and detach it
            Token* t = tokenFreelist;
            tokenFreelist = t.next;
            t.next = null;
            return t;
        }
        return new Token();
    }

    /// Frees the given token by returning it to the freelist.
private void releaseToken(Token* token) pure nothrow @nogc @safe
{
    // With the GC enabled the token is reset first - presumably so a recycled
    // token does not keep stale references alive (NOTE(review): confirm).
    if (mem.isGCEnabled)
        *token = Token.init;
    // push onto the head of the freelist
    token.next = tokenFreelist;
    tokenFreelist = token;
}

/***********************
 * Advance to the next token.
 *
 * If peek() has already scanned ahead, the first lookahead token is copied
 * into `token` and its storage is recycled; otherwise the next token is
 * scanned from the buffer. `prevloc` is set to the location of the token
 * being left.
 *
 * Returns: the kind of the new current token
 */
final TOK nextToken()
{
    prevloc = token.loc;
    if (token.next)
    {
        Token* t = token.next;
        memcpy(&token, t, Token.sizeof);    // also copies t.next, keeping the rest of the lookahead chain
        releaseToken(t);
    }
    else
    {
        scan(&token);
    }
    //printf(token.toChars());
    return token.value;
}

/***********************
 * Look ahead at next token's value.
 */
final TOK peekNext()
{
    return peek(&token).value;
}

/***********************
 * Look 2 tokens ahead at value.
 */
final TOK peekNext2()
{
    Token* t = peek(&token);
    return peek(t).value;
}

/****************************
 * Turn next token in buffer into a token.
 *
 * White space and (unless `commentToken` is set) comments are skipped.
 * Documentation comments are handed to getDocComment() when `doDocComment`
 * is set. At end of input (0 or 0x1A) TOK.endOfFile is produced without
 * advancing `p`, so repeated calls keep returning it.
 *
 * Params:
 *  t = the token to set the resulting Token to
 */
final void scan(Token* t)
{
    const lastLine = scanloc.linnum;
    Loc startLoc;
    t.blockComment = null;
    t.lineComment = null;

    while (1)
    {
        t.ptr = p;
        //printf("p = %p, *p = '%c'\n",p,*p);
        t.loc = loc();
        switch (*p)
        {
        case 0:
        case 0x1A:
            t.value = TOK.endOfFile; // end of file
            // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
            return;
        case ' ':
            // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
            while ((cast(size_t)p) % uint.sizeof)
            {
                if (*p != ' ')
                    goto LendSkipFourSpaces;
                p++;
            }
            while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20
                p += 4;
            // Skip over any remaining space on the line.
            while (*p == ' ')
                p++;
        LendSkipFourSpaces:
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\t':
        case '\v':
        case '\f':
            p++;
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\r':
            p++;
            if (*p != '\n') // if CR stands by itself
            {
                endOfLine();
                if (tokenizeNewlines)
                {
                    t.value = TOK.endOfLine;
                    tokenizeNewlines = false;
                    return;
                }
            }
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\n':
            p++;
            endOfLine();
            if (tokenizeNewlines)
            {
                t.value = TOK.endOfLine;
                tokenizeNewlines = false;
                return;
            }
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space

        case '\\':
            if (Ccompile && (p[1] == '\r' || p[1] == '\n'))
            {
                ++p; // ignore \ followed by new line, like VC does
                continue;
            }
            goto default;

        case '0':
            if (!isZeroSecond(p[1])) // if numeric literal does not continue
            {
                ++p;
                t.unsvalue = 0;
                t.value = TOK.int32Literal;
                return;
            }
            goto Lnumber;

        case '1': .. case '9':
            if (!isDigitSecond(p[1])) // if numeric literal does not continue
            {
                t.unsvalue = *p - '0';
                ++p;
                t.value = TOK.int32Literal;
                return;
            }
        Lnumber:
            t.value = number(t);
            return;

        case '\'':
            if (issinglechar(p[1]) && p[2] == '\'')
            {
                t.unsvalue = p[1]; // simple one character literal
                t.value = TOK.charLiteral;
                p += 3;
            }
            else if (Ccompile)
            {
                clexerCharConstant(*t, 0);
            }
            else
            {
                t.value = charConstant(t);
            }
            return;

        case 'u':
        case 'U':
        case 'L':
            if (!Ccompile)
                goto case_ident;
            if (p[1] == '\'') // C wide character constant
            {
                char c = *p;
                // Convert L to u or U according to the target's wchar_t:
                // a 2 byte wchar_t lexes like u'', anything else like U''.
                // This is the same mapping the wide string literal branch
                // below uses ('w' for size 2, 'd' otherwise); the test was
                // previously inverted (size 4 selected 'u').
                if (c == 'L')
                    c = (wchar_tsize == 2) ? 'u' : 'U';
                ++p;
                clexerCharConstant(*t, c);
                return;
            }
            else if (p[1] == '\"') // C wide string literal
            {
                const c = *p;
                ++p;
                escapeStringConstant(t);
                t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
                            c == 'u' ? 'w' :
                            'd';
                return;
            }
            else if (p[1] == '8' && p[2] == '\"') // C UTF-8 string literal
            {
                p += 2;
                escapeStringConstant(t);
                return;
            }
            goto case_ident;

        case 'r':
            if (Ccompile || p[1] != '"')
                goto case_ident;
            p++;
            goto case '`';
        case '`':
            if (Ccompile)
                goto default;
            wysiwygStringConstant(t);
            return;
        case 'x':
            if (p[1] != '"')
                goto case_ident;
            p++;
            t.value = hexStringConstant(t);
            return;
        case 'q':
            if (Ccompile)
                goto case_ident;
            if (p[1] == '"')
            {
                p++;
                delimitedStringConstant(t);
                return;
            }
            else if (p[1] == '{')
            {
                p++;
                tokenStringConstant(t);
                return;
            }
            else
                goto case_ident;
        case '"':
            escapeStringConstant(t);
            return;
        // Identifier start characters. 'q', 'r', 'u', 'x', 'L' and 'U' are
        // handled above because they can also introduce literals.
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
        case 'm': case 'n': case 'o': case 'p':
        /*case 'q': case 'r':*/
        case 's': case 't':
        //case 'u':
        case 'v': case 'w':
        /*case 'x':*/
        case 'y': case 'z':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        case 'G': case 'H': case 'I': case 'J': case 'K':
        //case 'L':
        case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
        case 'S': case 'T':
        //case 'U':
        case 'V': case 'W': case 'X': case 'Y': case 'Z':
        case '_':
        case_ident:
            {
                // consume the rest of the identifier
                while (1)
                {
                    const c = *++p;
                    if (isidchar(c))
                        continue;
                    else if (c & 0x80)
                    {
                        const s = p;
                        const u = decodeUTF();
                        if (isUniAlpha(u))
                            continue;
                        error(t.loc, "char 0x%04x not allowed in identifier", u);
                        p = s; // back up so the offending character is lexed next
                    }
                    break;
                }
                Identifier id = Identifier.idPool((cast(char*)t.ptr)[0 .. p - t.ptr], false);
                t.ident = id;
                t.value = cast(TOK)id.getValue();

                anyToken = true;

                /* Different keywords for C and D
                 */
                if (Ccompile)
                {
                    if (t.value != TOK.identifier)
                    {
                        t.value = Ckeywords[t.value];  // filter out D keywords
                    }
                }
                else if (t.value >= FirstCKeyword)
                    t.value = TOK.identifier;       // filter out C keywords

                else if (*t.ptr == '_') // if special identifier token
                {
                    // turn `t` into a string literal token holding `s`
                    void toToken(const(char)[] s)
                    {
                        t.value = TOK.string_;
                        t.ustring = s.ptr;
                        t.len = cast(uint)s.length;
                        t.postfix = 0;
                    }

                    if (id == Id.DATE)
                        toToken(compileEnv.date);
                    else if (id == Id.TIME)
                        toToken(compileEnv.time);
                    else if (id == Id.VENDOR)
                        toToken(compileEnv.vendor);
                    else if (id == Id.TIMESTAMP)
                        toToken(compileEnv.timestamp);
                    else if (id == Id.VERSIONX)
                    {
                        t.value = TOK.int64Literal;
                        t.unsvalue = compileEnv.versionNumber;
                    }
                    else if (id == Id.EOFX)
                    {
                        t.value = TOK.endOfFile;
                        // Advance scanner to end of file
                        while (!(*p == 0 || *p == 0x1A))
                            p++;
                    }
                }
                //printf("t.value = %d\n",t.value);
                return;
            }
        case '/':
            p++;
            switch (*p)
            {
            case '=':
                p++;
                t.value = TOK.divAssign;
                return;
            case '*':
                // block comment
                p++;
                startLoc = loc();
                while (1)
                {
                    // advance to the next '/'
                    while (1)
                    {
                        const c = *p;
                        switch (c)
                        {
                        case '/':
                            break;
                        case '\n':
                            endOfLine();
                            p++;
                            continue;
                        case '\r':
                            p++;
                            if (*p != '\n')
                                endOfLine();
                            continue;
                        case 0:
                        case 0x1A:
                            error(t.loc, "unterminated /* */ comment");
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                const u = decodeUTF();
                                if (u == PS || u == LS)
                                    endOfLine();
                            }
                            p++;
                            continue;
                        }
                        break;
                    }
                    p++;
                    // the comment ends at a '/' preceded by '*', unless that
                    // '*' is the one that opened the comment
                    if (p[-2] == '*' && p - 3 != t.ptr)
                        break;
                }
                if (commentToken)
                {
                    t.loc = startLoc;
                    t.value = TOK.comment;
                    return;
                }
                else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
                {
                    // doc comment: opens with two stars and is not the 4 character empty comment
                    getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                    lastDocLine = scanloc.linnum;
                }
                continue;
            case '/': // do // style comments
                startLoc = loc();
                while (1)
                {
                    const c = *++p;
                    switch (c)
                    {
                    case '\n':
                        break;
                    case '\r':
                        if (p[1] == '\n')
                            p++;
                        break;
                    case 0:
                    case 0x1A:
                        if (commentToken)
                        {
                            p = end;
                            t.loc = startLoc;
                            t.value = TOK.comment;
                            return;
                        }
                        if (doDocComment && t.ptr[2] == '/')
                        {
                            getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                            lastDocLine = scanloc.linnum;
                        }
                        p = end;
                        t.loc = loc();
                        t.value = TOK.endOfFile;
                        return;
                    default:
                        if (c & 0x80)
                        {
                            const u = decodeUTF();
                            if (u == PS || u == LS)
                                break;
                        }
                        continue;
                    }
                    break;
                }
                if (commentToken)
                {
                    version (DMDLIB) {}
                    else
                    {
                        p++;
                        endOfLine();
                    }
                    t.loc = startLoc;
                    t.value = TOK.comment;
                    return;
                }
                if (doDocComment && t.ptr[2] == '/')
                {
                    getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                    lastDocLine = scanloc.linnum;
                }
                p++;
                endOfLine();
                continue;
            case '+':
                // nesting comment (D only; in C this is `/` followed by `+`)
                if (!Ccompile)
                {
                    int nest;
                    startLoc = loc();
                    p++;
                    nest = 1;
                    while (1)
                    {
                        char c = *p;
                        switch (c)
                        {
                        case '/':
                            p++;
                            if (*p == '+')
                            {
                                p++;
                                nest++;
                            }
                            continue;
                        case '+':
                            p++;
                            if (*p == '/')
                            {
                                p++;
                                if (--nest == 0)
                                    break;
                            }
                            continue;
                        case '\r':
                            p++;
                            if (*p != '\n')
                                endOfLine();
                            continue;
                        case '\n':
                            endOfLine();
                            p++;
                            continue;
                        case 0:
                        case 0x1A:
                            error(t.loc, "unterminated /+ +/ comment");
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                uint u = decodeUTF();
                                if (u == PS || u == LS)
                                    endOfLine();
                            }
                            p++;
                            continue;
                        }
                        break;
                    }
                    if (commentToken)
                    {
                        t.loc = startLoc;
                        t.value = TOK.comment;
                        return;
                    }
                    if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
                    {
                        // doc comment: opens with two pluses and is not the 4 character empty comment
                        getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                        lastDocLine = scanloc.linnum;
                    }
                    continue;
                }
                break;
            default:
                break;
            }
            t.value = TOK.div;
            return;
        case '.':
            p++;
            if (isdigit(*p))
            {
                /* Note that we don't allow ._1 and ._ as being
                 * valid floating point numbers.
                 */
                p--;
                t.value = inreal(t);
            }
            else if (p[0] == '.')
            {
                if (p[1] == '.')
                {
                    p += 2;
                    t.value = TOK.dotDotDot;
                }
                else
                {
                    p++;
                    t.value = TOK.slice;
                }
            }
            else
                t.value = TOK.dot;
            return;
        case '&':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.andAssign;
            }
            else if (*p == '&')
            {
                p++;
                t.value = TOK.andAnd;
            }
            else
                t.value = TOK.and;
            return;
        case '|':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.orAssign;
            }
            else if (*p == '|')
            {
                p++;
                t.value = TOK.orOr;
            }
            else
                t.value = TOK.or;
            return;
        case '-':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.minAssign;
            }
            else if (*p == '-')
            {
                p++;
                t.value = TOK.minusMinus;
            }
            else if (*p == '>')
            {
                ++p;
                t.value = TOK.arrow;
            }
            else
                t.value = TOK.min;
            return;
        case '+':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.addAssign;
            }
            else if (*p == '+')
            {
                p++;
                t.value = TOK.plusPlus;
            }
            else
                t.value = TOK.add;
            return;
        case '<':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.lessOrEqual; // <=
            }
            else if (*p == '<')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.leftShiftAssign; // <<=
                }
                else
                    t.value = TOK.leftShift; // <<
            }
            else if (*p == ':' && Ccompile)
            {
                ++p;
                t.value = TOK.leftBracket;  // <:
            }
            else if (*p == '%' && Ccompile)
            {
                ++p;
                t.value = TOK.leftCurly;    // <%
            }
            else
                t.value = TOK.lessThan; // <
            return;
        case '>':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.greaterOrEqual; // >=
            }
            else if (*p == '>')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.rightShiftAssign; // >>=
                }
                else if (*p == '>')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.unsignedRightShiftAssign; // >>>=
                    }
                    else
                        t.value = TOK.unsignedRightShift; // >>>
                }
                else
                    t.value = TOK.rightShift; // >>
            }
            else
                t.value = TOK.greaterThan; // >
            return;
        case '!':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.notEqual; // !=
            }
            else
                t.value = TOK.not; // !
            return;
        case '=':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.equal; // ==
            }
            else if (*p == '>')
            {
                p++;
                t.value = TOK.goesTo; // =>
            }
            else
                t.value = TOK.assign; // =
            return;
        case '~':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.concatenateAssign; // ~=
            }
            else
                t.value = TOK.tilde; // ~
            return;
        case '^':
            p++;
            if (*p == '^')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.powAssign; // ^^=
                }
                else
                    t.value = TOK.pow; // ^^
            }
            else if (*p == '=')
            {
                p++;
                t.value = TOK.xorAssign; // ^=
            }
            else
                t.value = TOK.xor; // ^
            return;
        case '(':
            p++;
            t.value = TOK.leftParenthesis;
            return;
        case ')':
            p++;
            t.value = TOK.rightParenthesis;
            return;
        case '[':
            p++;
            t.value = TOK.leftBracket;
            return;
        case ']':
            p++;
            t.value = TOK.rightBracket;
            return;
        case '{':
            p++;
            t.value = TOK.leftCurly;
            return;
        case '}':
            p++;
            t.value = TOK.rightCurly;
            return;
        case '?':
            p++;
            t.value = TOK.question;
            return;
        case ',':
            p++;
            t.value = TOK.comma;
            return;
        case ';':
            p++;
            t.value = TOK.semicolon;
            return;
        case ':':
            p++;
            if (*p == ':')
            {
                ++p;
                t.value = TOK.colonColon;
            }
            else if (*p == '>' && Ccompile)
            {
                ++p;
                t.value = TOK.rightBracket;
            }
            else
                t.value = TOK.colon;
            return;
        case '$':
            p++;
            t.value = TOK.dollar;
            return;
        case '@':
            p++;
            t.value = TOK.at;
            return;
        case '*':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.mulAssign;
            }
            else
                t.value = TOK.mul;
            return;
        case '%':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.modAssign;
            }
            else if (*p == '>' && Ccompile)
            {
                ++p;
                t.value = TOK.rightCurly;
            }
            else if (*p == ':' && Ccompile)
            {
                goto case '#';  // %: means #
            }
            else
                t.value = TOK.mod;
            return;
        case '#':
            {
                // https://issues.dlang.org/show_bug.cgi?id=22825
                // Special token sequences are terminated by newlines,
                // and should not be skipped over.
                this.tokenizeNewlines = true;
                p++;
                if (parseSpecialTokenSequence())
                    continue;
                t.value = TOK.pound;
                return;
            }
        default:
            {
                dchar c = *p;
                if (c & 0x80)
                {
                    c = decodeUTF();
                    // Check for start of unicode identifier
                    if (isUniAlpha(c))
                        goto case_ident;
                    if (c == PS || c == LS)
                    {
                        endOfLine();
                        p++;
                        if (tokenizeNewlines)
                        {
                            t.value = TOK.endOfLine;
                            tokenizeNewlines = false;
                            return;
                        }
                        continue;
                    }
                }
                // report the stray character, skip it and keep scanning
                if (c < 0x80 && isprint(c))
                    error(t.loc, "character '%c' is not a valid token", c);
                else
                    error(t.loc, "character 0x%02x is not a valid token", c);
                p++;
                continue;
                // assert(0);
            }
        }
    }
}

/***********************
 * Look ahead one token past `ct` without consuming anything.
 *
 * The token following `ct` is scanned on first request and linked in through
 * `ct.next`, so later calls (and nextToken()) reuse it.
 *
 * Params:
 *  ct = token to look past
 * Returns: the token following `ct`
 */
final Token* peek(Token* ct)
{
    Token* t;
    if (ct.next)
        t = ct.next;
    else
    {
        t = allocateToken();
        scan(t);
        ct.next = t;
    }
    return t;
}

/*********************************
 * tk is on the opening (.
 * Look ahead and return token that is past the closing ).
*/
final Token* peekPastParen(Token* tk)
{
    int parenDepth = 1;     // the opening parenthesis `tk` starts on
    int braceDepth = 0;
    for (;;)
    {
        tk = peek(tk);
        const kind = tk.value;
        if (kind == TOK.leftParenthesis)
        {
            ++parenDepth;
        }
        else if (kind == TOK.rightParenthesis)
        {
            // matching close found: the answer is the token after it
            if (--parenDepth == 0)
                return peek(tk);
        }
        else if (kind == TOK.leftCurly)
        {
            ++braceDepth;
        }
        else if (kind == TOK.rightCurly)
        {
            // a closing brace that was never opened ends the search
            if (--braceDepth < 0)
                return tk;
        }
        else if (kind == TOK.semicolon)
        {
            // a semicolon outside of any braces ends the search
            if (braceDepth == 0)
                return tk;
        }
        else if (kind == TOK.endOfFile)
        {
            return tk;
        }
    }
}

/*******************************************
 * Parse the escape sequence at the current position `p`, reporting errors
 * at the location of the current token.
 */
private uint escapeSequence(out dchar c2)
{
    return Lexer.escapeSequence(token.loc, p, Ccompile, c2);
}

/********
 * Decode one string literal escape sequence (the text following the backslash).
 * D https://dlang.org/spec/lex.html#escape_sequences
 * C11 6.4.4.4
 * Params:
 *  loc = location to use for error messages
 *  sequence = points at the character after the backslash; on return it
 *             points past the end of the escape sequence
 *  Ccompile = true to apply the C11 rules
 *  c2 = receives the second `dchar` of a named entity made of 2 code points,
 *       otherwise stays `dchar.init`
 * Returns:
 *  the character the escape sequence stands for
 */
private dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile, out dchar c2)
{
    const(char)* p = sequence;  // scan with a local cursor
    scope(exit) sequence = p;   // hand the final position back on every exit

    // value of a single hex digit; the caller has already checked ishex()
    static uint hexDigitValue(uint h)
    {
        if (isdigit(cast(char)h))
            return h - '0';
        if (islower(h))
            return h - ('a' - 10);
        return h - ('A' - 10);
    }

    uint c = *p;
    switch (c)
    {
    case '\'':
    case '"':
    case '?':
    case '\\':
        // these stand for themselves
        p++;
        break;
    case 'a':
        c = 7;
        p++;
        break;
    case 'b':
        c = 8;
        p++;
        break;
    case 'f':
        c = 12;
        p++;
        break;
    case 'n':
        c = 10;
        p++;
        break;
    case 'r':
        c = 13;
        p++;
        break;
    case 't':
        c = 9;
        p++;
        break;
    case 'v':
        c = 11;
        p++;
        break;

    case 'u':
    case 'U':
    case 'x':
    {
        // number of hex digits the introducer calls for
        const int ndigits = (c == 'u') ? 4 : (c == 'U') ? 8 : 2;
        c = *++p;
        if (!ishex(cast(char)c))
        {
            error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
            p++;
            break;
        }

        uint value = 0;
        if (Ccompile && ndigits == 2)
        {
            /* C11 6.4.4.4-7 one to infinity hex digits
             */
            for (; ishex(cast(char)c); c = *++p)
                value = value * 16 + hexDigitValue(c);
        }
        else
        {
            // exactly `ndigits` digits are expected
            for (int seen = 0; ; )
            {
                value = value * 16 + hexDigitValue(c);
                c = *++p;
                if (++seen == ndigits)
                    break;
                if (!ishex(cast(char)c))
                {
                    error(loc, "escape hex sequence has %d hex digits instead of %d", seen, ndigits);
                    break;
                }
            }
            if (ndigits != 2 && !utf_isValidDchar(value))
            {
                error(loc, "invalid UTF character \\U%08x", value);
                value = '?'; // recover with valid UTF character
            }
        }
        c = value;
        break;
    }

    case '&':
    {
        if (Ccompile)
            goto default;

        // named character entity: letters, then letters or digits, up to ';'
        const idstart = ++p;
        while (isalpha(*p) || (p != idstart && isdigit(*p)))
            p++;
        if (*p != ';')
        {
            error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
            c = '?';
            break;
        }

        auto entity = HtmlNamedEntity(idstart[0 .. p - idstart]);
        c = entity[0];
        if (entity == entity.init)
        {
            error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
            c = '?';
        }
        if (entity[1] != entity.init[1])
            c2 = entity[1];
        p++; // step over the ';'
        break;
    }

    case 0:
    case 0x1A:
        // end of file: yield the backslash itself and consume nothing
        c = '\\';
        break;

    default:
    {
        if (!isoctal(cast(char)c))
        {
            error(loc, "undefined escape sequence \\%c", c);
            p++;
            break;
        }

        // one to three octal digits
        uint value = 0;
        for (int n = 0; n < 3 && isoctal(cast(char)c); ++n)
        {
            value = value * 8 + (c - '0');
            c = *++p;
        }
        c = value;
        if (c > 0xFF)
            error(loc, "escape octal sequence \\%03o is larger than \\377", c);
        break;
    }
    }
    return c;
}

/**
Lex a wysiwyg string. `p` must be pointing to the first character before the
contents of the string literal. The character pointed to by `p` will be used as
the terminating character (i.e. backtick or double-quote).
Params:
    result = pointer to the token that accepts the result
*/
private void wysiwygStringConstant(Token* result)
{
    result.value = TOK.string_;
    Loc start = loc();
    const closer = *p++;    // the opening quote doubles as the terminator
    stringbuffer.setsize(0);
    for (;;)
    {
        dchar ch = *p++;
        if (ch == '\n')
        {
            endOfLine();
        }
        else if (ch == '\r')
        {
            if (*p == '\n')
                continue;   // drop the CR of a CR LF pair
            ch = '\n';      // a lone CR is stored as a line feed
            endOfLine();
        }
        else if (ch == 0 || ch == 0x1A)
        {
            error("unterminated string constant starting at %s", start.toChars());
            result.setString();
            // step back so that `p` rests on the EOF character
            p--;
            return;
        }
        else if (ch == closer)
        {
            result.setString(stringbuffer);
            stringPostfix(result);
            return;
        }
        else if (ch & 0x80)
        {
            // multi-byte sequence: decode it from its first byte
            p--;
            const u = decodeUTF();
            p++;
            if (u == PS || u == LS)
                endOfLine();
            stringbuffer.writeUTF8(u);
            continue;
        }
        stringbuffer.writeByte(ch);
    }
}

/**************************************
 * Lex hex strings:
 *      x"0A ae 34FE BD"
 *
 * `p` must be on the opening double-quote. White space and line ends between
 * digits are skipped; every pair of hex digits yields one byte.
 *
 * Params:
 *  t = the token that receives the string
 * Returns: TOK.hexadecimalString
 */
final TOK hexStringConstant(Token* t)
{
    Loc start = loc();
    uint digitCount = 0;
    uint pending = ~0; // dead assignment, needed to suppress warning
    p++;
    stringbuffer.setsize(0);
    while (1)
    {
        dchar ch = *p++;

        if (ch == ' ' || ch == '\t' || ch == '\v' || ch == '\f')
            continue; // skip white space

        if (ch == '\r' && *p == '\n')
            continue; // the LF that follows does the line accounting

        if (ch == '\r' || ch == '\n')
        {
            // a lone CR counts as a line end as well
            endOfLine();
            continue;
        }

        if (ch == 0 || ch == 0x1A)
        {
            error("unterminated string constant starting at %s", start.toChars());
            t.setString();
            // step back: the 0 or 0x1A character is the TOK.endOfFile token that must be lexed next
            p--;
            return TOK.hexadecimalString;
        }

        if (ch == '"')
        {
            if (digitCount & 1)
            {
                error("odd number (%d) of hex characters in hex string", digitCount);
                stringbuffer.writeByte(pending);
            }
            t.setString(stringbuffer);
            t.postfix = 'h';
            stringPostfix(t);
            return TOK.hexadecimalString;
        }

        // anything else is taken as a digit; bad characters are reported
        // but still counted
        if (ch >= '0' && ch <= '9')
        {
            ch -= '0';
        }
        else if (ch >= 'a' && ch <= 'f')
        {
            ch -= 'a' - 10;
        }
        else if (ch >= 'A' && ch <= 'F')
        {
            ch -= 'A' - 10;
        }
        else if (ch & 0x80)
        {
            p--;
            const u = decodeUTF();
            p++;
            if (u == PS || u == LS)
                endOfLine();
            else
                error("non-hex character \\u%04x in hex string", u);
        }
        else
        {
            error("non-hex character '%c' in hex string", ch);
        }

        if (digitCount & 1)
        {
            // second digit of a pair: emit the completed byte
            pending = (pending << 4) | ch;
            stringbuffer.writeByte(pending);
        }
        else
        {
            pending = ch;
        }
        digitCount++;
    }
    assert(0); // see bug 15731
}

/**
Lex a delimited string. Some examples of delimited strings are:
---
q"(foo(xxx))"      // "foo(xxx)"
q"[foo$(LPAREN)]"  // "foo$(LPAREN)"
q"/foo]/"          // "foo]"
q"HERE
foo
HERE"              // "foo\n"
---
It is assumed that `p` points to the opening double-quote '"'.
1574 Params: 1575 result = pointer to the token that accepts the result 1576 */ 1577 private void delimitedStringConstant(Token* result) 1578 { 1579 result.value = TOK.string_; 1580 Loc start = loc(); 1581 dchar delimleft = 0; 1582 dchar delimright = 0; 1583 uint nest = 1; 1584 uint nestcount = ~0; // dead assignment, needed to suppress warning 1585 Identifier hereid = null; 1586 uint blankrol = 0; 1587 uint startline = 0; 1588 p++; 1589 stringbuffer.setsize(0); 1590 while (1) 1591 { 1592 const s = p; 1593 dchar c = *p++; 1594 //printf("c = '%c'\n", c); 1595 switch (c) 1596 { 1597 case '\n': 1598 Lnextline: 1599 endOfLine(); 1600 startline = 1; 1601 if (blankrol) 1602 { 1603 blankrol = 0; 1604 continue; 1605 } 1606 if (hereid) 1607 { 1608 stringbuffer.writeUTF8(c); 1609 continue; 1610 } 1611 break; 1612 case '\r': 1613 if (*p == '\n') 1614 continue; // ignore 1615 c = '\n'; // treat EndOfLine as \n character 1616 goto Lnextline; 1617 case 0: 1618 case 0x1A: 1619 error("unterminated delimited string constant starting at %s", start.toChars()); 1620 result.setString(); 1621 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token). 
1622 p--; 1623 return; 1624 default: 1625 if (c & 0x80) 1626 { 1627 p--; 1628 c = decodeUTF(); 1629 p++; 1630 if (c == PS || c == LS) 1631 goto Lnextline; 1632 } 1633 break; 1634 } 1635 if (delimleft == 0) 1636 { 1637 delimleft = c; 1638 nest = 1; 1639 nestcount = 1; 1640 if (c == '(') 1641 delimright = ')'; 1642 else if (c == '{') 1643 delimright = '}'; 1644 else if (c == '[') 1645 delimright = ']'; 1646 else if (c == '<') 1647 delimright = '>'; 1648 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) 1649 { 1650 // Start of identifier; must be a heredoc 1651 Token tok; 1652 p = s; 1653 scan(&tok); // read in heredoc identifier 1654 if (tok.value != TOK.identifier) 1655 { 1656 error("identifier expected for heredoc, not %s", tok.toChars()); 1657 delimright = c; 1658 } 1659 else 1660 { 1661 hereid = tok.ident; 1662 //printf("hereid = '%s'\n", hereid.toChars()); 1663 blankrol = 1; 1664 } 1665 nest = 0; 1666 } 1667 else 1668 { 1669 delimright = c; 1670 nest = 0; 1671 if (isspace(c)) 1672 error("delimiter cannot be whitespace"); 1673 } 1674 } 1675 else 1676 { 1677 if (blankrol) 1678 { 1679 error("heredoc rest of line should be blank"); 1680 blankrol = 0; 1681 continue; 1682 } 1683 if (nest == 1) 1684 { 1685 if (c == delimleft) 1686 nestcount++; 1687 else if (c == delimright) 1688 { 1689 nestcount--; 1690 if (nestcount == 0) 1691 goto Ldone; 1692 } 1693 } 1694 else if (c == delimright) 1695 goto Ldone; 1696 if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid) 1697 { 1698 Token tok; 1699 auto psave = p; 1700 p = s; 1701 scan(&tok); // read in possible heredoc identifier 1702 //printf("endid = '%s'\n", tok.ident.toChars()); 1703 if (tok.value == TOK.identifier && tok.ident is hereid) 1704 { 1705 /* should check that rest of line is blank 1706 */ 1707 goto Ldone; 1708 } 1709 p = psave; 1710 } 1711 stringbuffer.writeUTF8(c); 1712 startline = 0; 1713 } 1714 } 1715 Ldone: 1716 if (*p == '"') 1717 p++; 1718 else if (hereid) 1719 
error("delimited string must end in `%s\"`", hereid.toChars()); 1720 else if (isspace(delimright)) 1721 error("delimited string must end in `\"`"); 1722 else 1723 error(token.loc, "delimited string must end in `%c\"`", delimright); 1724 result.setString(stringbuffer); 1725 stringPostfix(result); 1726 } 1727 1728 /** 1729 Lex a token string. Some examples of token strings are: 1730 --- 1731 q{ foo(xxx) } // " foo(xxx) " 1732 q{foo$(LPAREN)} // "foo$(LPAREN)" 1733 q{{foo}"}"} // "{foo}"}"" 1734 --- 1735 It is assumed that `p` points to the opening curly-brace. 1736 Params: 1737 result = pointer to the token that accepts the result 1738 */ 1739 private void tokenStringConstant(Token* result) 1740 { 1741 result.value = TOK.string_; 1742 1743 uint nest = 1; 1744 const start = loc(); 1745 const pstart = ++p; 1746 inTokenStringConstant++; 1747 scope(exit) inTokenStringConstant--; 1748 while (1) 1749 { 1750 Token tok; 1751 scan(&tok); 1752 switch (tok.value) 1753 { 1754 case TOK.leftCurly: 1755 nest++; 1756 continue; 1757 case TOK.rightCurly: 1758 if (--nest == 0) 1759 { 1760 result.setString(pstart, p - 1 - pstart); 1761 stringPostfix(result); 1762 return; 1763 } 1764 continue; 1765 case TOK.endOfFile: 1766 error("unterminated token string constant starting at %s", start.toChars()); 1767 result.setString(); 1768 return; 1769 default: 1770 continue; 1771 } 1772 } 1773 } 1774 1775 /** 1776 Scan a quoted string while building the processed string value by 1777 handling escape sequences. The result is returned in the given `t` token. 1778 This function assumes that `p` currently points to the opening quote 1779 of the string. 
1780 Params: 1781 t = the token to set the resulting string to 1782 * References: 1783 * D https://dlang.org/spec/lex.html#double_quoted_strings 1784 * ImportC C11 6.4.5 1785 */ 1786 private void escapeStringConstant(Token* t) 1787 { 1788 t.value = TOK.string_; 1789 1790 const start = loc(); 1791 const tc = *p++; // opening quote 1792 stringbuffer.setsize(0); 1793 while (1) 1794 { 1795 dchar c = *p++; 1796 dchar c2; 1797 switch (c) 1798 { 1799 case '\\': 1800 switch (*p) 1801 { 1802 case '&': 1803 if (Ccompile) 1804 goto default; 1805 1806 c = escapeSequence(c2); 1807 stringbuffer.writeUTF8(c); 1808 if (c2 != dchar.init) 1809 stringbuffer.writeUTF8(c2); 1810 continue; 1811 case 'u': 1812 case 'U': 1813 c = escapeSequence(c2); 1814 stringbuffer.writeUTF8(c); 1815 continue; 1816 default: 1817 c = escapeSequence(c2); 1818 break; 1819 } 1820 break; 1821 case '\n': 1822 endOfLine(); 1823 if (Ccompile) 1824 goto Lunterminated; 1825 break; 1826 case '\r': 1827 if (*p == '\n') 1828 continue; // ignore 1829 c = '\n'; // treat EndOfLine as \n character 1830 endOfLine(); 1831 if (Ccompile) 1832 goto Lunterminated; 1833 break; 1834 case '\'': 1835 case '"': 1836 if (c != tc) 1837 goto default; 1838 t.setString(stringbuffer); 1839 if (!Ccompile) 1840 stringPostfix(t); 1841 return; 1842 case 0: 1843 case 0x1A: 1844 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token). 
1845 p--; 1846 Lunterminated: 1847 error("unterminated string constant starting at %s", start.toChars()); 1848 t.setString(); 1849 return; 1850 default: 1851 if (c & 0x80) 1852 { 1853 p--; 1854 c = decodeUTF(); 1855 if (c == LS || c == PS) 1856 { 1857 c = '\n'; 1858 endOfLine(); 1859 if (Ccompile) 1860 goto Lunterminated; 1861 } 1862 p++; 1863 stringbuffer.writeUTF8(c); 1864 continue; 1865 } 1866 break; 1867 } 1868 stringbuffer.writeByte(c); 1869 } 1870 } 1871 1872 /************************************** 1873 * Reference: 1874 * https://dlang.org/spec/lex.html#characterliteral 1875 */ 1876 private TOK charConstant(Token* t) 1877 { 1878 TOK tk = TOK.charLiteral; 1879 //printf("Lexer::charConstant\n"); 1880 p++; 1881 dchar c = *p++; 1882 dchar c2; 1883 switch (c) 1884 { 1885 case '\\': 1886 switch (*p) 1887 { 1888 case 'u': 1889 tk = TOK.wcharLiteral; 1890 goto default; 1891 case 'U': 1892 case '&': 1893 tk = TOK.dcharLiteral; 1894 goto default; 1895 default: 1896 t.unsvalue = escapeSequence(c2); 1897 if (c2 != c2.init) 1898 { 1899 error("html entity requires 2 code units, use a string instead of a character"); 1900 t.unsvalue = '?'; 1901 } 1902 break; 1903 } 1904 break; 1905 case '\n': 1906 L1: 1907 endOfLine(); 1908 goto case; 1909 case '\r': 1910 goto case '\''; 1911 case 0: 1912 case 0x1A: 1913 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token). 
1914 p--; 1915 goto case; 1916 case '\'': 1917 error("unterminated character constant"); 1918 t.unsvalue = '?'; 1919 return tk; 1920 default: 1921 if (c & 0x80) 1922 { 1923 p--; 1924 c = decodeUTF(); 1925 p++; 1926 if (c == LS || c == PS) 1927 goto L1; 1928 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE)) 1929 tk = TOK.wcharLiteral; 1930 else 1931 tk = TOK.dcharLiteral; 1932 } 1933 t.unsvalue = c; 1934 break; 1935 } 1936 if (*p != '\'') 1937 { 1938 while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' && 1939 *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}') 1940 { 1941 if (*p & 0x80) 1942 { 1943 const s = p; 1944 c = decodeUTF(); 1945 if (c == LS || c == PS) 1946 { 1947 p = s; 1948 break; 1949 } 1950 } 1951 p++; 1952 } 1953 1954 if (*p == '\'') 1955 { 1956 error("character constant has multiple characters"); 1957 p++; 1958 } 1959 else 1960 error("unterminated character constant"); 1961 t.unsvalue = '?'; 1962 return tk; 1963 } 1964 p++; 1965 return tk; 1966 } 1967 1968 /*************************************** 1969 * Lex C character constant. 1970 * Parser is on the opening quote. 1971 * Params: 1972 * t = token to fill in 1973 * prefix = one of `u`, `U` or 0. 1974 * Reference: 1975 * C11 6.4.4.4 1976 */ 1977 private void clexerCharConstant(ref Token t, char prefix) 1978 { 1979 escapeStringConstant(&t); 1980 const(char)[] str = t.ustring[0 .. 
t.len]; 1981 const n = str.length; 1982 const loc = t.loc; 1983 if (n == 0) 1984 { 1985 error(loc, "empty character constant"); 1986 t.value = TOK.semicolon; 1987 return; 1988 } 1989 1990 uint u; 1991 switch (prefix) 1992 { 1993 case 0: 1994 if (n == 1) // fast case 1995 { 1996 u = str[0]; 1997 } 1998 else if (n > 4) 1999 error(loc, "max number of chars in character literal is 4, had %d", 2000 cast(int)n); 2001 else 2002 { 2003 foreach (i, c; str) 2004 (cast(char*)&u)[n - 1 - i] = c; 2005 } 2006 break; 2007 2008 case 'u': 2009 dchar d1; 2010 size_t idx; 2011 auto msg = utf_decodeChar(str, idx, d1); 2012 dchar d2 = 0; 2013 if (idx < n && !msg) 2014 msg = utf_decodeChar(str, idx, d2); 2015 if (msg) 2016 error(loc, "%.*s", cast(int)msg.length, msg.ptr); 2017 else if (idx < n) 2018 error(loc, "max number of chars in 16 bit character literal is 2, had %d", 2019 cast(int)((n + 1) >> 1)); 2020 else if (d1 > 0x1_0000) 2021 error(loc, "%d does not fit in 16 bits", d1); 2022 else if (d2 > 0x1_0000) 2023 error(loc, "%d does not fit in 16 bits", d2); 2024 u = d1; 2025 if (d2) 2026 u = (d1 << 16) | d2; 2027 break; 2028 2029 case 'U': 2030 dchar d; 2031 size_t idx; 2032 auto msg = utf_decodeChar(str, idx, d); 2033 if (msg) 2034 error(loc, "%.*s", cast(int)msg.length, msg.ptr); 2035 else if (idx < n) 2036 error(loc, "max number of chars in 32 bit character literal is 1, had %d", 2037 cast(int)((n + 3) >> 2)); 2038 u = d; 2039 break; 2040 2041 default: 2042 assert(0); 2043 } 2044 t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal; 2045 t.unsvalue = u; 2046 } 2047 2048 /*************************************** 2049 * Get postfix of string literal. 2050 */ 2051 private void stringPostfix(Token* t) pure @nogc 2052 { 2053 switch (*p) 2054 { 2055 case 'c': 2056 case 'w': 2057 case 'd': 2058 t.postfix = *p; 2059 p++; 2060 break; 2061 default: 2062 t.postfix = 0; 2063 break; 2064 } 2065 } 2066 2067 /************************************** 2068 * Read in a number. 
2069 * If it's an integer, store it in tok.TKutok.Vlong. 2070 * integers can be decimal, octal or hex 2071 * Handle the suffixes U, UL, LU, L, etc. 2072 * If it's double, store it in tok.TKutok.Vdouble. 2073 * Returns: 2074 * TKnum 2075 * TKdouble,... 2076 */ 2077 private TOK number(Token* t) 2078 { 2079 int base = 10; 2080 const start = p; 2081 ulong n = 0; // unsigned >=64 bit integer type 2082 int d; 2083 bool err = false; 2084 bool overflow = false; 2085 bool anyBinaryDigitsNoSingleUS = false; 2086 bool anyHexDigitsNoSingleUS = false; 2087 char errorDigit = 0; 2088 dchar c = *p; 2089 if (c == '0') 2090 { 2091 ++p; 2092 c = *p; 2093 switch (c) 2094 { 2095 case '0': 2096 case '1': 2097 case '2': 2098 case '3': 2099 case '4': 2100 case '5': 2101 case '6': 2102 case '7': 2103 base = 8; 2104 break; 2105 2106 case '8': 2107 case '9': 2108 errorDigit = cast(char) c; 2109 base = 8; 2110 break; 2111 case 'x': 2112 case 'X': 2113 ++p; 2114 base = 16; 2115 break; 2116 case 'b': 2117 case 'B': 2118 ++p; 2119 base = 2; 2120 break; 2121 case '.': 2122 if (p[1] == '.') 2123 goto Ldone; // if ".." 2124 if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80) 2125 { 2126 if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L')) 2127 goto Lreal; // if `0.f` or `0.L` 2128 goto Ldone; // if ".identifier" or ".unicode" 2129 } 2130 goto Lreal; // '.' 
is part of current token 2131 case 'i': 2132 case 'f': 2133 case 'F': 2134 goto Lreal; 2135 case '_': 2136 if (Ccompile) 2137 error("embedded `_` not allowed"); 2138 ++p; 2139 base = 8; 2140 break; 2141 case 'L': 2142 if (p[1] == 'i') 2143 goto Lreal; 2144 break; 2145 default: 2146 break; 2147 } 2148 } 2149 while (1) 2150 { 2151 c = *p; 2152 switch (c) 2153 { 2154 case '0': 2155 case '1': 2156 case '2': 2157 case '3': 2158 case '4': 2159 case '5': 2160 case '6': 2161 case '7': 2162 case '8': 2163 case '9': 2164 ++p; 2165 d = c - '0'; 2166 break; 2167 case 'a': 2168 case 'b': 2169 case 'c': 2170 case 'd': 2171 case 'e': 2172 case 'f': 2173 case 'A': 2174 case 'B': 2175 case 'C': 2176 case 'D': 2177 case 'E': 2178 case 'F': 2179 ++p; 2180 if (base != 16) 2181 { 2182 if (c == 'e' || c == 'E' || c == 'f' || c == 'F') 2183 goto Lreal; 2184 } 2185 if (c >= 'a') 2186 d = c + 10 - 'a'; 2187 else 2188 d = c + 10 - 'A'; 2189 break; 2190 case 'L': 2191 if (p[1] == 'i') 2192 goto Lreal; 2193 goto Ldone; 2194 case '.': 2195 if (p[1] == '.') 2196 goto Ldone; // if ".." 
2197 if (base <= 10 && n > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)) 2198 { 2199 if (Ccompile && base == 10 && 2200 (p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L')) 2201 goto Lreal; // if `1.e6` or `1.f` or `1.L` 2202 goto Ldone; // if ".identifier" or ".unicode" 2203 } 2204 if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80)) 2205 goto Ldone; // if ".identifier" or ".unicode" 2206 if (base == 2) 2207 goto Ldone; // if ".identifier" or ".unicode" 2208 goto Lreal; // otherwise as part of a floating point literal 2209 2210 case 'i': 2211 if (Ccompile) 2212 goto Ldone; 2213 goto Lreal; 2214 2215 case 'p': 2216 case 'P': 2217 Lreal: 2218 p = start; 2219 return inreal(t); 2220 case '_': 2221 if (Ccompile) 2222 goto default; 2223 ++p; 2224 continue; 2225 default: 2226 goto Ldone; 2227 } 2228 // got a digit here, set any necessary flags, check for errors 2229 anyHexDigitsNoSingleUS = true; 2230 anyBinaryDigitsNoSingleUS = true; 2231 if (!errorDigit && d >= base) 2232 { 2233 errorDigit = cast(char) c; 2234 } 2235 // Avoid expensive overflow check if we aren't at risk of overflow 2236 if (n <= 0x0FFF_FFFF_FFFF_FFFFUL) 2237 n = n * base + d; 2238 else 2239 { 2240 import core.checkedint : mulu, addu; 2241 2242 n = mulu(n, base, overflow); 2243 n = addu(n, d, overflow); 2244 } 2245 } 2246 Ldone: 2247 if (errorDigit) 2248 { 2249 error(token.loc, "%s digit expected, not `%c`", base == 2 ? "binary".ptr : 2250 base == 8 ? 
"octal".ptr : 2251 "decimal".ptr, errorDigit); 2252 err = true; 2253 } 2254 if (overflow && !err) 2255 { 2256 error("integer overflow"); 2257 err = true; 2258 } 2259 if ((base == 2 && !anyBinaryDigitsNoSingleUS) || 2260 (base == 16 && !anyHexDigitsNoSingleUS)) 2261 error(token.loc, "`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start); 2262 2263 t.unsvalue = n; 2264 2265 if (Ccompile) 2266 return cnumber(base, n); 2267 2268 enum FLAGS : int 2269 { 2270 none = 0, 2271 decimal = 1, // decimal 2272 unsigned = 2, // u or U suffix 2273 long_ = 4, // L suffix 2274 } 2275 2276 FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none; 2277 // Parse trailing 'u', 'U', 'l' or 'L' in any combination 2278 const psuffix = p; 2279 while (1) 2280 { 2281 FLAGS f; 2282 switch (*p) 2283 { 2284 case 'U': 2285 case 'u': 2286 f = FLAGS.unsigned; 2287 goto L1; 2288 case 'l': 2289 f = FLAGS.long_; 2290 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead"); 2291 goto L1; 2292 case 'L': 2293 f = FLAGS.long_; 2294 L1: 2295 p++; 2296 if ((flags & f) && !err) 2297 { 2298 error("repeated integer suffix `%c`", p[-1]); 2299 err = true; 2300 } 2301 flags = cast(FLAGS)(flags | f); 2302 continue; 2303 default: 2304 break; 2305 } 2306 break; 2307 } 2308 if (base == 8 && n >= 8) 2309 { 2310 if (err) 2311 // can't translate invalid octal value, just show a generic message 2312 error("octal literals larger than 7 are no longer supported"); 2313 else 2314 error(token.loc, "octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead", 2315 n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix); 2316 } 2317 TOK result; 2318 switch (flags) 2319 { 2320 case FLAGS.none: 2321 /* Octal or Hexadecimal constant. 
2322 * First that fits: int, uint, long, ulong 2323 */ 2324 if (n & 0x8000000000000000L) 2325 result = TOK.uns64Literal; 2326 else if (n & 0xFFFFFFFF00000000L) 2327 result = TOK.int64Literal; 2328 else if (n & 0x80000000) 2329 result = TOK.uns32Literal; 2330 else 2331 result = TOK.int32Literal; 2332 break; 2333 case FLAGS.decimal: 2334 /* First that fits: int, long, long long 2335 */ 2336 if (n & 0x8000000000000000L) 2337 { 2338 result = TOK.uns64Literal; 2339 } 2340 else if (n & 0xFFFFFFFF80000000L) 2341 result = TOK.int64Literal; 2342 else 2343 result = TOK.int32Literal; 2344 break; 2345 case FLAGS.unsigned: 2346 case FLAGS.decimal | FLAGS.unsigned: 2347 /* First that fits: uint, ulong 2348 */ 2349 if (n & 0xFFFFFFFF00000000L) 2350 result = TOK.uns64Literal; 2351 else 2352 result = TOK.uns32Literal; 2353 break; 2354 case FLAGS.decimal | FLAGS.long_: 2355 if (n & 0x8000000000000000L) 2356 { 2357 if (!err) 2358 { 2359 error("signed integer overflow"); 2360 err = true; 2361 } 2362 result = TOK.uns64Literal; 2363 } 2364 else 2365 result = TOK.int64Literal; 2366 break; 2367 case FLAGS.long_: 2368 if (n & 0x8000000000000000L) 2369 result = TOK.uns64Literal; 2370 else 2371 result = TOK.int64Literal; 2372 break; 2373 case FLAGS.unsigned | FLAGS.long_: 2374 case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_: 2375 result = TOK.uns64Literal; 2376 break; 2377 default: 2378 debug 2379 { 2380 printf("%x\n", flags); 2381 } 2382 assert(0); 2383 } 2384 return result; 2385 } 2386 2387 /************************************** 2388 * Lex C integer-suffix 2389 * Params: 2390 * base = number base 2391 * n = raw integer value 2392 * Returns: 2393 * token value 2394 */ 2395 private TOK cnumber(int base, ulong n) 2396 { 2397 /* C11 6.4.4.1 2398 * Parse trailing suffixes: 2399 * u or U 2400 * l or L 2401 * ll or LL 2402 */ 2403 enum FLAGS : uint 2404 { 2405 octalhex = 1, // octal or hexadecimal 2406 decimal = 2, // decimal 2407 unsigned = 4, // u or U suffix 2408 long_ = 8, // l or L suffix 
2409 llong = 0x10, // ll or LL 2410 2411 // Microsoft extensions 2412 i8 = 0x20, 2413 i16 = 0x40, 2414 i32 = 0x80, 2415 i64 = 0x100, 2416 } 2417 FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex; 2418 bool err; 2419 Lsuffixes: 2420 while (1) 2421 { 2422 FLAGS f; 2423 const cs = *p; 2424 switch (cs) 2425 { 2426 case 'U': 2427 case 'u': 2428 f = FLAGS.unsigned; 2429 break; 2430 2431 case 'l': 2432 case 'L': 2433 f = FLAGS.long_; 2434 if (cs == p[1]) 2435 { 2436 f = FLAGS.long_ | FLAGS.llong; 2437 ++p; 2438 } 2439 break; 2440 2441 case 'i': 2442 case 'I': 2443 if (p[1] == '8') 2444 { 2445 f = FLAGS.i8; 2446 ++p; 2447 } 2448 else if (p[1] == '1' && p[2] == '6') 2449 { 2450 f = FLAGS.i16; 2451 p += 2; 2452 } 2453 else if (p[1] == '3' && p[2] == '2') 2454 { 2455 f = FLAGS.i32; 2456 p += 2; 2457 } 2458 else if (p[1] == '6' && p[2] == '4') 2459 { 2460 f = FLAGS.i64; 2461 p += 2; 2462 } 2463 else 2464 break Lsuffixes; 2465 if (p[1] >= '0' && p[1] <= '9' && !err) 2466 { 2467 error("invalid integer suffix"); 2468 err = true; 2469 } 2470 break; 2471 2472 default: 2473 break Lsuffixes; 2474 } 2475 ++p; 2476 if ((flags & f) && !err) 2477 { 2478 error("duplicate integer suffixes"); 2479 err = true; 2480 } 2481 flags = cast(FLAGS)(flags | f); 2482 } 2483 2484 TOK result = TOK.int32Literal; // default 2485 switch (flags) 2486 { 2487 /* Since D doesn't have a variable sized `long` or `unsigned long` type, 2488 * this code deviates from C by picking D int, uint, long, or ulong instead 2489 */ 2490 2491 case FLAGS.octalhex: 2492 /* Octal or Hexadecimal constant. 
2493 * First that fits: int, unsigned, long, unsigned long, 2494 * long long, unsigned long long 2495 */ 2496 if (n & 0x8000000000000000L) 2497 result = TOK.uns64Literal; // unsigned long 2498 else if (n & 0xFFFFFFFF00000000L) 2499 result = TOK.int64Literal; // long 2500 else if (n & 0x80000000) 2501 result = TOK.uns32Literal; 2502 else 2503 result = TOK.int32Literal; 2504 break; 2505 2506 case FLAGS.decimal: 2507 /* First that fits: int, long, long long 2508 */ 2509 if (n & 0x8000000000000000L) 2510 result = TOK.uns64Literal; // unsigned long 2511 else if (n & 0xFFFFFFFF80000000L) 2512 result = TOK.int64Literal; // long 2513 else 2514 result = TOK.int32Literal; 2515 break; 2516 2517 case FLAGS.octalhex | FLAGS.unsigned: 2518 case FLAGS.decimal | FLAGS.unsigned: 2519 /* First that fits: unsigned, unsigned long, unsigned long long 2520 */ 2521 if (n & 0xFFFFFFFF00000000L) 2522 result = TOK.uns64Literal; // unsigned long 2523 else 2524 result = TOK.uns32Literal; 2525 break; 2526 2527 case FLAGS.decimal | FLAGS.long_: 2528 /* First that fits: long, long long 2529 */ 2530 if (longsize == 4 || long_longsize == 4) 2531 { 2532 if (n & 0xFFFFFFFF_80000000L) 2533 result = TOK.int64Literal; 2534 else 2535 result = TOK.int32Literal; // long 2536 } 2537 else 2538 { 2539 result = TOK.int64Literal; // long 2540 } 2541 break; 2542 2543 case FLAGS.octalhex | FLAGS.long_: 2544 /* First that fits: long, unsigned long, long long, 2545 * unsigned long long 2546 */ 2547 if (longsize == 4 || long_longsize == 4) 2548 { 2549 if (n & 0x8000000000000000L) 2550 result = TOK.uns64Literal; 2551 else if (n & 0xFFFFFFFF00000000L) 2552 result = TOK.int64Literal; 2553 else if (n & 0x80000000) 2554 result = TOK.uns32Literal; // unsigned long 2555 else 2556 result = TOK.int32Literal; // long 2557 } 2558 else 2559 { 2560 if (n & 0x80000000_00000000L) 2561 result = TOK.uns64Literal; // unsigned long 2562 else 2563 result = TOK.int64Literal; // long 2564 } 2565 break; 2566 2567 case FLAGS.octalhex | 
FLAGS.unsigned | FLAGS.long_: 2568 case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_: 2569 /* First that fits: unsigned long, unsigned long long 2570 */ 2571 if (longsize == 4 || long_longsize == 4) 2572 { 2573 if (n & 0xFFFFFFFF00000000L) 2574 result = TOK.uns64Literal; 2575 else 2576 result = TOK.uns32Literal; // unsigned long 2577 } 2578 else 2579 { 2580 result = TOK.uns64Literal; // unsigned long 2581 } 2582 break; 2583 2584 case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong: 2585 /* First that fits: long long, unsigned long long 2586 */ 2587 if (n & 0x8000000000000000L) 2588 result = TOK.uns64Literal; 2589 else 2590 result = TOK.int64Literal; 2591 break; 2592 2593 case FLAGS.decimal | FLAGS.long_ | FLAGS.llong: 2594 /* long long 2595 */ 2596 result = TOK.int64Literal; 2597 break; 2598 2599 case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong: 2600 case FLAGS.decimal | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong: 2601 result = TOK.uns64Literal; 2602 break; 2603 2604 case FLAGS.octalhex | FLAGS.i8: 2605 case FLAGS.octalhex | FLAGS.i16: 2606 case FLAGS.octalhex | FLAGS.i32: 2607 case FLAGS.octalhex | FLAGS.unsigned | FLAGS.i8: 2608 case FLAGS.octalhex | FLAGS.unsigned | FLAGS.i16: 2609 case FLAGS.octalhex | FLAGS.unsigned | FLAGS.i32: 2610 case FLAGS.decimal | FLAGS.unsigned | FLAGS.i8: 2611 case FLAGS.decimal | FLAGS.unsigned | FLAGS.i16: 2612 case FLAGS.decimal | FLAGS.unsigned | FLAGS.i32: 2613 result = TOK.uns32Literal; 2614 break; 2615 2616 case FLAGS.decimal | FLAGS.i8: 2617 case FLAGS.decimal | FLAGS.i16: 2618 case FLAGS.decimal | FLAGS.i32: 2619 result = TOK.int32Literal; 2620 break; 2621 2622 case FLAGS.octalhex | FLAGS.i64: 2623 case FLAGS.octalhex | FLAGS.unsigned | FLAGS.i64: 2624 case FLAGS.decimal | FLAGS.unsigned | FLAGS.i64: 2625 result = TOK.uns64Literal; 2626 break; 2627 2628 case FLAGS.decimal | FLAGS.i64: 2629 result = TOK.int64Literal; 2630 break; 2631 2632 default: 2633 debug printf("%x\n",flags); 2634 assert(0); 2635 } 2636 return 
result; 2637 } 2638 2639 /************************************** 2640 * Read in characters, converting them to real. 2641 * Bugs: 2642 * Exponent overflow not detected. 2643 * Too much requested precision is not detected. 2644 */ 2645 private TOK inreal(Token* t) 2646 { 2647 //printf("Lexer::inreal()\n"); 2648 debug 2649 { 2650 assert(*p == '.' || isdigit(*p)); 2651 } 2652 bool isWellformedString = true; 2653 stringbuffer.setsize(0); 2654 auto pstart = p; 2655 bool hex = false; 2656 dchar c = *p++; 2657 // Leading '0x' 2658 if (c == '0') 2659 { 2660 c = *p++; 2661 if (c == 'x' || c == 'X') 2662 { 2663 hex = true; 2664 c = *p++; 2665 } 2666 } 2667 // Digits to left of '.' 2668 while (1) 2669 { 2670 if (c == '.') 2671 { 2672 c = *p++; 2673 break; 2674 } 2675 if (isdigit(c) || (hex && isxdigit(c)) || c == '_') 2676 { 2677 c = *p++; 2678 continue; 2679 } 2680 break; 2681 } 2682 // Digits to right of '.' 2683 while (1) 2684 { 2685 if (isdigit(c) || (hex && isxdigit(c)) || c == '_') 2686 { 2687 c = *p++; 2688 continue; 2689 } 2690 break; 2691 } 2692 if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P'))) 2693 { 2694 c = *p++; 2695 if (c == '-' || c == '+') 2696 { 2697 c = *p++; 2698 } 2699 bool anyexp = false; 2700 while (1) 2701 { 2702 if (isdigit(c)) 2703 { 2704 anyexp = true; 2705 c = *p++; 2706 continue; 2707 } 2708 if (c == '_') 2709 { 2710 if (Ccompile) 2711 error("embedded `_` in numeric literals not allowed"); 2712 c = *p++; 2713 continue; 2714 } 2715 if (!anyexp) 2716 { 2717 error("missing exponent"); 2718 isWellformedString = false; 2719 } 2720 break; 2721 } 2722 } 2723 else if (hex) 2724 { 2725 error("exponent required for hex float"); 2726 isWellformedString = false; 2727 } 2728 --p; 2729 while (pstart < p) 2730 { 2731 if (*pstart != '_') 2732 stringbuffer.writeByte(*pstart); 2733 ++pstart; 2734 } 2735 stringbuffer.writeByte(0); 2736 auto sbufptr = cast(const(char)*)stringbuffer[].ptr; 2737 TOK result; 2738 bool isOutOfRange = false; 2739 t.floatvalue = 
(isWellformedString ? CTFloat.parse(sbufptr, isOutOfRange) : CTFloat.zero); 2740 2741 bool imaginary = false; 2742 if (*p == 'i' && Ccompile) 2743 { 2744 ++p; 2745 imaginary = true; 2746 } 2747 2748 switch (*p) 2749 { 2750 case 'F': 2751 case 'f': 2752 if (isWellformedString && !isOutOfRange) 2753 isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr); 2754 result = TOK.float32Literal; 2755 p++; 2756 break; 2757 default: 2758 if (isWellformedString && !isOutOfRange) 2759 isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr); 2760 result = TOK.float64Literal; 2761 break; 2762 case 'l': 2763 if (!Ccompile) 2764 error("use 'L' suffix instead of 'l'"); 2765 goto case 'L'; 2766 case 'L': 2767 ++p; 2768 if (Ccompile && long_doublesize == 8) 2769 goto default; 2770 result = TOK.float80Literal; 2771 break; 2772 } 2773 2774 if ((*p == 'i' || *p == 'I') && !Ccompile) 2775 { 2776 if (*p == 'I') 2777 error("use 'i' suffix instead of 'I'"); 2778 p++; 2779 imaginary = true; 2780 } 2781 2782 if (imaginary) 2783 { 2784 switch (result) 2785 { 2786 case TOK.float32Literal: 2787 result = TOK.imaginary32Literal; 2788 break; 2789 case TOK.float64Literal: 2790 result = TOK.imaginary64Literal; 2791 break; 2792 case TOK.float80Literal: 2793 result = TOK.imaginary80Literal; 2794 break; 2795 default: 2796 break; 2797 } 2798 } 2799 const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal); 2800 if (isOutOfRange && !isLong && (!Ccompile || hex)) 2801 { 2802 /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex 2803 */ 2804 const char* suffix = result == TOK.float32Literal ? "f" : result == TOK.float80Literal ? "L" : ""; 2805 const char* type = [TOK.float32Literal: "`float`".ptr, 2806 TOK.float64Literal: "`double`".ptr, 2807 TOK.float80Literal: "`real` for the current target".ptr][result]; 2808 error(scanloc, "number `%s%s` is not representable as a %s", sbufptr, suffix, type); 2809 const char* extra = result == TOK.float64Literal ? 
"`real` literals can be written using the `L` suffix. " : ""; 2810 eSink.errorSupplemental(scanloc, "%shttps://dlang.org/spec/lex.html#floatliteral", extra); 2811 } 2812 debug 2813 { 2814 switch (result) 2815 { 2816 case TOK.float32Literal: 2817 case TOK.float64Literal: 2818 case TOK.float80Literal: 2819 case TOK.imaginary32Literal: 2820 case TOK.imaginary64Literal: 2821 case TOK.imaginary80Literal: 2822 break; 2823 default: 2824 assert(0); 2825 } 2826 } 2827 return result; 2828 } 2829 2830 final Loc loc() @nogc 2831 { 2832 scanloc.charnum = cast(ushort)(1 + p - line); 2833 version (LocOffset) 2834 scanloc.fileOffset = cast(uint)(p - base); 2835 return scanloc; 2836 } 2837 2838 void error(T...)(const(char)* format, T args) 2839 { 2840 eSink.error(token.loc, format, args); 2841 } 2842 2843 void error(T...)(const ref Loc loc, const(char)* format, T args) 2844 { 2845 eSink.error(loc, format, args); 2846 } 2847 2848 void deprecation(T...)(const ref Loc loc, const(char)* format, T args) 2849 { 2850 eSink.deprecation(loc, format, args); 2851 } 2852 2853 void deprecation(T...)(const(char)* format, T args) 2854 { 2855 eSink.deprecation(token.loc, format, args); 2856 } 2857 2858 void deprecationSupplemental(T...)(const(char)* format, T args) 2859 { 2860 eSink.deprecationSupplemental(token.loc, format, args); 2861 } 2862 2863 /*************************************** 2864 * Parse special token sequence: 2865 * Returns: 2866 * true if the special token sequence was handled 2867 * References: 2868 * https://dlang.org/spec/lex.html#special-token-sequence 2869 */ 2870 bool parseSpecialTokenSequence() 2871 { 2872 Token n; 2873 scan(&n); 2874 if (n.value == TOK.identifier) 2875 { 2876 if (n.ident == Id.line) 2877 { 2878 poundLine(n, false); 2879 return true; 2880 } 2881 else 2882 { 2883 const locx = loc(); 2884 // @@@DEPRECATED_2.103@@@ 2885 // Turn into an error in 2.113 2886 if (inTokenStringConstant) 2887 deprecation(locx, "token string requires valid D tokens, not `#%s`", 
n.ident.toChars()); 2888 else 2889 error(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars()); 2890 } 2891 } 2892 else if (n.value == TOK.if_) 2893 { 2894 const locx = loc(); 2895 if (inTokenStringConstant) 2896 error(locx, "token string requires valid D tokens, not `#if`"); 2897 else 2898 error(locx, "C preprocessor directive `#if` is not supported, use `version` or `static if`"); 2899 } 2900 return false; 2901 } 2902 2903 /********************************************* 2904 * Parse line/file preprocessor directive: 2905 * #line linnum [filespec] 2906 * Allow __LINE__ for linnum, and __FILE__ for filespec. 2907 * Accept linemarker format: 2908 * # linnum [filespec] {flags} 2909 * There can be zero or more flags, which are one of the digits 1..4, and 2910 * must be in ascending order. The flags are ignored. 2911 * Params: 2912 * tok = token we're on, which is linnum of linemarker 2913 * linemarker = true if line marker format and lexer is on linnum 2914 * References: 2915 * linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html 2916 */ 2917 final void poundLine(ref Token tok, bool linemarker) 2918 { 2919 auto linnum = this.scanloc.linnum; 2920 const(char)* filespec = null; 2921 bool flags; 2922 2923 if (!linemarker) 2924 scan(&tok); 2925 if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal) 2926 { 2927 const lin = cast(int)(tok.unsvalue); 2928 if (lin != tok.unsvalue) 2929 { 2930 error(tok.loc, "line number `%lld` out of range", cast(ulong)tok.unsvalue); 2931 skipToNextLine(); 2932 return; 2933 } 2934 else 2935 linnum = lin; 2936 } 2937 else if (tok.value == TOK.line) // #line __LINE__ 2938 { 2939 } 2940 else 2941 { 2942 error(tok.loc, "positive integer argument expected following `#line`"); 2943 if (tok.value != TOK.endOfLine) 2944 skipToNextLine(); 2945 return; 2946 } 2947 while (1) 2948 { 2949 scan(&tok); 2950 switch (tok.value) 2951 { 2952 case TOK.endOfFile: 2953 case TOK.endOfLine: 2954 if 
(!inTokenStringConstant) 2955 { 2956 this.scanloc.linnum = linnum; 2957 if (filespec) 2958 this.scanloc.filename = filespec; 2959 } 2960 return; 2961 case TOK.file: 2962 if (filespec || flags) 2963 goto Lerr; 2964 filespec = mem.xstrdup(scanloc.filename); 2965 continue; 2966 case TOK.string_: 2967 if (filespec || flags) 2968 goto Lerr; 2969 if (tok.ptr[0] != '"' || tok.postfix != 0) 2970 goto Lerr; 2971 filespec = tok.ustring; 2972 continue; 2973 case TOK.int32Literal: 2974 if (!filespec) 2975 goto Lerr; 2976 if (linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4) 2977 { 2978 flags = true; // linemarker flags seen 2979 continue; 2980 } 2981 goto Lerr; 2982 default: 2983 goto Lerr; 2984 } 2985 } 2986 Lerr: 2987 if (filespec is null) 2988 error(tok.loc, "invalid filename for `#line` directive"); 2989 else if (linemarker) 2990 error(tok.loc, "invalid flag for line marker directive"); 2991 else if (!Ccompile) 2992 error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars()); 2993 if (tok.value != TOK.endOfLine) 2994 skipToNextLine(); 2995 } 2996 2997 /*************************************** 2998 * Scan forward to start of next line. 2999 * Params: 3000 * defines = send characters to `defines` 3001 */ 3002 final void skipToNextLine(OutBuffer* defines = null) 3003 { 3004 while (1) 3005 { 3006 switch (*p) 3007 { 3008 case 0: 3009 case 0x1A: 3010 return; // do not advance p 3011 3012 case '\n': 3013 ++p; 3014 break; 3015 3016 case '\r': 3017 ++p; 3018 if (p[0] == '\n') 3019 ++p; 3020 break; 3021 3022 default: 3023 if (defines) 3024 defines.writeByte(*p); // don't care about Unicode line endings for C 3025 else if (*p & 0x80) 3026 { 3027 const u = decodeUTF(); 3028 if (u == PS || u == LS) 3029 { 3030 ++p; 3031 break; 3032 } 3033 } 3034 ++p; 3035 continue; 3036 } 3037 break; 3038 } 3039 endOfLine(); 3040 tokenizeNewlines = false; 3041 } 3042 3043 /******************************************** 3044 * Decode UTF character. 
* Any problem found while decoding is reported as an error at `token.loc`.
     * Returns: the decoded character; `p` is advanced to the last
     * character of the UTF sequence.
     */
    private uint decodeUTF()
    {
        string diagnostic;
        const decoded = decodeUTFpure(diagnostic);

        if (diagnostic)
            error(token.loc, "%.*s", cast(int)diagnostic.length, diagnostic.ptr);
        return decoded;
    }

    /********************************************
     * Variant of the decoder above that does not report anything itself:
     * a possible error message is handed back through `msg`.
     * Bidirectional control characters are rejected this way as well.
     */
    private pure uint decodeUTFpure(out string msg)
    {
        const start = p;
        assert(*start & 0x80);
        // Hand the decoder at most 4 bytes, fewer if a 0 byte comes first
        size_t avail = 1;
        while (avail < 4 && start[avail])
            ++avail;
        size_t consumed = 0;
        dchar decoded;
        msg = utf_decodeChar(start[0 .. avail], consumed, decoded);
        // stop on the final byte of the sequence rather than behind it
        p += consumed - 1;
        if (!msg && isBidiControl(decoded))
            msg = "Bidirectional control characters are disallowed for security reasons.";
        return decoded;
    }

    /***************************************************
     * Parse doc comment embedded between t.ptr and p.
     * Remove trailing blanks and tabs from lines.
     * Replace all newlines with \n.
     * Remove leading comment character from each line.
     * Decide if it's a lineComment or a blockComment.
     * Append to previous one for this token.
     *
     * If newParagraph is true, an extra newline will be
     * added between adjoining doc comments.
*/
    private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
    {
        // The third character tells the comment flavour apart: '/', '*' or '+'
        const marker = t.ptr[2];
        // The text starts behind the three opening characters
        const(char)* cur = t.ptr + 3;
        const(char)* stop = p;
        // For block comments leave out the last two characters
        // (presumably the closing `*/` or `+/`)
        if (marker == '*' || marker == '+')
            stop -= 2;
        // Step over an opening row of marker characters
        while (cur < stop && *cur == marker)
            ++cur;

        bool atLineStart = false;
        if (marker == '/')
        {
            // Line comment: drop blanks and tabs in front of the text
            while (cur < stop && (*cur == ' ' || *cur == '\t'))
                ++cur;
        }
        else if (cur < stop)
        {
            // Block comment: swallow one line break right after the opening row
            if (*cur == '\r')
            {
                ++cur;
                if (cur < stop && *cur == '\n')
                    ++cur;
                atLineStart = true;
            }
            else if (*cur == '\n')
            {
                ++cur;
                atLineStart = true;
            }
        }
        // Block comments also lose a closing row of marker characters
        if (marker != '/')
        {
            while (cur < stop && stop[-1] == marker)
                --stop;
        }

        // What remains is [cur .. stop]; its canonical form is built in `canon`
        OutBuffer canon;

        void stripTrailingBlanks()
        {
            const written = canon[];
            size_t keep = written.length;
            while (keep > 0 && (written[keep - 1] == ' ' || written[keep - 1] == '\t'))
                --keep;
            canon.setsize(keep);
        }

        for (; cur < stop; cur++)
        {
            char ch = *cur;
            switch (ch)
            {
            case '*':
            case '+':
                if (atLineStart && ch == marker)
                {
                    // Leading comment character of a line: drop it together
                    // with the blanks written in front of it
                    atLineStart = false;
                    stripTrailingBlanks();
                    continue;
                }
                break;
            case ' ':
            case '\t':
                break;
            case '\r':
                if (cur[1] == '\n')
                    continue; // the \n that follows produces the line break
                goto Lnewline;
            default:
                // 0xE2 0x80 0xA8 and 0xE2 0x80 0xA9 are LS and PS in UTF-8
                if (ch == 0xE2)
                {
                    if (cur[1] == 0x80 && (cur[2] == 0xA8 || cur[2] == 0xA9))
                    {
                        cur += 2;
                        goto Lnewline;
                    }
                }
                atLineStart = false;
                break;
            Lnewline:
                ch = '\n'; // every kind of line break is stored as \n
                goto case;
            case '\n':
                atLineStart = true;
                // no blanks or tabs are kept at the end of a line
                stripTrailingBlanks();
                break;
            }
            canon.writeByte(ch);
        }
        // The final line may lack a line break, so trim it separately
        stripTrailingBlanks();

        // Make sure the text is terminated by a newline
        const text = canon[];
        if (text.length == 0 || text[$ - 1] != '\n')
            canon.writeByte('\n');

        // `lineComment` together with `anyToken` selects the token's
        // line-comment slot; otherwise the block-comment slot is used
        auto slot = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
        if (*slot)
        {
            // A doc comment is attached already: merge the new text into it
            auto merged = combineComments(*slot, canon[], newParagraph);
            *slot = merged ? merged[0 .. strlen(merged)] : null;
        }
        else
            *slot = canon.extractSlice(true);
    }

    /********************************************
     * Combine two document comments into one,
     * separated by an extra newline if newParagraph is true.
*/
    static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
    {
        //debug printf("Lexer::combineComments('%*.s', '%*.s', '%i')\n", cast(int) c1.length, c1.ptr, cast(int) c2.length, c2.ptr, newParagraph);
        const(int) newParagraphSize = newParagraph ? 1 : 0; // Size of the combining '\n'
        // If either comment is null the other one's pointer is returned as is:
        // nothing is copied and no terminating 0 is added in that case.
        if (!c1)
            return c2.ptr;
        if (!c2)
            return c1.ptr;

        // c1 must end in a newline before c2 is appended
        int insertNewLine = 0;
        if (c1.length && c1[$ - 1] != '\n')
            insertNewLine = 1;
        const retSize = c1.length + insertNewLine + newParagraphSize + c2.length;
        auto p = cast(char*)mem.xmalloc_noscan(retSize + 1); // + 1 for the terminating 0
        p[0 .. c1.length] = c1[];
        if (insertNewLine)
            p[c1.length] = '\n';
        if (newParagraph)
            p[c1.length + insertNewLine] = '\n';
        p[retSize - c2.length .. retSize] = c2[];
        p[retSize] = 0;
        return p;
    }

    /**************************
     * Advance `scanloc` to the next line and record `p` as the start of it.
     * `p` should be at start of next line
     */
    private void endOfLine() @nogc @safe
    {
        scanloc.linnum = scanloc.linnum + 1;
        line = p;
    }
}


/******************************* Private *****************************************/

private:

private enum LS = 0x2028;       // UTF line separator
private enum PS = 0x2029;       // UTF paragraph separator

/********************************************
 * Do our own char maps
 *
 * `cmtable[c]` is a set of `CM*` flags for the character code `c`.
 * The table is produced by a function literal that is called in place.
 */
private static immutable cmtable = ()
{
    ubyte[256] table;
    foreach (const c; 0 .. table.length)
    {
        if ('0' <= c && c <= '7')
            table[c] |= CMoctal;
        if (c_isxdigit(c))
            table[c] |= CMhex;
        if (c_isalnum(c) || c == '_')
            table[c] |= CMidchar;

        switch (c)
        {
            case 'x': case 'X':
            case 'b': case 'B':
                table[c] |= CMzerosecond;
                break;

            case '0': .. case '9':
            case 'e': case 'E':
            case 'f': case 'F':
            case 'l': case 'L':
            case 'p': case 'P':
            case 'u': case 'U':
            case 'i':
            case '.':
            case '_':
                table[c] |= CMzerosecond | CMdigitsecond;
                break;

            default:
                break;
        }

        // CMsinglechar: every code below 0x80 except the ones listed here
        switch (c)
        {
            case '\\':
            case '\n':
            case '\r':
            case 0:
            case 0x1A:
            case '\'':
                break;
            default:
                if (!(c & 0x80))
                    table[c] |= CMsinglechar;
                break;
        }
    }
    return table;
}();

// Flag bits stored in cmtable.
// NOTE(review): the users of CMzerosecond, CMdigitsecond and CMsinglechar are
// outside this part of the file; judging by the names they classify the
// character following a leading `0` / a leading digit of a number, and
// characters that can form a one-character literal - confirm at the call sites.
private
{
    enum CMoctal  = 0x1;        // '0' .. '7'
    enum CMhex    = 0x2;        // c_isxdigit()
    enum CMidchar = 0x4;        // c_isalnum() or '_'
    enum CMzerosecond = 0x8;
    enum CMdigitsecond = 0x10;
    enum CMsinglechar = 0x20;
}

/// Returns: true if `c` has the CMoctal flag, i.e. is one of '0' .. '7'
private bool isoctal(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMoctal) != 0;
}

/// Returns: true if `c` has the CMhex flag, i.e. is a hexadecimal digit
private bool ishex(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMhex) != 0;
}

/// Returns: true if `c` has the CMidchar flag, i.e. is an ASCII letter, a digit or '_'
private bool isidchar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMidchar) != 0;
}

/// Returns: true if `c` has the CMzerosecond flag
private bool isZeroSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMzerosecond) != 0;
}

/// Returns: true if `c` has the CMdigitsecond flag
private bool isDigitSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMdigitsecond) != 0;
}

/// Returns: true if `c` has the CMsinglechar flag
private bool issinglechar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMsinglechar) != 0;
}

/// Returns: true if `c` is one of 0-9, a-f, A-F
private bool c_isxdigit(const int c) pure @nogc @safe
{
    return (( c >= '0' && c <= '9') ||
            ( c >= 'a' && c <= 'f') ||
            ( c >= 'A' && c <= 'F'));
}

/// Returns: true if `c` is one of 0-9, a-z, A-Z
private bool c_isalnum(const int c) pure @nogc @safe
{
    return (( c >= '0' && c <= '9') ||
            ( c >= 'a' && c <= 'z') ||
            ( c >= 'A' && c <= 'Z'));
}

/******************************* Unittest *****************************************/

// Escape sequences that must decode to the expected value and consume the
// complete input.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    ErrorSink errorSink = new ErrorSinkStderr;

    void test(T)(string sequence, T expected, bool Ccompile = false)
    {
        auto p = cast(const(char)*)sequence.ptr;
        dchar c2;
        Lexer lexer = new Lexer(errorSink);
        assert(expected == lexer.escapeSequence(Loc.initial, p, Ccompile, c2));
        assert(p == sequence.ptr + sequence.length);
    }

    test(`'`, '\'');
    test(`"`, '"');
    test(`?`, '?');
    test(`\`, '\\');
    test(`0`, '\0');
    test(`a`, '\a');
    test(`b`, '\b');
    test(`f`, '\f');
    test(`n`, '\n');
    test(`r`, '\r');
    test(`t`, '\t');
    test(`v`, '\v');

    test(`x00`, 0x00);
    test(`xff`, 0xff);
    test(`xFF`, 0xff);
    test(`xa7`, 0xa7);
    test(`x3c`, 0x3c);
    test(`xe2`, 0xe2);

    test(`1`, '\1');
    test(`42`, '\42');
    test(`357`, '\357');

    test(`u1234`, '\u1234');
    test(`uf0e4`, '\uf0e4');

    test(`U0001f603`, '\U0001f603');

    // named character entities
    test(`&quot;`, '"');
    test(`&lt;`, '<');
    test(`&gt;`, '>');
}

// Malformed escape sequences: checks the error message, the value returned
// and the number of characters consumed.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    // Error sink that compares every reported error with the expected text
    static class ErrorSinkTest : ErrorSinkNull
    {
    nothrow:
    extern (C++):
    override:

        import core.stdc.stdio;
        import core.stdc.stdarg;

        string expected;
        bool gotError;

        void error(const ref Loc loc, const(char)* format, ...)
        {
            gotError = true;
            char[100] buffer = void;
            va_list ap;
            va_start(ap, format);
            auto actual = buffer[0 .. vsnprintf(buffer.ptr, buffer.length, format, ap)];
            va_end(ap);
            assert(expected == actual);
        }
    }

    ErrorSinkTest errorSink = new ErrorSinkTest;

    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
    {
        errorSink.expected = expectedError;
        errorSink.gotError = false;
        auto p = cast(const(char)*)sequence.ptr;
        Lexer lexer = new Lexer(errorSink);
        dchar c2;
        auto actualReturnValue = lexer.escapeSequence(Loc.initial, p, Ccompile, c2);
        assert(errorSink.gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
    }

    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);
    test("&quot;", `undefined escape sequence \&`, '&', 1, true);

    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);
}

// Lexing a single keyword, then reading past the end of the input repeatedly.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);
    /* Not much here, just trying things out.
     */
    string text = "int"; // We rely on the implicit null-terminator
    ErrorSink errorSink = new ErrorSinkStderr;
    scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, false, false, errorSink, null);
    TOK tok;
    tok = lex1.nextToken();
    //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.int32);
    assert(tok == TOK.int32);
    tok = lex1.nextToken();
    assert(tok == TOK.endOfFile);
    tok = lex1.nextToken();
    assert(tok == TOK.endOfFile);
    tok = lex1.nextToken();
    assert(tok == TOK.endOfFile);
}

// Malformed input must still lead to TOK.endOfFile, and must keep doing so.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    // We don't want to see Lexer error output during these tests.
    ErrorSink errorSink = new ErrorSinkNull;

    // Test malformed input: even malformed input should end in a TOK.endOfFile.
    static immutable char[][] testcases =
    [   // Testcase must end with 0 or 0x1A.
        [0], // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        [0xFF, 0xFF, 0],
        ['x', '"', 0x1A],
    ];

    foreach (testcase; testcases)
    {
        scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, false, false, errorSink, null);
        TOK tok = lex2.nextToken();
        // The iteration bound keeps a lexer that never reports
        // TOK.endOfFile from looping forever
        size_t iterations = 1;
        while ((tok != TOK.endOfFile) && (iterations++ < testcase.length))
        {
            tok = lex2.nextToken();
        }
        assert(tok == TOK.endOfFile);
        tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);
    }
}