1 /** 2 * Implements the lexical analyzer, which converts source code into lexical tokens. 3 * 4 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical) 5 * 6 * Copyright: Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved 7 * Authors: $(LINK2 https://www.digitalmars.com, Walter Bright) 8 * License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 9 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d) 10 * Documentation: https://dlang.org/phobos/dmd_lexer.html 11 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d 12 */ 13 14 module dmd.lexer; 15 16 import core.stdc.ctype; 17 import core.stdc.stdio; 18 import core.stdc.string; 19 20 import dmd.entity; 21 import dmd.errorsink; 22 import dmd.id; 23 import dmd.identifier; 24 import dmd.location; 25 import dmd.root.array; 26 import dmd.root.ctfloat; 27 import dmd.common.outbuffer; 28 import dmd.root.port; 29 import dmd.root.rmem; 30 import dmd.root.utf; 31 import dmd.tokens; 32 33 nothrow: 34 35 version (DMDLIB) 36 { 37 version = LocOffset; 38 } 39 40 /*********************************************************** 41 * Values to use for various magic identifiers 42 */ 43 struct CompileEnv 44 { 45 uint versionNumber; /// __VERSION__ 46 const(char)[] date; /// __DATE__ 47 const(char)[] time; /// __TIME__ 48 const(char)[] vendor; /// __VENDOR__ 49 const(char)[] timestamp; /// __TIMESTAMP__ 50 51 bool previewIn; /// `in` means `[ref] scope const`, accepts rvalues 52 bool ddocOutput; /// collect embedded documentation comments 53 bool shortenedMethods = true; /// allow => in normal function declarations 54 } 55 56 /*********************************************************** 57 */ 58 class Lexer 59 { 60 private __gshared OutBuffer stringbuffer; 61 62 Loc scanloc; // for error messages 63 Loc prevloc; // location of token before current 64 65 const(char)* p; // current character 66 67 Token token; 68 69 // For ImportC 70 bool Ccompile; /// true if compiling ImportC 71 72 // The following are valid only if (Ccompile == true) 73 ubyte boolsize; /// size of a C _Bool, default 1 74 ubyte shortsize; /// size of a C short, default 2 75 ubyte intsize; /// size of a C int, default 4 76 ubyte longsize; /// size of C long, 4 or 8 77 ubyte long_longsize; /// size of a C long long, default 8 78 ubyte long_doublesize; /// size of C long double, 8 or D real.sizeof 79 ubyte wchar_tsize; /// size of C wchar_t, 2 or 4 80 81 ErrorSink eSink; /// send error messages through this interface 82 CompileEnv compileEnv; /// environment 83 84 private 85 { 86 const(char)* base; // pointer to start of buffer 87 const(char)* end; // pointer to last element of buffer 88 const(char)* line; // start of current line 89 90 bool doDocComment; // collect doc comment information 91 bool anyToken; // seen at least one token 92 bool commentToken; // comments are TOK.comment's 93 bool tokenizeNewlines; // newlines are turned into TOK.endOfLine's 94 95 bool whitespaceToken; // tokenize whitespaces (only for DMDLIB) 96 97 int inTokenStringConstant; // can be larger than 1 when in nested q{} strings 98 int lastDocLine; // last line of previous doc comment 99 100 Token* tokenFreelist; 101 } 102 103 nothrow: 104 105 /********************* 106 * Creates a Lexer for the source code base[begoffset..endoffset+1]. 107 * The last character, base[endoffset], must be null (0) or EOF (0x1A). 108 * 109 * Params: 110 * filename = used for error messages 111 * base = source code, must be terminated by a null (0) or EOF (0x1A) character 112 * begoffset = starting offset into base[] 113 * endoffset = the last offset to read into base[] 114 * doDocComment = handle documentation comments 115 * commentToken = comments become TOK.comment's 116 * errorSink = where error messages go, must not be null 117 * compileEnv = version, vendor, date, time, etc. 118 */ 119 this(const(char)* filename, const(char)* base, size_t begoffset, 120 size_t endoffset, bool doDocComment, bool commentToken, 121 ErrorSink errorSink, 122 const CompileEnv* compileEnv) pure scope 123 { 124 scanloc = Loc(filename, 1, 1); 125 // debug printf("Lexer::Lexer(%p)\n", base); 126 // debug printf("lexer.filename = %s\n", filename); 127 token = Token.init; 128 this.base = base; 129 this.end = base + endoffset; 130 p = base + begoffset; 131 line = p; 132 this.doDocComment = doDocComment; 133 this.commentToken = commentToken; 134 this.tokenizeNewlines = false; 135 this.inTokenStringConstant = 0; 136 this.lastDocLine = 0; 137 this.eSink = errorSink; 138 assert(errorSink); 139 if (compileEnv) 140 this.compileEnv = *compileEnv; 141 else 142 { 143 this.compileEnv.versionNumber = 1; 144 this.compileEnv.vendor = "DLF"; 145 } 146 //initKeywords(); 147 /* If first line starts with '#!', ignore the line 148 */ 149 if (p && p[0] == '#' && p[1] == '!') 150 { 151 p += 2; 152 for (;;p++) 153 { 154 char c = *p; 155 switch (c) 156 { 157 case '\n': 158 p++; 159 goto case; 160 case 0: 161 case 0x1A: 162 break; 163 164 default: 165 // Note: We do allow malformed UTF-8 on shebang line. 166 // It could have a meaning if the native system 167 // encoding is not Unicode. See test compilable/test13512.d 168 // for example encoded in KOI-8. 169 // We also allow bidirectional control characters. 170 // We do not execute the shebang line, so it can't be used 171 // to conceal code. It is up to the shell to sanitize it. 172 continue; 173 } 174 break; 175 } 176 endOfLine(); 177 } 178 } 179 180 /*********************** 181 * Alternative entry point for DMDLIB, adds `whitespaceToken` 182 */ 183 this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset, 184 bool doDocComment, bool commentToken, bool whitespaceToken, 185 ErrorSink errorSink, const CompileEnv* compileEnv = null 186 ) 187 { 188 this(filename, base, begoffset, endoffset, doDocComment, commentToken, errorSink, compileEnv); 189 this.whitespaceToken = whitespaceToken; 190 } 191 192 /****************** 193 * Used for unittests for a mock Lexer 194 */ 195 this(ErrorSink errorSink) scope { assert(errorSink); this.eSink = errorSink; } 196 197 /************************************** 198 * Reset lexer to lex #define's 199 */ 200 final void resetDefineLines(const(char)[] slice) 201 { 202 base = slice.ptr; 203 end = base + slice.length; 204 assert(*end == 0); 205 p = base; 206 line = p; 207 tokenizeNewlines = true; 208 inTokenStringConstant = 0; 209 lastDocLine = 0; 210 scanloc = Loc("#defines", 1, 1); 211 } 212 213 /********************************** 214 * Set up for next #define line. 215 * p should be at start of next line. 216 */ 217 final void nextDefineLine() 218 { 219 tokenizeNewlines = true; 220 } 221 222 /*************** 223 * Range interface 224 */ 225 226 final bool empty() const pure @property @nogc @safe 227 { 228 return front() == TOK.endOfFile; 229 } 230 231 final TOK front() const pure @property @nogc @safe 232 { 233 return token.value; 234 } 235 236 final void popFront() 237 { 238 nextToken(); 239 } 240 241 /// Returns: a newly allocated `Token`. 242 Token* allocateToken() pure nothrow @safe 243 { 244 if (tokenFreelist) 245 { 246 Token* t = tokenFreelist; 247 tokenFreelist = t.next; 248 t.next = null; 249 return t; 250 } 251 return new Token(); 252 } 253 254 /// Frees the given token by returning it to the freelist. 255 private void releaseToken(Token* token) pure nothrow @nogc @safe 256 { 257 if (mem.isGCEnabled) 258 *token = Token.init; 259 token.next = tokenFreelist; 260 tokenFreelist = token; 261 } 262 263 final TOK nextToken() 264 { 265 prevloc = token.loc; 266 if (token.next) 267 { 268 Token* t = token.next; 269 memcpy(&token, t, Token.sizeof); 270 releaseToken(t); 271 } 272 else 273 { 274 scan(&token); 275 } 276 //printf(token.toChars()); 277 return token.value; 278 } 279 280 /*********************** 281 * Look ahead at next token's value. 282 */ 283 final TOK peekNext() 284 { 285 return peek(&token).value; 286 } 287 288 /*********************** 289 * Look 2 tokens ahead at value. 290 */ 291 final TOK peekNext2() 292 { 293 Token* t = peek(&token); 294 return peek(t).value; 295 } 296 297 /**************************** 298 * Turn next token in buffer into a token. 299 * Params: 300 * t = the token to set the resulting Token to 301 */ 302 final void scan(Token* t) 303 { 304 const lastLine = scanloc.linnum; 305 Loc startLoc; 306 t.blockComment = null; 307 t.lineComment = null; 308 309 while (1) 310 { 311 t.ptr = p; 312 //printf("p = %p, *p = '%c'\n",p,*p); 313 t.loc = loc(); 314 switch (*p) 315 { 316 case 0: 317 case 0x1A: 318 t.value = TOK.endOfFile; // end of file 319 // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile. 320 return; 321 case ' ': 322 // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary. 323 while ((cast(size_t)p) % uint.sizeof) 324 { 325 if (*p != ' ') 326 goto LendSkipFourSpaces; 327 p++; 328 } 329 while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20 330 p += 4; 331 // Skip over any remaining space on the line. 332 while (*p == ' ') 333 p++; 334 LendSkipFourSpaces: 335 version (DMDLIB) 336 { 337 if (whitespaceToken) 338 { 339 t.value = TOK.whitespace; 340 return; 341 } 342 } 343 continue; // skip white space 344 case '\t': 345 case '\v': 346 case '\f': 347 p++; 348 version (DMDLIB) 349 { 350 if (whitespaceToken) 351 { 352 t.value = TOK.whitespace; 353 return; 354 } 355 } 356 continue; // skip white space 357 case '\r': 358 p++; 359 if (*p != '\n') // if CR stands by itself 360 { 361 endOfLine(); 362 if (tokenizeNewlines) 363 { 364 t.value = TOK.endOfLine; 365 tokenizeNewlines = false; 366 return; 367 } 368 } 369 version (DMDLIB) 370 { 371 if (whitespaceToken) 372 { 373 t.value = TOK.whitespace; 374 return; 375 } 376 } 377 continue; // skip white space 378 case '\n': 379 p++; 380 endOfLine(); 381 if (tokenizeNewlines) 382 { 383 t.value = TOK.endOfLine; 384 tokenizeNewlines = false; 385 return; 386 } 387 version (DMDLIB) 388 { 389 if (whitespaceToken) 390 { 391 t.value = TOK.whitespace; 392 return; 393 } 394 } 395 continue; // skip white space 396 case '0': 397 if (!isZeroSecond(p[1])) // if numeric literal does not continue 398 { 399 ++p; 400 t.unsvalue = 0; 401 t.value = TOK.int32Literal; 402 return; 403 } 404 goto Lnumber; 405 406 case '1': .. case '9': 407 if (!isDigitSecond(p[1])) // if numeric literal does not continue 408 { 409 t.unsvalue = *p - '0'; 410 ++p; 411 t.value = TOK.int32Literal; 412 return; 413 } 414 Lnumber: 415 t.value = number(t); 416 return; 417 418 case '\'': 419 if (issinglechar(p[1]) && p[2] == '\'') 420 { 421 t.unsvalue = p[1]; // simple one character literal 422 t.value = TOK.charLiteral; 423 p += 3; 424 } 425 else if (Ccompile) 426 { 427 clexerCharConstant(*t, 0); 428 } 429 else 430 { 431 t.value = charConstant(t); 432 } 433 return; 434 435 case 'u': 436 case 'U': 437 case 'L': 438 if (!Ccompile) 439 goto case_ident; 440 if (p[1] == '\'') // C wide character constant 441 { 442 char c = *p; 443 if (c == 'L') // convert L to u or U 444 c = (wchar_tsize == 4) ? 'u' : 'U'; 445 ++p; 446 clexerCharConstant(*t, c); 447 return; 448 } 449 else if (p[1] == '\"') // C wide string literal 450 { 451 const c = *p; 452 ++p; 453 escapeStringConstant(t); 454 t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') : 455 c == 'u' ? 'w' : 456 'd'; 457 return; 458 } 459 else if (p[1] == '8' && p[2] == '\"') // C UTF-8 string literal 460 { 461 p += 2; 462 escapeStringConstant(t); 463 return; 464 } 465 goto case_ident; 466 467 case 'r': 468 if (Ccompile || p[1] != '"') 469 goto case_ident; 470 p++; 471 goto case '`'; 472 case '`': 473 if (Ccompile) 474 goto default; 475 wysiwygStringConstant(t); 476 return; 477 case 'q': 478 if (Ccompile) 479 goto case_ident; 480 if (p[1] == '"') 481 { 482 p++; 483 delimitedStringConstant(t); 484 return; 485 } 486 else if (p[1] == '{') 487 { 488 p++; 489 tokenStringConstant(t); 490 return; 491 } 492 else 493 goto case_ident; 494 case '"': 495 escapeStringConstant(t); 496 return; 497 case 'a': 498 case 'b': 499 case 'c': 500 case 'd': 501 case 'e': 502 case 'f': 503 case 'g': 504 case 'h': 505 case 'i': 506 case 'j': 507 case 'k': 508 case 'l': 509 case 'm': 510 case 'n': 511 case 'o': 512 case 'p': 513 /*case 'q': case 'r':*/ 514 case 's': 515 case 't': 516 //case 'u': 517 case 'v': 518 case 'w': 519 case 'x': 520 case 'y': 521 case 'z': 522 case 'A': 523 case 'B': 524 case 'C': 525 case 'D': 526 case 'E': 527 case 'F': 528 case 'G': 529 case 'H': 530 case 'I': 531 case 'J': 532 case 'K': 533 //case 'L': 534 case 'M': 535 case 'N': 536 case 'O': 537 case 'P': 538 case 'Q': 539 case 'R': 540 case 'S': 541 case 'T': 542 //case 'U': 543 case 'V': 544 case 'W': 545 case 'X': 546 case 'Y': 547 case 'Z': 548 case '_': 549 case_ident: 550 { 551 while (1) 552 { 553 const c = *++p; 554 if (isidchar(c)) 555 continue; 556 else if (c & 0x80) 557 { 558 const s = p; 559 const u = decodeUTF(); 560 if (isUniAlpha(u)) 561 continue; 562 error(t.loc, "char 0x%04x not allowed in identifier", u); 563 p = s; 564 } 565 break; 566 } 567 Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr)); 568 t.ident = id; 569 t.value = cast(TOK)id.getValue(); 570 571 anyToken = 1; 572 573 /* Different keywords for C and D 574 */ 575 if (Ccompile) 576 { 577 if (t.value != TOK.identifier) 578 { 579 t.value = Ckeywords[t.value]; // filter out D keywords 580 } 581 } 582 else if (t.value >= FirstCKeyword) 583 t.value = TOK.identifier; // filter out C keywords 584 585 else if (*t.ptr == '_') // if special identifier token 586 { 587 void toToken(const(char)[] s) 588 { 589 t.value = TOK.string_; 590 t.ustring = s.ptr; 591 t.len = cast(uint)s.length; 592 t.postfix = 0; 593 } 594 595 if (id == Id.DATE) 596 toToken(compileEnv.date); 597 else if (id == Id.TIME) 598 toToken(compileEnv.time); 599 else if (id == Id.VENDOR) 600 toToken(compileEnv.vendor); 601 else if (id == Id.TIMESTAMP) 602 toToken(compileEnv.timestamp); 603 else if (id == Id.VERSIONX) 604 { 605 t.value = TOK.int64Literal; 606 t.unsvalue = compileEnv.versionNumber; 607 } 608 else if (id == Id.EOFX) 609 { 610 t.value = TOK.endOfFile; 611 // Advance scanner to end of file 612 while (!(*p == 0 || *p == 0x1A)) 613 p++; 614 } 615 } 616 //printf("t.value = %d\n",t.value); 617 return; 618 } 619 case '/': 620 p++; 621 switch (*p) 622 { 623 case '=': 624 p++; 625 t.value = TOK.divAssign; 626 return; 627 case '*': 628 p++; 629 startLoc = loc(); 630 while (1) 631 { 632 while (1) 633 { 634 const c = *p; 635 switch (c) 636 { 637 case '/': 638 break; 639 case '\n': 640 endOfLine(); 641 p++; 642 continue; 643 case '\r': 644 p++; 645 if (*p != '\n') 646 endOfLine(); 647 continue; 648 case 0: 649 case 0x1A: 650 error(t.loc, "unterminated /* */ comment"); 651 p = end; 652 t.loc = loc(); 653 t.value = TOK.endOfFile; 654 return; 655 default: 656 if (c & 0x80) 657 { 658 const u = decodeUTF(); 659 if (u == PS || u == LS) 660 endOfLine(); 661 } 662 p++; 663 continue; 664 } 665 break; 666 } 667 p++; 668 if (p[-2] == '*' && p - 3 != t.ptr) 669 break; 670 } 671 if (commentToken) 672 { 673 t.loc = startLoc; 674 t.value = TOK.comment; 675 return; 676 } 677 else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr) 678 { 679 // if /** but not /**/ 680 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1); 681 lastDocLine = scanloc.linnum; 682 } 683 continue; 684 case '/': // do // style comments 685 startLoc = loc(); 686 while (1) 687 { 688 const c = *++p; 689 switch (c) 690 { 691 case '\n': 692 break; 693 case '\r': 694 if (p[1] == '\n') 695 p++; 696 break; 697 case 0: 698 case 0x1A: 699 if (commentToken) 700 { 701 p = end; 702 t.loc = startLoc; 703 t.value = TOK.comment; 704 return; 705 } 706 if (doDocComment && t.ptr[2] == '/') 707 { 708 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1); 709 lastDocLine = scanloc.linnum; 710 } 711 p = end; 712 t.loc = loc(); 713 t.value = TOK.endOfFile; 714 return; 715 default: 716 if (c & 0x80) 717 { 718 const u = decodeUTF(); 719 if (u == PS || u == LS) 720 break; 721 } 722 continue; 723 } 724 break; 725 } 726 if (commentToken) 727 { 728 version (DMDLIB) {} 729 else 730 { 731 p++; 732 endOfLine(); 733 } 734 t.loc = startLoc; 735 t.value = TOK.comment; 736 return; 737 } 738 if (doDocComment && t.ptr[2] == '/') 739 { 740 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1); 741 lastDocLine = scanloc.linnum; 742 } 743 p++; 744 endOfLine(); 745 continue; 746 case '+': 747 if (!Ccompile) 748 { 749 int nest; 750 startLoc = loc(); 751 p++; 752 nest = 1; 753 while (1) 754 { 755 char c = *p; 756 switch (c) 757 { 758 case '/': 759 p++; 760 if (*p == '+') 761 { 762 p++; 763 nest++; 764 } 765 continue; 766 case '+': 767 p++; 768 if (*p == '/') 769 { 770 p++; 771 if (--nest == 0) 772 break; 773 } 774 continue; 775 case '\r': 776 p++; 777 if (*p != '\n') 778 endOfLine(); 779 continue; 780 case '\n': 781 endOfLine(); 782 p++; 783 continue; 784 case 0: 785 case 0x1A: 786 error(t.loc, "unterminated /+ +/ comment"); 787 p = end; 788 t.loc = loc(); 789 t.value = TOK.endOfFile; 790 return; 791 default: 792 if (c & 0x80) 793 { 794 uint u = decodeUTF(); 795 if (u == PS || u == LS) 796 endOfLine(); 797 } 798 p++; 799 continue; 800 } 801 break; 802 } 803 if (commentToken) 804 { 805 t.loc = startLoc; 806 t.value = TOK.comment; 807 return; 808 } 809 if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr) 810 { 811 // if /++ but not /++/ 812 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1); 813 lastDocLine = scanloc.linnum; 814 } 815 continue; 816 } 817 break; 818 default: 819 break; 820 } 821 t.value = TOK.div; 822 return; 823 case '.': 824 p++; 825 if (isdigit(*p)) 826 { 827 /* Note that we don't allow ._1 and ._ as being 828 * valid floating point numbers. 829 */ 830 p--; 831 t.value = inreal(t); 832 } 833 else if (p[0] == '.') 834 { 835 if (p[1] == '.') 836 { 837 p += 2; 838 t.value = TOK.dotDotDot; 839 } 840 else 841 { 842 p++; 843 t.value = TOK.slice; 844 } 845 } 846 else 847 t.value = TOK.dot; 848 return; 849 case '&': 850 p++; 851 if (*p == '=') 852 { 853 p++; 854 t.value = TOK.andAssign; 855 } 856 else if (*p == '&') 857 { 858 p++; 859 t.value = TOK.andAnd; 860 } 861 else 862 t.value = TOK.and; 863 return; 864 case '|': 865 p++; 866 if (*p == '=') 867 { 868 p++; 869 t.value = TOK.orAssign; 870 } 871 else if (*p == '|') 872 { 873 p++; 874 t.value = TOK.orOr; 875 } 876 else 877 t.value = TOK.or; 878 return; 879 case '-': 880 p++; 881 if (*p == '=') 882 { 883 p++; 884 t.value = TOK.minAssign; 885 } 886 else if (*p == '-') 887 { 888 p++; 889 t.value = TOK.minusMinus; 890 } 891 else if (*p == '>') 892 { 893 ++p; 894 t.value = TOK.arrow; 895 } 896 else 897 t.value = TOK.min; 898 return; 899 case '+': 900 p++; 901 if (*p == '=') 902 { 903 p++; 904 t.value = TOK.addAssign; 905 } 906 else if (*p == '+') 907 { 908 p++; 909 t.value = TOK.plusPlus; 910 } 911 else 912 t.value = TOK.add; 913 return; 914 case '<': 915 p++; 916 if (*p == '=') 917 { 918 p++; 919 t.value = TOK.lessOrEqual; // <= 920 } 921 else if (*p == '<') 922 { 923 p++; 924 if (*p == '=') 925 { 926 p++; 927 t.value = TOK.leftShiftAssign; // <<= 928 } 929 else 930 t.value = TOK.leftShift; // << 931 } 932 else if (*p == ':' && Ccompile) 933 { 934 ++p; 935 t.value = TOK.leftBracket; // <: 936 } 937 else if (*p == '%' && Ccompile) 938 { 939 ++p; 940 t.value = TOK.leftCurly; // <% 941 } 942 else 943 t.value = TOK.lessThan; // < 944 return; 945 case '>': 946 p++; 947 if (*p == '=') 948 { 949 p++; 950 t.value = TOK.greaterOrEqual; // >= 951 } 952 else if (*p == '>') 953 { 954 p++; 955 if (*p == '=') 956 { 957 p++; 958 t.value = TOK.rightShiftAssign; // >>= 959 } 960 else if (*p == '>') 961 { 962 p++; 963 if (*p == '=') 964 { 965 p++; 966 t.value = TOK.unsignedRightShiftAssign; // >>>= 967 } 968 else 969 t.value = TOK.unsignedRightShift; // >>> 970 } 971 else 972 t.value = TOK.rightShift; // >> 973 } 974 else 975 t.value = TOK.greaterThan; // > 976 return; 977 case '!': 978 p++; 979 if (*p == '=') 980 { 981 p++; 982 t.value = TOK.notEqual; // != 983 } 984 else 985 t.value = TOK.not; // ! 986 return; 987 case '=': 988 p++; 989 if (*p == '=') 990 { 991 p++; 992 t.value = TOK.equal; // == 993 } 994 else if (*p == '>') 995 { 996 p++; 997 t.value = TOK.goesTo; // => 998 } 999 else 1000 t.value = TOK.assign; // = 1001 return; 1002 case '~': 1003 p++; 1004 if (*p == '=') 1005 { 1006 p++; 1007 t.value = TOK.concatenateAssign; // ~= 1008 } 1009 else 1010 t.value = TOK.tilde; // ~ 1011 return; 1012 case '^': 1013 p++; 1014 if (*p == '^') 1015 { 1016 p++; 1017 if (*p == '=') 1018 { 1019 p++; 1020 t.value = TOK.powAssign; // ^^= 1021 } 1022 else 1023 t.value = TOK.pow; // ^^ 1024 } 1025 else if (*p == '=') 1026 { 1027 p++; 1028 t.value = TOK.xorAssign; // ^= 1029 } 1030 else 1031 t.value = TOK.xor; // ^ 1032 return; 1033 case '(': 1034 p++; 1035 t.value = TOK.leftParenthesis; 1036 return; 1037 case ')': 1038 p++; 1039 t.value = TOK.rightParenthesis; 1040 return; 1041 case '[': 1042 p++; 1043 t.value = TOK.leftBracket; 1044 return; 1045 case ']': 1046 p++; 1047 t.value = TOK.rightBracket; 1048 return; 1049 case '{': 1050 p++; 1051 t.value = TOK.leftCurly; 1052 return; 1053 case '}': 1054 p++; 1055 t.value = TOK.rightCurly; 1056 return; 1057 case '?': 1058 p++; 1059 t.value = TOK.question; 1060 return; 1061 case ',': 1062 p++; 1063 t.value = TOK.comma; 1064 return; 1065 case ';': 1066 p++; 1067 t.value = TOK.semicolon; 1068 return; 1069 case ':': 1070 p++; 1071 if (*p == ':') 1072 { 1073 ++p; 1074 t.value = TOK.colonColon; 1075 } 1076 else if (*p == '>' && Ccompile) 1077 { 1078 ++p; 1079 t.value = TOK.rightBracket; 1080 } 1081 else 1082 t.value = TOK.colon; 1083 return; 1084 case '$': 1085 p++; 1086 t.value = TOK.dollar; 1087 return; 1088 case '@': 1089 p++; 1090 t.value = TOK.at; 1091 return; 1092 case '*': 1093 p++; 1094 if (*p == '=') 1095 { 1096 p++; 1097 t.value = TOK.mulAssign; 1098 } 1099 else 1100 t.value = TOK.mul; 1101 return; 1102 case '%': 1103 p++; 1104 if (*p == '=') 1105 { 1106 p++; 1107 t.value = TOK.modAssign; 1108 } 1109 else if (*p == '>' && Ccompile) 1110 { 1111 ++p; 1112 t.value = TOK.rightCurly; 1113 } 1114 else if (*p == ':' && Ccompile) 1115 { 1116 goto case '#'; // %: means # 1117 } 1118 else 1119 t.value = TOK.mod; 1120 return; 1121 case '#': 1122 { 1123 // https://issues.dlang.org/show_bug.cgi?id=22825 1124 // Special token sequences are terminated by newlines, 1125 // and should not be skipped over. 1126 this.tokenizeNewlines = true; 1127 p++; 1128 if (parseSpecialTokenSequence()) 1129 continue; 1130 t.value = TOK.pound; 1131 return; 1132 } 1133 default: 1134 { 1135 dchar c = *p; 1136 if (c & 0x80) 1137 { 1138 c = decodeUTF(); 1139 // Check for start of unicode identifier 1140 if (isUniAlpha(c)) 1141 goto case_ident; 1142 if (c == PS || c == LS) 1143 { 1144 endOfLine(); 1145 p++; 1146 if (tokenizeNewlines) 1147 { 1148 t.value = TOK.endOfLine; 1149 tokenizeNewlines = false; 1150 return; 1151 } 1152 continue; 1153 } 1154 } 1155 if (c < 0x80 && isprint(c)) 1156 error(t.loc, "character '%c' is not a valid token", c); 1157 else 1158 error(t.loc, "character 0x%02x is not a valid token", c); 1159 p++; 1160 continue; 1161 // assert(0); 1162 } 1163 } 1164 } 1165 } 1166 1167 final Token* peek(Token* ct) 1168 { 1169 Token* t; 1170 if (ct.next) 1171 t = ct.next; 1172 else 1173 { 1174 t = allocateToken(); 1175 scan(t); 1176 ct.next = t; 1177 } 1178 return t; 1179 } 1180 1181 /********************************* 1182 * tk is on the opening (. 1183 * Look ahead and return token that is past the closing ). 1184 */ 1185 final Token* peekPastParen(Token* tk) 1186 { 1187 //printf("peekPastParen()\n"); 1188 int parens = 1; 1189 int curlynest = 0; 1190 while (1) 1191 { 1192 tk = peek(tk); 1193 //tk.print(); 1194 switch (tk.value) 1195 { 1196 case TOK.leftParenthesis: 1197 parens++; 1198 continue; 1199 case TOK.rightParenthesis: 1200 --parens; 1201 if (parens) 1202 continue; 1203 tk = peek(tk); 1204 break; 1205 case TOK.leftCurly: 1206 curlynest++; 1207 continue; 1208 case TOK.rightCurly: 1209 if (--curlynest >= 0) 1210 continue; 1211 break; 1212 case TOK.semicolon: 1213 if (curlynest) 1214 continue; 1215 break; 1216 case TOK.endOfFile: 1217 break; 1218 default: 1219 continue; 1220 } 1221 return tk; 1222 } 1223 } 1224 1225 /******************************************* 1226 * Parse escape sequence. 1227 */ 1228 private uint escapeSequence(out dchar c2) 1229 { 1230 return Lexer.escapeSequence(token.loc, p, Ccompile, c2); 1231 } 1232 1233 /******** 1234 * Parse the given string literal escape sequence into a single character. 1235 * D https://dlang.org/spec/lex.html#escape_sequences 1236 * C11 6.4.4.4 1237 * Params: 1238 * loc = location to use for error messages 1239 * sequence = pointer to string with escape sequence to parse. Updated to 1240 * point past the end of the escape sequence 1241 * Ccompile = true for compile C11 escape sequences 1242 * c2 = returns second `dchar` of html entity with 2 code units, otherwise stays `dchar.init` 1243 * Returns: 1244 * the escape sequence as a single character 1245 */ 1246 private dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile, out dchar c2) 1247 { 1248 const(char)* p = sequence; // cache sequence reference on stack 1249 scope(exit) sequence = p; 1250 1251 uint c = *p; 1252 int ndigits; 1253 switch (c) 1254 { 1255 case '\'': 1256 case '"': 1257 case '?': 1258 case '\\': 1259 Lconsume: 1260 p++; 1261 break; 1262 case 'a': 1263 c = 7; 1264 goto Lconsume; 1265 case 'b': 1266 c = 8; 1267 goto Lconsume; 1268 case 'f': 1269 c = 12; 1270 goto Lconsume; 1271 case 'n': 1272 c = 10; 1273 goto Lconsume; 1274 case 'r': 1275 c = 13; 1276 goto Lconsume; 1277 case 't': 1278 c = 9; 1279 goto Lconsume; 1280 case 'v': 1281 c = 11; 1282 goto Lconsume; 1283 case 'u': 1284 ndigits = 4; 1285 goto Lhex; 1286 case 'U': 1287 ndigits = 8; 1288 goto Lhex; 1289 case 'x': 1290 ndigits = 2; 1291 Lhex: 1292 p++; 1293 c = *p; 1294 if (ishex(cast(char)c)) 1295 { 1296 uint v = 0; 1297 int n = 0; 1298 if (Ccompile && ndigits == 2) 1299 { 1300 /* C11 6.4.4.4-7 one to infinity hex digits 1301 */ 1302 do 1303 { 1304 if (isdigit(cast(char)c)) 1305 c -= '0'; 1306 else if (islower(c)) 1307 c -= 'a' - 10; 1308 else 1309 c -= 'A' - 10; 1310 v = v * 16 + c; 1311 c = *++p; 1312 } while (ishex(cast(char)c)); 1313 } 1314 else 1315 { 1316 while (1) 1317 { 1318 if (isdigit(cast(char)c)) 1319 c -= '0'; 1320 else if (islower(c)) 1321 c -= 'a' - 10; 1322 else 1323 c -= 'A' - 10; 1324 v = v * 16 + c; 1325 c = *++p; 1326 if (++n == ndigits) 1327 break; 1328 if (!ishex(cast(char)c)) 1329 { 1330 error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits); 1331 break; 1332 } 1333 } 1334 if (ndigits != 2 && !utf_isValidDchar(v)) 1335 { 1336 error(loc, "invalid UTF character \\U%08x", v); 1337 v = '?'; // recover with valid UTF character 1338 } 1339 } 1340 c = v; 1341 } 1342 else 1343 { 1344 error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c); 1345 p++; 1346 } 1347 break; 1348 case '&': 1349 if (Ccompile) 1350 goto default; 1351 1352 // named character entity 1353 for (const idstart = ++p; 1; p++) 1354 { 1355 switch (*p) 1356 { 1357 case ';': 1358 auto entity = HtmlNamedEntity(idstart[0 .. p - idstart]); 1359 c = entity[0]; 1360 if (entity == entity.init) 1361 { 1362 error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart); 1363 c = '?'; 1364 } 1365 if (entity[1] != entity.init[1]) 1366 c2 = entity[1]; 1367 1368 p++; 1369 break; 1370 default: 1371 if (isalpha(*p) || (p != idstart && isdigit(*p))) 1372 continue; 1373 error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart); 1374 c = '?'; 1375 break; 1376 } 1377 break; 1378 } 1379 break; 1380 case 0: 1381 case 0x1A: 1382 // end of file 1383 c = '\\'; 1384 break; 1385 default: 1386 if (isoctal(cast(char)c)) 1387 { 1388 uint v = 0; 1389 int n = 0; 1390 do 1391 { 1392 v = v * 8 + (c - '0'); 1393 c = *++p; 1394 } 1395 while (++n < 3 && isoctal(cast(char)c)); 1396 c = v; 1397 if (c > 0xFF) 1398 error(loc, "escape octal sequence \\%03o is larger than \\377", c); 1399 } 1400 else 1401 { 1402 error(loc, "undefined escape sequence \\%c", c); 1403 p++; 1404 } 1405 break; 1406 } 1407 return c; 1408 } 1409 1410 /** 1411 Lex a wysiwyg string. `p` must be pointing to the first character before the 1412 contents of the string literal. The character pointed to by `p` will be used as 1413 the terminating character (i.e. backtick or double-quote). 1414 Params: 1415 result = pointer to the token that accepts the result 1416 */ 1417 private void wysiwygStringConstant(Token* result) 1418 { 1419 result.value = TOK.string_; 1420 Loc start = loc(); 1421 auto terminator = p[0]; 1422 p++; 1423 stringbuffer.setsize(0); 1424 while (1) 1425 { 1426 dchar c = p[0]; 1427 p++; 1428 switch (c) 1429 { 1430 case '\n': 1431 endOfLine(); 1432 break; 1433 case '\r': 1434 if (p[0] == '\n') 1435 continue; // ignore 1436 c = '\n'; // treat EndOfLine as \n character 1437 endOfLine(); 1438 break; 1439 case 0: 1440 case 0x1A: 1441 error("unterminated string constant starting at %s", start.toChars()); 1442 result.setString(); 1443 // rewind `p` so it points to the EOF character 1444 p--; 1445 return; 1446 default: 1447 if (c == terminator) 1448 { 1449 result.setString(stringbuffer); 1450 stringPostfix(result); 1451 return; 1452 } 1453 else if (c & 0x80) 1454 { 1455 p--; 1456 const u = decodeUTF(); 1457 p++; 1458 if (u == PS || u == LS) 1459 endOfLine(); 1460 stringbuffer.writeUTF8(u); 1461 continue; 1462 } 1463 break; 1464 } 1465 stringbuffer.writeByte(c); 1466 } 1467 } 1468 1469 /** 1470 Lex a delimited string. Some examples of delimited strings are: 1471 --- 1472 q"(foo(xxx))" // "foo(xxx)" 1473 q"[foo$(LPAREN)]" // "foo$(LPAREN)" 1474 q"/foo]/" // "foo]" 1475 q"HERE 1476 foo 1477 HERE" // "foo\n" 1478 --- 1479 It is assumed that `p` points to the opening double-quote '"'. 1480 Params: 1481 result = pointer to the token that accepts the result 1482 */ 1483 private void delimitedStringConstant(Token* result) 1484 { 1485 result.value = TOK.string_; 1486 Loc start = loc(); 1487 dchar delimleft = 0; 1488 dchar delimright = 0; 1489 uint nest = 1; 1490 uint nestcount = ~0; // dead assignment, needed to suppress warning 1491 Identifier hereid = null; 1492 uint blankrol = 0; 1493 uint startline = 0; 1494 p++; 1495 stringbuffer.setsize(0); 1496 while (1) 1497 { 1498 const s = p; 1499 dchar c = *p++; 1500 //printf("c = '%c'\n", c); 1501 switch (c) 1502 { 1503 case '\n': 1504 Lnextline: 1505 endOfLine(); 1506 startline = 1; 1507 if (blankrol) 1508 { 1509 blankrol = 0; 1510 continue; 1511 } 1512 if (hereid) 1513 { 1514 stringbuffer.writeUTF8(c); 1515 continue; 1516 } 1517 break; 1518 case '\r': 1519 if (*p == '\n') 1520 continue; // ignore 1521 c = '\n'; // treat EndOfLine as \n character 1522 goto Lnextline; 1523 case 0: 1524 case 0x1A: 1525 error("unterminated delimited string constant starting at %s", start.toChars()); 1526 result.setString(); 1527 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token). 1528 p--; 1529 return; 1530 default: 1531 if (c & 0x80) 1532 { 1533 p--; 1534 c = decodeUTF(); 1535 p++; 1536 if (c == PS || c == LS) 1537 goto Lnextline; 1538 } 1539 break; 1540 } 1541 if (delimleft == 0) 1542 { 1543 delimleft = c; 1544 nest = 1; 1545 nestcount = 1; 1546 if (c == '(') 1547 delimright = ')'; 1548 else if (c == '{') 1549 delimright = '}'; 1550 else if (c == '[') 1551 delimright = ']'; 1552 else if (c == '<') 1553 delimright = '>'; 1554 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) 1555 { 1556 // Start of identifier; must be a heredoc 1557 Token tok; 1558 p = s; 1559 scan(&tok); // read in heredoc identifier 1560 if (tok.value != TOK.identifier) 1561 { 1562 error("identifier expected for heredoc, not %s", tok.toChars()); 1563 delimright = c; 1564 } 1565 else 1566 { 1567 hereid = tok.ident; 1568 //printf("hereid = '%s'\n", hereid.toChars()); 1569 blankrol = 1; 1570 } 1571 nest = 0; 1572 } 1573 else 1574 { 1575 delimright = c; 1576 nest = 0; 1577 if (isspace(c)) 1578 error("delimiter cannot be whitespace"); 1579 } 1580 } 1581 else 1582 { 1583 if (blankrol) 1584 { 1585 error("heredoc rest of line should be blank"); 1586 blankrol = 0; 1587 continue; 1588 } 1589 if (nest == 1) 1590 { 1591 if (c == delimleft) 1592 nestcount++; 1593 else if (c == delimright) 1594 { 1595 nestcount--; 1596 if (nestcount == 0) 1597 goto Ldone; 1598 } 1599 } 1600 else if (c == delimright) 1601 goto Ldone; 1602 if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid) 1603 { 1604 Token tok; 1605 auto psave = p; 1606 p = s; 1607 scan(&tok); // read in possible heredoc identifier 1608 //printf("endid = '%s'\n", tok.ident.toChars()); 1609 if (tok.value == TOK.identifier && tok.ident is hereid) 1610 { 1611 /* should check that rest of line is blank 1612 */ 1613 goto Ldone; 1614 } 1615 p = psave; 1616 } 1617 stringbuffer.writeUTF8(c); 1618 startline = 0; 1619 } 1620 } 1621 Ldone: 1622 if (*p == '"') 1623 p++; 1624 else if (hereid) 1625 error("delimited string must end in `%s\"`", hereid.toChars()); 1626 else if (isspace(delimright)) 1627 error("delimited string must end in `\"`"); 1628 else 1629 error(token.loc, "delimited string must end in `%c\"`", delimright); 1630 result.setString(stringbuffer); 1631 stringPostfix(result); 1632 } 1633 1634 /** 1635 Lex a token string. Some examples of token strings are: 1636 --- 1637 q{ foo(xxx) } // " foo(xxx) " 1638 q{foo$(LPAREN)} // "foo$(LPAREN)" 1639 q{{foo}"}"} // "{foo}"}"" 1640 --- 1641 It is assumed that `p` points to the opening curly-brace. 1642 Params: 1643 result = pointer to the token that accepts the result 1644 */ 1645 private void tokenStringConstant(Token* result) 1646 { 1647 result.value = TOK.string_; 1648 1649 uint nest = 1; 1650 const start = loc(); 1651 const pstart = ++p; 1652 inTokenStringConstant++; 1653 scope(exit) inTokenStringConstant--; 1654 while (1) 1655 { 1656 Token tok; 1657 scan(&tok); 1658 switch (tok.value) 1659 { 1660 case TOK.leftCurly: 1661 nest++; 1662 continue; 1663 case TOK.rightCurly: 1664 if (--nest == 0) 1665 { 1666 result.setString(pstart, p - 1 - pstart); 1667 stringPostfix(result); 1668 return; 1669 } 1670 continue; 1671 case TOK.endOfFile: 1672 error("unterminated token string constant starting at %s", start.toChars()); 1673 result.setString(); 1674 return; 1675 default: 1676 continue; 1677 } 1678 } 1679 } 1680 1681 /** 1682 Scan a quoted string while building the processed string value by 1683 handling escape sequences. The result is returned in the given `t` token. 1684 This function assumes that `p` currently points to the opening quote 1685 of the string. 1686 Params: 1687 t = the token to set the resulting string to 1688 * References: 1689 * D https://dlang.org/spec/lex.html#double_quoted_strings 1690 * ImportC C11 6.4.5 1691 */ 1692 private void escapeStringConstant(Token* t) 1693 { 1694 t.value = TOK.string_; 1695 1696 const start = loc(); 1697 const tc = *p++; // opening quote 1698 stringbuffer.setsize(0); 1699 while (1) 1700 { 1701 dchar c = *p++; 1702 dchar c2; 1703 switch (c) 1704 { 1705 case '\\': 1706 switch (*p) 1707 { 1708 case '&': 1709 if (Ccompile) 1710 goto default; 1711 1712 c = escapeSequence(c2); 1713 stringbuffer.writeUTF8(c); 1714 if (c2 != dchar.init) 1715 stringbuffer.writeUTF8(c2); 1716 continue; 1717 case 'u': 1718 case 'U': 1719 c = escapeSequence(c2); 1720 stringbuffer.writeUTF8(c); 1721 continue; 1722 default: 1723 c = escapeSequence(c2); 1724 break; 1725 } 1726 break; 1727 case '\n': 1728 endOfLine(); 1729 if (Ccompile) 1730 goto Lunterminated; 1731 break; 1732 case '\r': 1733 if (*p == '\n') 1734 continue; // ignore 1735 c = '\n'; // treat EndOfLine as \n character 1736 endOfLine(); 1737 if (Ccompile) 1738 goto Lunterminated; 1739 break; 1740 case '\'': 1741 case '"': 1742 if (c != tc) 1743 goto default; 1744 t.setString(stringbuffer); 1745 if (!Ccompile) 1746 stringPostfix(t); 1747 return; 1748 case 0: 1749 case 0x1A: 1750 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token). 1751 p--; 1752 Lunterminated: 1753 error("unterminated string constant starting at %s", start.toChars()); 1754 t.setString(); 1755 return; 1756 default: 1757 if (c & 0x80) 1758 { 1759 p--; 1760 c = decodeUTF(); 1761 if (c == LS || c == PS) 1762 { 1763 c = '\n'; 1764 endOfLine(); 1765 if (Ccompile) 1766 goto Lunterminated; 1767 } 1768 p++; 1769 stringbuffer.writeUTF8(c); 1770 continue; 1771 } 1772 break; 1773 } 1774 stringbuffer.writeByte(c); 1775 } 1776 } 1777 1778 /************************************** 1779 * Reference: 1780 * https://dlang.org/spec/lex.html#characterliteral 1781 */ 1782 private TOK charConstant(Token* t) 1783 { 1784 TOK tk = TOK.charLiteral; 1785 //printf("Lexer::charConstant\n"); 1786 p++; 1787 dchar c = *p++; 1788 dchar c2; 1789 switch (c) 1790 { 1791 case '\\': 1792 switch (*p) 1793 { 1794 case 'u': 1795 tk = TOK.wcharLiteral; 1796 goto default; 1797 case 'U': 1798 case '&': 1799 tk = TOK.dcharLiteral; 1800 goto default; 1801 default: 1802 t.unsvalue = escapeSequence(c2); 1803 if (c2 != c2.init) 1804 { 1805 error("html entity requires 2 code units, use a string instead of a character"); 1806 t.unsvalue = '?'; 1807 } 1808 break; 1809 } 1810 break; 1811 case '\n': 1812 L1: 1813 endOfLine(); 1814 goto case; 1815 case '\r': 1816 goto case '\''; 1817 case 0: 1818 case 0x1A: 1819 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token). 1820 p--; 1821 goto case; 1822 case '\'': 1823 error("unterminated character constant"); 1824 t.unsvalue = '?'; 1825 return tk; 1826 default: 1827 if (c & 0x80) 1828 { 1829 p--; 1830 c = decodeUTF(); 1831 p++; 1832 if (c == LS || c == PS) 1833 goto L1; 1834 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE)) 1835 tk = TOK.wcharLiteral; 1836 else 1837 tk = TOK.dcharLiteral; 1838 } 1839 t.unsvalue = c; 1840 break; 1841 } 1842 if (*p != '\'') 1843 { 1844 while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' && 1845 *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}') 1846 { 1847 if (*p & 0x80) 1848 { 1849 const s = p; 1850 c = decodeUTF(); 1851 if (c == LS || c == PS) 1852 { 1853 p = s; 1854 break; 1855 } 1856 } 1857 p++; 1858 } 1859 1860 if (*p == '\'') 1861 { 1862 error("character constant has multiple characters"); 1863 p++; 1864 } 1865 else 1866 error("unterminated character constant"); 1867 t.unsvalue = '?'; 1868 return tk; 1869 } 1870 p++; 1871 return tk; 1872 } 1873 1874 /*************************************** 1875 * Lex C character constant. 1876 * Parser is on the opening quote. 1877 * Params: 1878 * t = token to fill in 1879 * prefix = one of `u`, `U` or 0. 1880 * Reference: 1881 * C11 6.4.4.4 1882 */ 1883 private void clexerCharConstant(ref Token t, char prefix) 1884 { 1885 escapeStringConstant(&t); 1886 const(char)[] str = t.ustring[0 .. t.len]; 1887 const n = str.length; 1888 const loc = t.loc; 1889 if (n == 0) 1890 { 1891 error(loc, "empty character constant"); 1892 t.value = TOK.semicolon; 1893 return; 1894 } 1895 1896 uint u; 1897 switch (prefix) 1898 { 1899 case 0: 1900 if (n == 1) // fast case 1901 { 1902 u = str[0]; 1903 } 1904 else if (n > 4) 1905 error(loc, "max number of chars in character literal is 4, had %d", 1906 cast(int)n); 1907 else 1908 { 1909 foreach (i, c; str) 1910 (cast(char*)&u)[n - 1 - i] = c; 1911 } 1912 break; 1913 1914 case 'u': 1915 dchar d1; 1916 size_t idx; 1917 auto msg = utf_decodeChar(str, idx, d1); 1918 dchar d2 = 0; 1919 if (idx < n && !msg) 1920 msg = utf_decodeChar(str, idx, d2); 1921 if (msg) 1922 error(loc, "%.*s", cast(int)msg.length, msg.ptr); 1923 else if (idx < n) 1924 error(loc, "max number of chars in 16 bit character literal is 2, had %d", 1925 cast(int)((n + 1) >> 1)); 1926 else if (d1 > 0x1_0000) 1927 error(loc, "%d does not fit in 16 bits", d1); 1928 else if (d2 > 0x1_0000) 1929 error(loc, "%d does not fit in 16 bits", d2); 1930 u = d1; 1931 if (d2) 1932 u = (d1 << 16) | d2; 1933 break; 1934 1935 case 'U': 1936 dchar d; 1937 size_t idx; 1938 auto msg = utf_decodeChar(str, idx, d); 1939 if (msg) 1940 error(loc, "%.*s", cast(int)msg.length, msg.ptr); 1941 else if (idx < n) 1942 error(loc, "max number of chars in 32 bit character literal is 1, had %d", 1943 cast(int)((n + 3) >> 2)); 1944 u = d; 1945 break; 1946 1947 default: 1948 assert(0); 1949 } 1950 t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal; 1951 t.unsvalue = u; 1952 } 1953 1954 /*************************************** 1955 * Get postfix of string literal. 1956 */ 1957 private void stringPostfix(Token* t) pure @nogc 1958 { 1959 switch (*p) 1960 { 1961 case 'c': 1962 case 'w': 1963 case 'd': 1964 t.postfix = *p; 1965 p++; 1966 break; 1967 default: 1968 t.postfix = 0; 1969 break; 1970 } 1971 } 1972 1973 /************************************** 1974 * Read in a number. 1975 * If it's an integer, store it in tok.TKutok.Vlong. 1976 * integers can be decimal, octal or hex 1977 * Handle the suffixes U, UL, LU, L, etc. 1978 * If it's double, store it in tok.TKutok.Vdouble. 1979 * Returns: 1980 * TKnum 1981 * TKdouble,... 1982 */ 1983 private TOK number(Token* t) 1984 { 1985 int base = 10; 1986 const start = p; 1987 ulong n = 0; // unsigned >=64 bit integer type 1988 int d; 1989 bool err = false; 1990 bool overflow = false; 1991 bool anyBinaryDigitsNoSingleUS = false; 1992 bool anyHexDigitsNoSingleUS = false; 1993 char errorDigit = 0; 1994 dchar c = *p; 1995 if (c == '0') 1996 { 1997 ++p; 1998 c = *p; 1999 switch (c) 2000 { 2001 case '0': 2002 case '1': 2003 case '2': 2004 case '3': 2005 case '4': 2006 case '5': 2007 case '6': 2008 case '7': 2009 base = 8; 2010 break; 2011 2012 case '8': 2013 case '9': 2014 errorDigit = cast(char) c; 2015 base = 8; 2016 break; 2017 case 'x': 2018 case 'X': 2019 ++p; 2020 base = 16; 2021 break; 2022 case 'b': 2023 case 'B': 2024 ++p; 2025 base = 2; 2026 break; 2027 case '.': 2028 if (p[1] == '.') 2029 goto Ldone; // if ".." 2030 if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80) 2031 { 2032 if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L')) 2033 goto Lreal; // if `0.f` or `0.L` 2034 goto Ldone; // if ".identifier" or ".unicode" 2035 } 2036 goto Lreal; // '.' is part of current token 2037 case 'i': 2038 case 'f': 2039 case 'F': 2040 goto Lreal; 2041 case '_': 2042 if (Ccompile) 2043 error("embedded `_` not allowed"); 2044 ++p; 2045 base = 8; 2046 break; 2047 case 'L': 2048 if (p[1] == 'i') 2049 goto Lreal; 2050 break; 2051 default: 2052 break; 2053 } 2054 } 2055 while (1) 2056 { 2057 c = *p; 2058 switch (c) 2059 { 2060 case '0': 2061 case '1': 2062 case '2': 2063 case '3': 2064 case '4': 2065 case '5': 2066 case '6': 2067 case '7': 2068 case '8': 2069 case '9': 2070 ++p; 2071 d = c - '0'; 2072 break; 2073 case 'a': 2074 case 'b': 2075 case 'c': 2076 case 'd': 2077 case 'e': 2078 case 'f': 2079 case 'A': 2080 case 'B': 2081 case 'C': 2082 case 'D': 2083 case 'E': 2084 case 'F': 2085 ++p; 2086 if (base != 16) 2087 { 2088 if (c == 'e' || c == 'E' || c == 'f' || c == 'F') 2089 goto Lreal; 2090 } 2091 if (c >= 'a') 2092 d = c + 10 - 'a'; 2093 else 2094 d = c + 10 - 'A'; 2095 break; 2096 case 'L': 2097 if (p[1] == 'i') 2098 goto Lreal; 2099 goto Ldone; 2100 case '.': 2101 if (p[1] == '.') 2102 goto Ldone; // if ".." 2103 if (base <= 10 && n > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)) 2104 { 2105 if (Ccompile && base == 10 && 2106 (p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L')) 2107 goto Lreal; // if `1.e6` or `1.f` or `1.L` 2108 goto Ldone; // if ".identifier" or ".unicode" 2109 } 2110 if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80)) 2111 goto Ldone; // if ".identifier" or ".unicode" 2112 if (base == 2) 2113 goto Ldone; // if ".identifier" or ".unicode" 2114 goto Lreal; // otherwise as part of a floating point literal 2115 case 'p': 2116 case 'P': 2117 case 'i': 2118 Lreal: 2119 p = start; 2120 return inreal(t); 2121 case '_': 2122 if (Ccompile) 2123 goto default; 2124 ++p; 2125 continue; 2126 default: 2127 goto Ldone; 2128 } 2129 // got a digit here, set any necessary flags, check for errors 2130 anyHexDigitsNoSingleUS = true; 2131 anyBinaryDigitsNoSingleUS = true; 2132 if (!errorDigit && d >= base) 2133 { 2134 errorDigit = cast(char) c; 2135 } 2136 // Avoid expensive overflow check if we aren't at risk of overflow 2137 if (n <= 0x0FFF_FFFF_FFFF_FFFFUL) 2138 n = n * base + d; 2139 else 2140 { 2141 import core.checkedint : mulu, addu; 2142 2143 n = mulu(n, base, overflow); 2144 n = addu(n, d, overflow); 2145 } 2146 } 2147 Ldone: 2148 if (errorDigit) 2149 { 2150 error(token.loc, "%s digit expected, not `%c`", base == 2 ? "binary".ptr : 2151 base == 8 ? "octal".ptr : 2152 "decimal".ptr, errorDigit); 2153 err = true; 2154 } 2155 if (overflow && !err) 2156 { 2157 error("integer overflow"); 2158 err = true; 2159 } 2160 if ((base == 2 && !anyBinaryDigitsNoSingleUS) || 2161 (base == 16 && !anyHexDigitsNoSingleUS)) 2162 error(token.loc, "`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start); 2163 2164 t.unsvalue = n; 2165 2166 if (Ccompile) 2167 return cnumber(base, n); 2168 2169 enum FLAGS : int 2170 { 2171 none = 0, 2172 decimal = 1, // decimal 2173 unsigned = 2, // u or U suffix 2174 long_ = 4, // L suffix 2175 } 2176 2177 FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none; 2178 // Parse trailing 'u', 'U', 'l' or 'L' in any combination 2179 const psuffix = p; 2180 while (1) 2181 { 2182 FLAGS f; 2183 switch (*p) 2184 { 2185 case 'U': 2186 case 'u': 2187 f = FLAGS.unsigned; 2188 goto L1; 2189 case 'l': 2190 f = FLAGS.long_; 2191 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead"); 2192 goto L1; 2193 case 'L': 2194 f = FLAGS.long_; 2195 L1: 2196 p++; 2197 if ((flags & f) && !err) 2198 { 2199 error("unrecognized token"); 2200 err = true; 2201 } 2202 flags = cast(FLAGS)(flags | f); 2203 continue; 2204 default: 2205 break; 2206 } 2207 break; 2208 } 2209 if (base == 8 && n >= 8) 2210 { 2211 if (err) 2212 // can't translate invalid octal value, just show a generic message 2213 error("octal literals larger than 7 are no longer supported"); 2214 else 2215 error(token.loc, "octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead", 2216 n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix); 2217 } 2218 TOK result; 2219 switch (flags) 2220 { 2221 case FLAGS.none: 2222 /* Octal or Hexadecimal constant. 2223 * First that fits: int, uint, long, ulong 2224 */ 2225 if (n & 0x8000000000000000L) 2226 result = TOK.uns64Literal; 2227 else if (n & 0xFFFFFFFF00000000L) 2228 result = TOK.int64Literal; 2229 else if (n & 0x80000000) 2230 result = TOK.uns32Literal; 2231 else 2232 result = TOK.int32Literal; 2233 break; 2234 case FLAGS.decimal: 2235 /* First that fits: int, long, long long 2236 */ 2237 if (n & 0x8000000000000000L) 2238 { 2239 result = TOK.uns64Literal; 2240 } 2241 else if (n & 0xFFFFFFFF80000000L) 2242 result = TOK.int64Literal; 2243 else 2244 result = TOK.int32Literal; 2245 break; 2246 case FLAGS.unsigned: 2247 case FLAGS.decimal | FLAGS.unsigned: 2248 /* First that fits: uint, ulong 2249 */ 2250 if (n & 0xFFFFFFFF00000000L) 2251 result = TOK.uns64Literal; 2252 else 2253 result = TOK.uns32Literal; 2254 break; 2255 case FLAGS.decimal | FLAGS.long_: 2256 if (n & 0x8000000000000000L) 2257 { 2258 if (!err) 2259 { 2260 error("signed integer overflow"); 2261 err = true; 2262 } 2263 result = TOK.uns64Literal; 2264 } 2265 else 2266 result = TOK.int64Literal; 2267 break; 2268 case FLAGS.long_: 2269 if (n & 0x8000000000000000L) 2270 result = TOK.uns64Literal; 2271 else 2272 result = TOK.int64Literal; 2273 break; 2274 case FLAGS.unsigned | FLAGS.long_: 2275 case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_: 2276 result = TOK.uns64Literal; 2277 break; 2278 default: 2279 debug 2280 { 2281 printf("%x\n", flags); 2282 } 2283 assert(0); 2284 } 2285 return result; 2286 } 2287 2288 /************************************** 2289 * Lex C integer-suffix 2290 * Params: 2291 * base = number base 2292 * n = raw integer value 2293 * Returns: 2294 * token value 2295 */ 2296 private TOK cnumber(int base, ulong n) 2297 { 2298 /* C11 6.4.4.1 2299 * Parse trailing suffixes: 2300 * u or U 2301 * l or L 2302 * ll or LL 2303 */ 2304 enum FLAGS : uint 2305 { 2306 octalhex = 1, // octal or hexadecimal 2307 decimal = 2, // decimal 2308 unsigned = 4, // u or U suffix 2309 long_ = 8, // l or L suffix 2310 llong = 0x10 // ll or LL 2311 } 2312 FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex; 2313 bool err; 2314 Lsuffixes: 2315 while (1) 2316 { 2317 FLAGS f; 2318 const cs = *p; 2319 switch (cs) 2320 { 2321 case 'U': 2322 case 'u': 2323 f = FLAGS.unsigned; 2324 break; 2325 2326 case 'l': 2327 case 'L': 2328 f = FLAGS.long_; 2329 if (cs == p[1]) 2330 { 2331 f = FLAGS.long_ | FLAGS.llong; 2332 ++p; 2333 } 2334 break; 2335 2336 default: 2337 break Lsuffixes; 2338 } 2339 ++p; 2340 if ((flags & f) && !err) 2341 { 2342 error("duplicate integer suffixes"); 2343 err = true; 2344 } 2345 flags = cast(FLAGS)(flags | f); 2346 } 2347 2348 TOK result = TOK.int32Literal; // default 2349 switch (flags) 2350 { 2351 /* Since D doesn't have a variable sized `long` or `unsigned long` type, 2352 * this code deviates from C by picking D int, uint, long, or ulong instead 2353 */ 2354 2355 case FLAGS.octalhex: 2356 /* Octal or Hexadecimal constant. 2357 * First that fits: int, unsigned, long, unsigned long, 2358 * long long, unsigned long long 2359 */ 2360 if (n & 0x8000000000000000L) 2361 result = TOK.uns64Literal; // unsigned long 2362 else if (n & 0xFFFFFFFF00000000L) 2363 result = TOK.int64Literal; // long 2364 else if (n & 0x80000000) 2365 result = TOK.uns32Literal; 2366 else 2367 result = TOK.int32Literal; 2368 break; 2369 2370 case FLAGS.decimal: 2371 /* First that fits: int, long, long long 2372 */ 2373 if (n & 0x8000000000000000L) 2374 result = TOK.uns64Literal; // unsigned long 2375 else if (n & 0xFFFFFFFF80000000L) 2376 result = TOK.int64Literal; // long 2377 else 2378 result = TOK.int32Literal; 2379 break; 2380 2381 case FLAGS.octalhex | FLAGS.unsigned: 2382 case FLAGS.decimal | FLAGS.unsigned: 2383 /* First that fits: unsigned, unsigned long, unsigned long long 2384 */ 2385 if (n & 0xFFFFFFFF00000000L) 2386 result = TOK.uns64Literal; // unsigned long 2387 else 2388 result = TOK.uns32Literal; 2389 break; 2390 2391 case FLAGS.decimal | FLAGS.long_: 2392 /* First that fits: long, long long 2393 */ 2394 if (longsize == 4 || long_longsize == 4) 2395 { 2396 if (n & 0xFFFFFFFF_80000000L) 2397 result = TOK.int64Literal; 2398 else 2399 result = TOK.int32Literal; // long 2400 } 2401 else 2402 { 2403 result = TOK.int64Literal; // long 2404 } 2405 break; 2406 2407 case FLAGS.octalhex | FLAGS.long_: 2408 /* First that fits: long, unsigned long, long long, 2409 * unsigned long long 2410 */ 2411 if (longsize == 4 || long_longsize == 4) 2412 { 2413 if (n & 0x8000000000000000L) 2414 result = TOK.uns64Literal; 2415 else if (n & 0xFFFFFFFF00000000L) 2416 result = TOK.int64Literal; 2417 else if (n & 0x80000000) 2418 result = TOK.uns32Literal; // unsigned long 2419 else 2420 result = TOK.int32Literal; // long 2421 } 2422 else 2423 { 2424 if (n & 0x80000000_00000000L) 2425 result = TOK.uns64Literal; // unsigned long 2426 else 2427 result = TOK.int64Literal; // long 2428 } 2429 break; 2430 2431 case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_: 2432 case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_: 2433 /* First that fits: unsigned long, unsigned long long 2434 */ 2435 if (longsize == 4 || long_longsize == 4) 2436 { 2437 if (n & 0xFFFFFFFF00000000L) 2438 result = TOK.uns64Literal; 2439 else 2440 result = TOK.uns32Literal; // unsigned long 2441 } 2442 else 2443 { 2444 result = TOK.uns64Literal; // unsigned long 2445 } 2446 break; 2447 2448 case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong: 2449 /* First that fits: long long, unsigned long long 2450 */ 2451 if (n & 0x8000000000000000L) 2452 result = TOK.uns64Literal; 2453 else 2454 result = TOK.int64Literal; 2455 break; 2456 2457 case FLAGS.decimal | FLAGS.long_ | FLAGS.llong: 2458 /* long long 2459 */ 2460 result = TOK.int64Literal; 2461 break; 2462 2463 case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong: 2464 case FLAGS.decimal | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong: 2465 result = TOK.uns64Literal; 2466 break; 2467 2468 default: 2469 debug printf("%x\n",flags); 2470 assert(0); 2471 } 2472 return result; 2473 } 2474 2475 /************************************** 2476 * Read in characters, converting them to real. 2477 * Bugs: 2478 * Exponent overflow not detected. 2479 * Too much requested precision is not detected. 2480 */ 2481 private TOK inreal(Token* t) 2482 { 2483 //printf("Lexer::inreal()\n"); 2484 debug 2485 { 2486 assert(*p == '.' || isdigit(*p)); 2487 } 2488 bool isWellformedString = true; 2489 stringbuffer.setsize(0); 2490 auto pstart = p; 2491 bool hex = false; 2492 dchar c = *p++; 2493 // Leading '0x' 2494 if (c == '0') 2495 { 2496 c = *p++; 2497 if (c == 'x' || c == 'X') 2498 { 2499 hex = true; 2500 c = *p++; 2501 } 2502 } 2503 // Digits to left of '.' 2504 while (1) 2505 { 2506 if (c == '.') 2507 { 2508 c = *p++; 2509 break; 2510 } 2511 if (isdigit(c) || (hex && isxdigit(c)) || c == '_') 2512 { 2513 c = *p++; 2514 continue; 2515 } 2516 break; 2517 } 2518 // Digits to right of '.' 2519 while (1) 2520 { 2521 if (isdigit(c) || (hex && isxdigit(c)) || c == '_') 2522 { 2523 c = *p++; 2524 continue; 2525 } 2526 break; 2527 } 2528 if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P'))) 2529 { 2530 c = *p++; 2531 if (c == '-' || c == '+') 2532 { 2533 c = *p++; 2534 } 2535 bool anyexp = false; 2536 while (1) 2537 { 2538 if (isdigit(c)) 2539 { 2540 anyexp = true; 2541 c = *p++; 2542 continue; 2543 } 2544 if (c == '_') 2545 { 2546 if (Ccompile) 2547 error("embedded `_` in numeric literals not allowed"); 2548 c = *p++; 2549 continue; 2550 } 2551 if (!anyexp) 2552 { 2553 error("missing exponent"); 2554 isWellformedString = false; 2555 } 2556 break; 2557 } 2558 } 2559 else if (hex) 2560 { 2561 error("exponent required for hex float"); 2562 isWellformedString = false; 2563 } 2564 --p; 2565 while (pstart < p) 2566 { 2567 if (*pstart != '_') 2568 stringbuffer.writeByte(*pstart); 2569 ++pstart; 2570 } 2571 stringbuffer.writeByte(0); 2572 auto sbufptr = cast(const(char)*)stringbuffer[].ptr; 2573 TOK result; 2574 bool isOutOfRange = false; 2575 t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, isOutOfRange) : CTFloat.zero); 2576 2577 bool imaginary = false; 2578 if (*p == 'i' && Ccompile) 2579 { 2580 ++p; 2581 imaginary = true; 2582 } 2583 2584 switch (*p) 2585 { 2586 case 'F': 2587 case 'f': 2588 if (isWellformedString && !isOutOfRange) 2589 isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr); 2590 result = TOK.float32Literal; 2591 p++; 2592 break; 2593 default: 2594 if (isWellformedString && !isOutOfRange) 2595 isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr); 2596 result = TOK.float64Literal; 2597 break; 2598 case 'l': 2599 if (!Ccompile) 2600 error("use 'L' suffix instead of 'l'"); 2601 goto case 'L'; 2602 case 'L': 2603 ++p; 2604 if (Ccompile && long_doublesize == 8) 2605 goto default; 2606 result = TOK.float80Literal; 2607 break; 2608 } 2609 2610 if ((*p == 'i' || *p == 'I') && !Ccompile) 2611 { 2612 if (*p == 'I') 2613 error("use 'i' suffix instead of 'I'"); 2614 p++; 2615 imaginary = true; 2616 } 2617 2618 if (imaginary) 2619 { 2620 switch (result) 2621 { 2622 case TOK.float32Literal: 2623 result = TOK.imaginary32Literal; 2624 break; 2625 case TOK.float64Literal: 2626 result = TOK.imaginary64Literal; 2627 break; 2628 case TOK.float80Literal: 2629 result = TOK.imaginary80Literal; 2630 break; 2631 default: 2632 break; 2633 } 2634 } 2635 const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal); 2636 if (isOutOfRange && !isLong && (!Ccompile || hex)) 2637 { 2638 /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex 2639 */ 2640 const char* suffix = result == TOK.float32Literal ? "f" : result == TOK.float80Literal ? "L" : ""; 2641 const char* type = [TOK.float32Literal: "`float`".ptr, 2642 TOK.float64Literal: "`double`".ptr, 2643 TOK.float80Literal: "`real` for the current target".ptr][result]; 2644 error(scanloc, "number `%s%s` is not representable as a %s", sbufptr, suffix, type); 2645 const char* extra = result == TOK.float64Literal ? "`real` literals can be written using the `L` suffix. " : ""; 2646 eSink.errorSupplemental(scanloc, "%shttps://dlang.org/spec/lex.html#floatliteral", extra); 2647 } 2648 debug 2649 { 2650 switch (result) 2651 { 2652 case TOK.float32Literal: 2653 case TOK.float64Literal: 2654 case TOK.float80Literal: 2655 case TOK.imaginary32Literal: 2656 case TOK.imaginary64Literal: 2657 case TOK.imaginary80Literal: 2658 break; 2659 default: 2660 assert(0); 2661 } 2662 } 2663 return result; 2664 } 2665 2666 final Loc loc() pure @nogc 2667 { 2668 scanloc.charnum = cast(uint)(1 + p - line); 2669 version (LocOffset) 2670 scanloc.fileOffset = cast(uint)(p - base); 2671 return scanloc; 2672 } 2673 2674 void error(T...)(const(char)* format, T args) 2675 { 2676 eSink.error(token.loc, format, args); 2677 } 2678 2679 void error(T...)(const ref Loc loc, const(char)* format, T args) 2680 { 2681 eSink.error(loc, format, args); 2682 } 2683 2684 void deprecation(T...)(const ref Loc loc, const(char)* format, T args) 2685 { 2686 eSink.deprecation(loc, format, args); 2687 } 2688 2689 void deprecation(T...)(const(char)* format, T args) 2690 { 2691 eSink.deprecation(token.loc, format, args); 2692 } 2693 2694 void deprecationSupplemental(T...)(const(char)* format, T args) 2695 { 2696 eSink.deprecationSupplemental(token.loc, format, args); 2697 } 2698 2699 /*************************************** 2700 * Parse special token sequence: 2701 * Returns: 2702 * true if the special token sequence was handled 2703 * References: 2704 * https://dlang.org/spec/lex.html#special-token-sequence 2705 */ 2706 bool parseSpecialTokenSequence() 2707 { 2708 Token n; 2709 scan(&n); 2710 if (n.value == TOK.identifier) 2711 { 2712 if (n.ident == Id.line) 2713 { 2714 poundLine(n, false); 2715 return true; 2716 } 2717 else 2718 { 2719 const locx = loc(); 2720 // @@@DEPRECATED_2.103@@@ 2721 // Turn into an error in 2.113 2722 if (inTokenStringConstant) 2723 deprecation(locx, "token string requires valid D tokens, not `#%s`", n.ident.toChars()); 2724 else 2725 error(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars()); 2726 } 2727 } 2728 else if (n.value == TOK.if_) 2729 { 2730 const locx = loc(); 2731 if (inTokenStringConstant) 2732 error(locx, "token string requires valid D tokens, not `#if`"); 2733 else 2734 error(locx, "C preprocessor directive `#if` is not supported, use `version` or `static if`"); 2735 } 2736 return false; 2737 } 2738 2739 /********************************************* 2740 * Parse line/file preprocessor directive: 2741 * #line linnum [filespec] 2742 * Allow __LINE__ for linnum, and __FILE__ for filespec. 2743 * Accept linemarker format: 2744 * # linnum [filespec] {flags} 2745 * There can be zero or more flags, which are one of the digits 1..4, and 2746 * must be in ascending order. The flags are ignored. 2747 * Params: 2748 * tok = token we're on, which is linnum of linemarker 2749 * linemarker = true if line marker format and lexer is on linnum 2750 * References: 2751 * linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html 2752 */ 2753 final void poundLine(ref Token tok, bool linemarker) 2754 { 2755 auto linnum = this.scanloc.linnum; 2756 const(char)* filespec = null; 2757 bool flags; 2758 2759 if (!linemarker) 2760 scan(&tok); 2761 if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal) 2762 { 2763 const lin = cast(int)(tok.unsvalue); 2764 if (lin != tok.unsvalue) 2765 { 2766 error(tok.loc, "line number `%lld` out of range", cast(ulong)tok.unsvalue); 2767 skipToNextLine(); 2768 return; 2769 } 2770 else 2771 linnum = lin; 2772 } 2773 else if (tok.value == TOK.line) // #line __LINE__ 2774 { 2775 } 2776 else 2777 { 2778 error(tok.loc, "positive integer argument expected following `#line`"); 2779 if (tok.value != TOK.endOfLine) 2780 skipToNextLine(); 2781 return; 2782 } 2783 while (1) 2784 { 2785 scan(&tok); 2786 switch (tok.value) 2787 { 2788 case TOK.endOfFile: 2789 case TOK.endOfLine: 2790 if (!inTokenStringConstant) 2791 { 2792 this.scanloc.linnum = linnum; 2793 if (filespec) 2794 this.scanloc.filename = filespec; 2795 } 2796 return; 2797 case TOK.file: 2798 if (filespec || flags) 2799 goto Lerr; 2800 filespec = mem.xstrdup(scanloc.filename); 2801 continue; 2802 case TOK.string_: 2803 if (filespec || flags) 2804 goto Lerr; 2805 if (tok.ptr[0] != '"' || tok.postfix != 0) 2806 goto Lerr; 2807 filespec = tok.ustring; 2808 continue; 2809 case TOK.int32Literal: 2810 if (!filespec) 2811 goto Lerr; 2812 if (linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4) 2813 { 2814 flags = true; // linemarker flags seen 2815 continue; 2816 } 2817 goto Lerr; 2818 default: 2819 goto Lerr; 2820 } 2821 } 2822 Lerr: 2823 if (filespec is null) 2824 error(tok.loc, "invalid filename for `#line` directive"); 2825 else if (linemarker) 2826 error(tok.loc, "invalid flag for line marker directive"); 2827 else if (!Ccompile) 2828 error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars()); 2829 if (tok.value != TOK.endOfLine) 2830 skipToNextLine(); 2831 } 2832 2833 /*************************************** 2834 * Scan forward to start of next line. 2835 * Params: 2836 * defines = send characters to `defines` 2837 */ 2838 final void skipToNextLine(OutBuffer* defines = null) 2839 { 2840 while (1) 2841 { 2842 switch (*p) 2843 { 2844 case 0: 2845 case 0x1A: 2846 return; // do not advance p 2847 2848 case '\n': 2849 ++p; 2850 break; 2851 2852 case '\r': 2853 ++p; 2854 if (p[0] == '\n') 2855 ++p; 2856 break; 2857 2858 default: 2859 if (defines) 2860 defines.writeByte(*p); // don't care about Unicode line endings for C 2861 else if (*p & 0x80) 2862 { 2863 const u = decodeUTF(); 2864 if (u == PS || u == LS) 2865 { 2866 ++p; 2867 break; 2868 } 2869 } 2870 ++p; 2871 continue; 2872 } 2873 break; 2874 } 2875 endOfLine(); 2876 tokenizeNewlines = false; 2877 } 2878 2879 /******************************************** 2880 * Decode UTF character. 2881 * Issue error messages for invalid sequences. 2882 * Return decoded character, advance p to last character in UTF sequence. 2883 */ 2884 private uint decodeUTF() 2885 { 2886 string msg; 2887 auto result = decodeUTFpure(msg); 2888 2889 if (msg) 2890 error(token.loc, "%.*s", cast(int)msg.length, msg.ptr); 2891 return result; 2892 } 2893 2894 /******************************************** 2895 * Same as above, but the potential error message is stored to the 2896 * msg parameter instead of being issued. 2897 */ 2898 private pure uint decodeUTFpure(out string msg) 2899 { 2900 const s = p; 2901 assert(*s & 0x80); 2902 // Check length of remaining string up to 4 UTF-8 characters 2903 size_t len; 2904 for (len = 1; len < 4 && s[len]; len++) 2905 { 2906 } 2907 size_t idx = 0; 2908 dchar u; 2909 msg = utf_decodeChar(s[0 .. len], idx, u); 2910 p += idx - 1; 2911 if (!msg && isBidiControl(u)) 2912 msg = "Bidirectional control characters are disallowed for security reasons."; 2913 return u; 2914 } 2915 2916 /*************************************************** 2917 * Parse doc comment embedded between t.ptr and p. 2918 * Remove trailing blanks and tabs from lines. 2919 * Replace all newlines with \n. 2920 * Remove leading comment character from each line. 2921 * Decide if it's a lineComment or a blockComment. 2922 * Append to previous one for this token. 2923 * 2924 * If newParagraph is true, an extra newline will be 2925 * added between adjoining doc comments. 2926 */ 2927 private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure 2928 { 2929 /* ct tells us which kind of comment it is: '/', '*', or '+' 2930 */ 2931 const ct = t.ptr[2]; 2932 /* Start of comment text skips over / * *, / + +, or / / / 2933 */ 2934 const(char)* q = t.ptr + 3; // start of comment text 2935 const(char)* qend = p; 2936 if (ct == '*' || ct == '+') 2937 qend -= 2; 2938 /* Scan over initial row of ****'s or ++++'s or ////'s 2939 */ 2940 for (; q < qend; q++) 2941 { 2942 if (*q != ct) 2943 break; 2944 } 2945 /* Remove leading spaces until start of the comment 2946 */ 2947 int linestart = 0; 2948 if (ct == '/') 2949 { 2950 while (q < qend && (*q == ' ' || *q == '\t')) 2951 ++q; 2952 } 2953 else if (q < qend) 2954 { 2955 if (*q == '\r') 2956 { 2957 ++q; 2958 if (q < qend && *q == '\n') 2959 ++q; 2960 linestart = 1; 2961 } 2962 else if (*q == '\n') 2963 { 2964 ++q; 2965 linestart = 1; 2966 } 2967 } 2968 /* Remove trailing row of ****'s or ++++'s 2969 */ 2970 if (ct != '/') 2971 { 2972 for (; q < qend; qend--) 2973 { 2974 if (qend[-1] != ct) 2975 break; 2976 } 2977 } 2978 /* Comment is now [q .. qend]. 2979 * Canonicalize it into buf[]. 2980 */ 2981 OutBuffer buf; 2982 2983 void trimTrailingWhitespace() 2984 { 2985 const s = buf[]; 2986 auto len = s.length; 2987 while (len && (s[len - 1] == ' ' || s[len - 1] == '\t')) 2988 --len; 2989 buf.setsize(len); 2990 } 2991 2992 for (; q < qend; q++) 2993 { 2994 char c = *q; 2995 switch (c) 2996 { 2997 case '*': 2998 case '+': 2999 if (linestart && c == ct) 3000 { 3001 linestart = 0; 3002 /* Trim preceding whitespace up to preceding \n 3003 */ 3004 trimTrailingWhitespace(); 3005 continue; 3006 } 3007 break; 3008 case ' ': 3009 case '\t': 3010 break; 3011 case '\r': 3012 if (q[1] == '\n') 3013 continue; // skip the \r 3014 goto Lnewline; 3015 default: 3016 if (c == 226) 3017 { 3018 // If LS or PS 3019 if (q[1] == 128 && (q[2] == 168 || q[2] == 169)) 3020 { 3021 q += 2; 3022 goto Lnewline; 3023 } 3024 } 3025 linestart = 0; 3026 break; 3027 Lnewline: 3028 c = '\n'; // replace all newlines with \n 3029 goto case; 3030 case '\n': 3031 linestart = 1; 3032 /* Trim trailing whitespace 3033 */ 3034 trimTrailingWhitespace(); 3035 break; 3036 } 3037 buf.writeByte(c); 3038 } 3039 /* Trim trailing whitespace (if the last line does not have newline) 3040 */ 3041 trimTrailingWhitespace(); 3042 3043 // Always end with a newline 3044 const s = buf[]; 3045 if (s.length == 0 || s[$ - 1] != '\n') 3046 buf.writeByte('\n'); 3047 3048 // It's a line comment if the start of the doc comment comes 3049 // after other non-whitespace on the same line. 3050 auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment; 3051 // Combine with previous doc comment, if any 3052 if (*dc) 3053 { 3054 auto p = combineComments(*dc, buf[], newParagraph); 3055 *dc = p ? p[0 .. strlen(p)] : null; 3056 } 3057 else 3058 *dc = buf.extractSlice(true); 3059 } 3060 3061 /******************************************** 3062 * Combine two document comments into one, 3063 * separated by an extra newline if newParagraph is true. 3064 */ 3065 static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure 3066 { 3067 //debug printf("Lexer::combineComments('%*.s', '%*.s', '%i')\n", cast(int) c1.length, c1.ptr, cast(int) c2.length, c2.ptr, newParagraph); 3068 const(int) newParagraphSize = newParagraph ? 1 : 0; // Size of the combining '\n' 3069 if (!c1) 3070 return c2.ptr; 3071 if (!c2) 3072 return c1.ptr; 3073 3074 int insertNewLine = 0; 3075 if (c1.length && c1[$ - 1] != '\n') 3076 insertNewLine = 1; 3077 const retSize = c1.length + insertNewLine + newParagraphSize + c2.length; 3078 auto p = cast(char*)mem.xmalloc_noscan(retSize + 1); 3079 p[0 .. c1.length] = c1[]; 3080 if (insertNewLine) 3081 p[c1.length] = '\n'; 3082 if (newParagraph) 3083 p[c1.length + insertNewLine] = '\n'; 3084 p[retSize - c2.length .. retSize] = c2[]; 3085 p[retSize] = 0; 3086 return p; 3087 } 3088 3089 /************************** 3090 * `p` should be at start of next line 3091 */ 3092 private void endOfLine() pure @nogc @safe 3093 { 3094 scanloc.linnum++; 3095 line = p; 3096 } 3097 } 3098 3099 3100 /******************************* Private *****************************************/ 3101 3102 private: 3103 3104 private enum LS = 0x2028; // UTF line separator 3105 private enum PS = 0x2029; // UTF paragraph separator 3106 3107 /******************************************** 3108 * Do our own char maps 3109 */ 3110 private static immutable cmtable = () 3111 { 3112 ubyte[256] table; 3113 foreach (const c; 0 .. table.length) 3114 { 3115 if ('0' <= c && c <= '7') 3116 table[c] |= CMoctal; 3117 if (c_isxdigit(c)) 3118 table[c] |= CMhex; 3119 if (c_isalnum(c) || c == '_') 3120 table[c] |= CMidchar; 3121 3122 switch (c) 3123 { 3124 case 'x': case 'X': 3125 case 'b': case 'B': 3126 table[c] |= CMzerosecond; 3127 break; 3128 3129 case '0': .. case '9': 3130 case 'e': case 'E': 3131 case 'f': case 'F': 3132 case 'l': case 'L': 3133 case 'p': case 'P': 3134 case 'u': case 'U': 3135 case 'i': 3136 case '.': 3137 case '_': 3138 table[c] |= CMzerosecond | CMdigitsecond; 3139 break; 3140 3141 default: 3142 break; 3143 } 3144 3145 switch (c) 3146 { 3147 case '\\': 3148 case '\n': 3149 case '\r': 3150 case 0: 3151 case 0x1A: 3152 case '\'': 3153 break; 3154 default: 3155 if (!(c & 0x80)) 3156 table[c] |= CMsinglechar; 3157 break; 3158 } 3159 } 3160 return table; 3161 }(); 3162 3163 private 3164 { 3165 enum CMoctal = 0x1; 3166 enum CMhex = 0x2; 3167 enum CMidchar = 0x4; 3168 enum CMzerosecond = 0x8; 3169 enum CMdigitsecond = 0x10; 3170 enum CMsinglechar = 0x20; 3171 } 3172 3173 private bool isoctal(const char c) pure @nogc @safe 3174 { 3175 return (cmtable[c] & CMoctal) != 0; 3176 } 3177 3178 private bool ishex(const char c) pure @nogc @safe 3179 { 3180 return (cmtable[c] & CMhex) != 0; 3181 } 3182 3183 private bool isidchar(const char c) pure @nogc @safe 3184 { 3185 return (cmtable[c] & CMidchar) != 0; 3186 } 3187 3188 private bool isZeroSecond(const char c) pure @nogc @safe 3189 { 3190 return (cmtable[c] & CMzerosecond) != 0; 3191 } 3192 3193 private bool isDigitSecond(const char c) pure @nogc @safe 3194 { 3195 return (cmtable[c] & CMdigitsecond) != 0; 3196 } 3197 3198 private bool issinglechar(const char c) pure @nogc @safe 3199 { 3200 return (cmtable[c] & CMsinglechar) != 0; 3201 } 3202 3203 private bool c_isxdigit(const int c) pure @nogc @safe 3204 { 3205 return (( c >= '0' && c <= '9') || 3206 ( c >= 'a' && c <= 'f') || 3207 ( c >= 'A' && c <= 'F')); 3208 } 3209 3210 private bool c_isalnum(const int c) pure @nogc @safe 3211 { 3212 return (( c >= '0' && c <= '9') || 3213 ( c >= 'a' && c <= 'z') || 3214 ( c >= 'A' && c <= 'Z')); 3215 } 3216 3217 /******************************* Unittest *****************************************/ 3218 3219 unittest 3220 { 3221 fprintf(stderr, "Lexer.unittest %d\n", __LINE__); 3222 3223 ErrorSink errorSink = new ErrorSinkStderr; 3224 3225 void test(T)(string sequence, T expected, bool Ccompile = false) 3226 { 3227 auto p = cast(const(char)*)sequence.ptr; 3228 dchar c2; 3229 Lexer lexer = new Lexer(errorSink); 3230 assert(expected == lexer.escapeSequence(Loc.initial, p, Ccompile, c2)); 3231 assert(p == sequence.ptr + sequence.length); 3232 } 3233 3234 test(`'`, '\''); 3235 test(`"`, '"'); 3236 test(`?`, '?'); 3237 test(`\`, '\\'); 3238 test(`0`, '\0'); 3239 test(`a`, '\a'); 3240 test(`b`, '\b'); 3241 test(`f`, '\f'); 3242 test(`n`, '\n'); 3243 test(`r`, '\r'); 3244 test(`t`, '\t'); 3245 test(`v`, '\v'); 3246 3247 test(`x00`, 0x00); 3248 test(`xff`, 0xff); 3249 test(`xFF`, 0xff); 3250 test(`xa7`, 0xa7); 3251 test(`x3c`, 0x3c); 3252 test(`xe2`, 0xe2); 3253 3254 test(`1`, '\1'); 3255 test(`42`, '\42'); 3256 test(`357`, '\357'); 3257 3258 test(`u1234`, '\u1234'); 3259 test(`uf0e4`, '\uf0e4'); 3260 3261 test(`U0001f603`, '\U0001f603'); 3262 3263 test(`"`, '"'); 3264 test(`<`, '<'); 3265 test(`>`, '>'); 3266 } 3267 3268 unittest 3269 { 3270 fprintf(stderr, "Lexer.unittest %d\n", __LINE__); 3271 3272 static class ErrorSinkTest : ErrorSinkNull 3273 { 3274 nothrow: 3275 extern (C++): 3276 override: 3277 3278 import core.stdc.stdio; 3279 import core.stdc.stdarg; 3280 3281 string expected; 3282 bool gotError; 3283 3284 void error(const ref Loc loc, const(char)* format, ...) 3285 { 3286 gotError = true; 3287 char[100] buffer = void; 3288 va_list ap; 3289 va_start(ap, format); 3290 auto actual = buffer[0 .. vsnprintf(buffer.ptr, buffer.length, format, ap)]; 3291 va_end(ap); 3292 assert(expected == actual); 3293 } 3294 } 3295 3296 ErrorSinkTest errorSink = new ErrorSinkTest; 3297 3298 void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false) 3299 { 3300 errorSink.expected = expectedError; 3301 errorSink.gotError = false; 3302 auto p = cast(const(char)*)sequence.ptr; 3303 Lexer lexer = new Lexer(errorSink); 3304 dchar c2; 3305 auto actualReturnValue = lexer.escapeSequence(Loc.initial, p, Ccompile, c2); 3306 assert(errorSink.gotError); 3307 assert(expectedReturnValue == actualReturnValue); 3308 3309 auto actualScanLength = p - sequence.ptr; 3310 assert(expectedScanLength == actualScanLength); 3311 } 3312 3313 test("c", `undefined escape sequence \c`, 'c', 1); 3314 test("!", `undefined escape sequence \!`, '!', 1); 3315 test(""", `undefined escape sequence \&`, '&', 1, true); 3316 3317 test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2); 3318 3319 test("u1" , `escape hex sequence has 1 hex digits instead of 4`, 0x1, 2); 3320 test("u12" , `escape hex sequence has 2 hex digits instead of 4`, 0x12, 3); 3321 test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4); 3322 3323 test("U0" , `escape hex sequence has 1 hex digits instead of 8`, 0x0, 2); 3324 test("U00" , `escape hex sequence has 2 hex digits instead of 8`, 0x00, 3); 3325 test("U000" , `escape hex sequence has 3 hex digits instead of 8`, 0x000, 4); 3326 test("U0000" , `escape hex sequence has 4 hex digits instead of 8`, 0x0000, 5); 3327 test("U0001f" , `escape hex sequence has 5 hex digits instead of 8`, 0x0001f, 6); 3328 test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`, 0x0001f6, 7); 3329 test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8); 3330 3331 test("ud800" , `invalid UTF character \U0000d800`, '?', 5); 3332 test("udfff" , `invalid UTF character \U0000dfff`, '?', 5); 3333 test("U00110000", `invalid UTF character \U00110000`, '?', 9); 3334 3335 test("xg0" , `undefined escape hex sequence \xg`, 'g', 2); 3336 test("ug000" , `undefined escape hex sequence \ug`, 'g', 2); 3337 test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2); 3338 3339 test("&BAD;", `unnamed character entity &BAD;` , '?', 5); 3340 test(""", `unterminated named entity "`, '?', 5); 3341 test(""", `unterminated named entity "`, '?', 5); 3342 3343 test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3); 3344 } 3345 3346 unittest 3347 { 3348 fprintf(stderr, "Lexer.unittest %d\n", __LINE__); 3349 /* Not much here, just trying things out. 3350 */ 3351 string text = "int"; // We rely on the implicit null-terminator 3352 ErrorSink errorSink = new ErrorSinkStderr; 3353 scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, false, false, errorSink, null); 3354 TOK tok; 3355 tok = lex1.nextToken(); 3356 //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.int32); 3357 assert(tok == TOK.int32); 3358 tok = lex1.nextToken(); 3359 assert(tok == TOK.endOfFile); 3360 tok = lex1.nextToken(); 3361 assert(tok == TOK.endOfFile); 3362 tok = lex1.nextToken(); 3363 assert(tok == TOK.endOfFile); 3364 } 3365 3366 unittest 3367 { 3368 fprintf(stderr, "Lexer.unittest %d\n", __LINE__); 3369 3370 // We don't want to see Lexer error output during these tests. 3371 ErrorSink errorSink = new ErrorSinkNull; 3372 3373 // Test malformed input: even malformed input should end in a TOK.endOfFile. 3374 static immutable char[][] testcases = 3375 [ // Testcase must end with 0 or 0x1A. 3376 [0], // not malformed, but pathological 3377 ['\'', 0], 3378 ['\'', 0x1A], 3379 ['{', '{', 'q', '{', 0], 3380 [0xFF, 0], 3381 [0xFF, 0x80, 0], 3382 [0xFF, 0xFF, 0], 3383 [0xFF, 0xFF, 0], 3384 ['x', '"', 0x1A], 3385 ]; 3386 3387 foreach (testcase; testcases) 3388 { 3389 scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, false, false, errorSink, null); 3390 TOK tok = lex2.nextToken(); 3391 size_t iterations = 1; 3392 while ((tok != TOK.endOfFile) && (iterations++ < testcase.length)) 3393 { 3394 tok = lex2.nextToken(); 3395 } 3396 assert(tok == TOK.endOfFile); 3397 tok = lex2.nextToken(); 3398 assert(tok == TOK.endOfFile); 3399 } 3400 }