// This file is part of Visual D
//
// Visual D integrates the D programming language into Visual Studio
// Copyright (c) 2010-2011 by Rainer Schuetze, All Rights Reserved
//
// Distributed under the Boost Software License, Version 1.0.
// See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt

module vdc.lexer;

import std.ascii;
import std.uni : isAlpha;
import std.utf;
import std.conv;

enum supportUnorderedCompareOps = false;

// current limitations:
// - nested comments must not nest more than 255 times
// - braces must not nest more than 4095 times inside token string
// - number of different delimiters must not exceed 256

enum TokenCat : int
{
    // assumed to match beginning of visuald.colorizer.TokenColor
    Text,
    Keyword,
    Comment,
    Identifier,
    String,
    Literal,
    Text2,
    Operator,
}

/// Span of one token within a scanned line, plus its category and id.
struct TokenInfo
{
    TokenCat type;
    int tokid;
    int StartIndex;
    int EndIndex;
}

///////////////////////////////////////////////////////////////////////////////

/// Incremental D lexer: scans one token at a time and encodes the full
/// scanner state (string/comment nesting) in a single int so scanning can
/// be resumed at any line boundary (as required by the VS colorizer).
struct Lexer
{
    enum State
    {
        kWhite,
        kBlockComment,
        kNestedComment,
        kStringCStyle,
        kStringWysiwyg,
        kStringAltWysiwyg,
        kStringDelimited,
        kStringDelimitedNestedBracket,
        kStringDelimitedNestedParen,
        kStringDelimitedNestedBrace,
        kStringDelimitedNestedAngle,
        kStringTokenFirst,  // after 'q', but before '{' to pass '{' as single operator
        kStringToken,       // encoded by tokenStringLevel > 0
        kStringHex,         // for now, treated as State.kStringWysiwyg
        kStringEscape,      // removed in D2.026, not supported
    }

    // lexer scan state is: ___TTNNS
    // TT: token string nesting level
    // NN: comment nesting level/string delimiter id
    // S:  State
    static State scanState(int state) { return cast(State) (state & 0xf); }
    static int nestingLevel(int state) { return (state >> 4) & 0xff; } // used for state kNestedComment and kStringDelimited
    static int tokenStringLevel(int state) { return (state >> 12) & 0xff; }
    static int getOtherState(int state) { return (state & 0xfff00000); }

    bool mTokenizeTokenString = true;
    bool mSplitNestedComments = true;
    bool mAllowDollarInIdentifiers = false;

    /// Pack the scan state components back into a single int.
    static int toState(State s, int nesting, int tokLevel, int otherState)
    {
        static assert(State.kStringToken <= 15);
        assert(s >= State.kWhite && s <= State.kStringToken);
        assert(nesting < 32);
        assert(tokLevel < 32);

        return s | ((nesting & 0xff) << 4) | ((tokLevel & 0xff) << 12) | otherState;
    }

    static bool isStringState(State state) { return state >= State.kStringCStyle; }
    static bool isCommentState(State state) { return state == State.kBlockComment || state == State.kNestedComment; }

    // Ring buffer of recently seen q"ident" delimiters; the delimiter id is
    // stored in the nesting-level bits of the scan state.
    static string[256] s_delimiters;
    static int s_nextDelimiter;

    /// Return the id of a delimiter string, registering it if not seen recently.
    static int getDelimiterIndex(string delim)
    {
        int idx = (s_nextDelimiter - 1) & 0xff;
        for( ; idx != s_nextDelimiter; idx = (idx - 1) & 0xff)
            if(delim == s_delimiters[idx])
                return idx;

        s_nextDelimiter = (s_nextDelimiter + 1) & 0xff;
        s_delimiters[idx] = delim;
        return idx;
    }

    int scanIdentifier(S)(S text, size_t startpos, ref size_t pos)
    {
        int pid;
        return scanIdentifier(text, startpos, pos, pid);
    }

    /// Scan an identifier starting at startpos; advances pos past it and
    /// classifies it as keyword, special token or plain identifier.
    int scanIdentifier(S)(S text, size_t startpos, ref size_t pos, ref int pid)
    {
        while(pos < text.length)
        {
            auto nextpos = pos;
            dchar ch = decode(text, nextpos);
            if(!isIdentifierCharOrDigit(ch))
                break;
            pos = nextpos;
        }
        string ident = toUTF8(text[startpos .. pos]);

        if(findKeyword(ident, pid))
            return pid == TOK_is ? TokenCat.Operator : TokenCat.Keyword;
        if(findSpecial(ident, pid))
            return TokenCat.String;

        pid = TOK_Identifier;
        return TokenCat.Identifier;
    }

    /// Scan an operator token via the generated operator parser.
    static int scanOperator(S)(S text, size_t startpos, ref size_t pos, ref int pid)
    {
        size_t len;
        int id = parseOperator(text, startpos, len);
        if(id == TOK_error)
            return TokenCat.Text;

        pid = id;
        pos = startpos + len;
        return TokenCat.Operator;
    }

    /// decode() that returns 0 instead of throwing at end of text.
    static dchar trydecode(S)(S text, ref size_t pos)
    {
        if(pos >= text.length)
            return 0;
        dchar ch = decode(text, pos);
        return ch;
    }

    /// Advance pos over digits of the given base; '_' separators are allowed.
    static void skipDigits(S)(S text, ref size_t pos, int base)
    {
        while(pos < text.length)
        {
            auto nextpos = pos;
            dchar ch = decode(text, nextpos);
            if(ch != '_')
            {
                if(base < 16 && (ch < '0' || ch >= '0' + base))
                    break;
                else if(base == 16 && !isHexDigit(ch))
                    break;
            }
            pos = nextpos;
        }
    }

    static int scanNumber(S)(S text, dchar ch, ref size_t pos)
    {
        int pid;
        return scanNumber(text, ch, pos, pid);
    }

    /// Scan a numeric literal whose first character ch has already been read.
    /// Sets pid to TOK_IntegerLiteral or TOK_FloatLiteral.
    static int scanNumber(S)(S text, dchar ch, ref size_t pos, ref int pid)
    {
        // pos after first digit
        int base = 10;
        size_t nextpos = pos;
        if(ch == '.')
            goto L_float;

        if(ch == '0')
        {
            size_t prevpos = pos;
            ch = trydecode(text, pos);
            ch = toLower(ch);
            if(ch == 'b')
                base = 2;
            else if (ch == 'x')
                base = 16;
            else
            {
                base = 8;
                pos = prevpos;
            }
        }

        // pos now after prefix or first digit
        skipDigits(text, pos, base);
        // pos now after last digit of integer part

        nextpos = pos;
        ch = trydecode(text, nextpos);

        if((base == 10 && toLower(ch) == 'e') || (base == 16 && toLower(ch) == 'p'))
            goto L_exponent;
        if(base >= 8 && ch == '.') // ".." is the slice token
        {
            { // mute errors about goto skipping declaration
                size_t trypos = nextpos;
                dchar trych = trydecode(text, trypos);
                if (trych == '.')
                    goto L_integer;
                //if (isAlpha(trych) || trych == '_' || (p[1] & 0x80))
                //    goto done;
            }
            // float
            if(base < 10)
                base = 10;
    L_float:
            pos = nextpos;
            skipDigits(text, pos, base);

            nextpos = pos;
            ch = trydecode(text, nextpos);
            if((base == 10 && toLower(ch) == 'e') || (base == 16 && toLower(ch) == 'p'))
            {
    L_exponent:
                // exponent
                pos = nextpos;
                ch = trydecode(text, nextpos);
                if(ch == '-' || ch == '+')
                    pos = nextpos;
                skipDigits(text, pos, 10);
            }

            // suffix
            nextpos = pos;
            ch = trydecode(text, nextpos);
            if(ch == 'L' || toUpper(ch) == 'F')
            {
    L_floatLiteral:
                pos = nextpos;
                ch = trydecode(text, nextpos);
            }
            if(ch == 'i')
    L_complexLiteral:
                pos = nextpos;
            pid = TOK_FloatLiteral;
        }
        else
        {
            // check integer suffix
            if(ch == 'i')
                goto L_complexLiteral;
            if(toUpper(ch) == 'F')
                goto L_floatLiteral;

            if(toUpper(ch) == 'U')
            {
                pos = nextpos;
                ch = trydecode(text, nextpos);
                if(ch == 'L')
                    pos = nextpos;
            }
            else if (ch == 'L')
            {
                pos = nextpos;
                ch = trydecode(text, nextpos);
                if(ch == 'i')
                    goto L_complexLiteral;
                if(toUpper(ch) == 'U')
                    pos = nextpos;
            }
    L_integer:
            pid = TOK_IntegerLiteral;
        }
        return TokenCat.Literal;
    }

    version(unspecified) unittest
    {
        int pid;
        size_t pos = 1;
        auto cat = scanNumber("0.0i", '0', pos, pid);
        assert(pid == TOK_FloatLiteral);
        pos = 1;
        cat = scanNumber("0.i", '0', pos, pid);
        assert(pid == TOK_IntegerLiteral);
    }

    /// Continue scanning a /* */ comment; returns kWhite when terminated.
    static State scanBlockComment(S)(S text, ref size_t pos)
    {
        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            while(ch == '*')
            {
                if (pos >= text.length)
                    return State.kBlockComment;
                ch = decode(text, pos);
                if(ch == '/')
                    return State.kWhite;
            }
        }
        return State.kBlockComment;
    }

    /// Continue scanning a /+ +/ comment, tracking the nesting level.
    State scanNestedComment(S)(S text, size_t startpos, ref size_t pos, ref int nesting)
    {
        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            while(ch == '/')
            {
                if (pos >= text.length)
                    return State.kNestedComment;
                ch = decode(text, pos);
                if(ch == '+')
                {
                    if(mSplitNestedComments && pos > startpos + 2)
                    {
                        pos -= 2;
                        return State.kNestedComment;
                    }
                    nesting++;
                    goto nextChar;
                }
            }
            while(ch == '+')
            {
                if (pos >= text.length)
                    return State.kNestedComment;
                ch = decode(text, pos);
                if(ch == '/')
                {
                    nesting--;
                    if(nesting == 0)
                        return State.kWhite;
                    if(mSplitNestedComments)
                        return State.kNestedComment;
                    break;
                }
            }
        nextChar:;
        }
        return State.kNestedComment;
    }

    /// Skip an optional c/w/d string postfix character.
    static State scanStringPostFix(S)(S text, ref size_t pos)
    {
        size_t nextpos = pos;
        dchar ch = trydecode(text, nextpos);
        if(ch == 'c' || ch == 'w' || ch == 'd')
            pos = nextpos;
        return State.kWhite;
    }

    static State scanStringWysiwyg(S)(S text, ref size_t pos)
    {
        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            if(ch == '"')
                return scanStringPostFix(text, pos);
        }
        return State.kStringWysiwyg;
    }

    static State scanStringAltWysiwyg(S)(S text, ref size_t pos)
    {
        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            if(ch == '`')
                return scanStringPostFix(text, pos);
        }
        return State.kStringAltWysiwyg;
    }

    /// Scan a "..." or '...' literal with backslash escapes until term.
    static State scanStringCStyle(S)(S text, ref size_t pos, dchar term)
    {
        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            if(ch == '\\')
            {
                if (pos >= text.length)
                    break;
                ch = decode(text, pos);
            }
            else if(ch == term)
                return scanStringPostFix(text, pos);
        }
        return State.kStringCStyle;
    }

    /// Called after q" — determine the delimiter kind and the resulting state.
    State startDelimiterString(S)(S text, ref size_t pos, ref int nesting)
    {
        import std.uni : isWhite;
        nesting = 1;

        auto startpos = pos;
        dchar ch = trydecode(text, pos);
        State s = State.kStringDelimited;
        if(ch == '[')
            s = State.kStringDelimitedNestedBracket;
        else if(ch == '(')
            s = State.kStringDelimitedNestedParen;
        else if(ch == '{')
            s = State.kStringDelimitedNestedBrace;
        else if(ch == '<')
            s = State.kStringDelimitedNestedAngle;
        else if(ch == 0 || isWhite(ch)) // bad delimiter, fallback to wysiwyg string
            s = State.kStringWysiwyg;
        else
        {
            if(isIdentifierChar(ch))
                scanIdentifier(text, startpos, pos);
            string delim = toUTF8(text[startpos .. pos]);
            nesting = getDelimiterIndex(delim);
        }
        return s;
    }

    /// Scan the inside of a q{...} token string by recursively tokenizing it.
    State scanTokenString(S)(S text, ref size_t pos, ref int tokLevel)
    {
        int state = toState(State.kWhite, 0, 0, 0);
        int id = -1;
        while(pos < text.length && tokLevel > 0)
        {
            int type = scan(state, text, pos, id);
            if(id == TOK_lcurly)
                tokLevel++;
            else if(id == TOK_rcurly)
                tokLevel--;
        }
        return (tokLevel > 0 ? State.kStringToken : State.kWhite);
    }

    /// True if idx points at the start of a block/nested comment opener;
    /// idx is moved back to the '/' if it pointed at the '*' or '+'.
    static bool isStartingComment(S)(S txt, ref size_t idx)
    {
        // use idx + 1 < txt.length instead of idx < txt.length - 1:
        // txt.length is unsigned, so length - 1 underflows for empty text;
        // the original idx >= 0 test on a size_t was always true
        if(idx + 1 < txt.length && txt[idx] == '/' && (txt[idx+1] == '*' || txt[idx+1] == '+'))
            return true;
        // guard txt[idx] — the original indexed without a bounds check
        if(idx < txt.length && (txt[idx] == '*' || txt[idx] == '+') && idx > 0 && txt[idx-1] == '/')
        {
            idx--;
            return true;
        }
        return false;
    }

    /// True if pos points at the end of a block/nested comment closer;
    /// pos is moved back to the '*' or '+' if it pointed at the '/'.
    static bool isEndingComment(S)(S txt, ref size_t pos)
    {
        if(pos < txt.length && pos > 0 && txt[pos] == '/' && (txt[pos-1] == '*' || txt[pos-1] == '+'))
        {
            pos--;
            return true;
        }
        // pos + 1 < txt.length avoids the unsigned underflow of length - 1
        if(pos + 1 < txt.length && (txt[pos] == '*' || txt[pos] == '+') && txt[pos+1] == '/')
            return true;
        return false;
    }

    bool isIdentifierChar(dchar ch)
    {
        if(mAllowDollarInIdentifiers && ch == '$')
            return true;
        return isAlpha(ch) || ch == '_' || ch == '@';
    }

    bool isIdentifierCharOrDigit(dchar ch)
    {
        return isIdentifierChar(ch) || isDigit(ch);
    }

    bool isIdentifier(S)(S text)
    {
        if(text.length == 0)
            return false;

        size_t pos;
        dchar ch = decode(text, pos);
        if(!isIdentifierChar(ch))
            return false;

        while(pos < text.length)
        {
            ch = decode(text, pos);
            if(!isIdentifierCharOrDigit(ch))
                return false;
        }
        return true;
    }

    static bool isInteger(S)(S text)
    {
        if(text.length == 0)
            return false;

        size_t pos;
        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            if(!isDigit(ch))
                return false;
        }
        return true;
    }

    static bool isBracketPair(dchar ch1, dchar ch2)
    {
        switch(ch1)
        {
            case '{': return ch2 == '}';
            case '}': return ch2 == '{';
            case '(': return ch2 == ')';
            case ')': return ch2 == '(';
            case '[': return ch2 == ']';
            case ']': return ch2 == '[';
            default:  return false;
        }
    }

    static bool isOpeningBracket(dchar ch)
    {
        return ch == '[' || ch == '(' || ch == '{';
    }

    static bool isClosingBracket(dchar ch)
    {
        return ch == ']' || ch == ')' || ch == '}';
    }

    static dchar openingBracket(State s)
    {
        switch(s)
        {
            case State.kStringDelimitedNestedBracket: return '[';
            case State.kStringDelimitedNestedParen:   return '(';
            case State.kStringDelimitedNestedBrace:   return '{';
            case State.kStringDelimitedNestedAngle:   return '<';
            default: break;
        }
        assert(0);
    }

    static dchar closingBracket(State s)
    {
        switch(s)
        {
            case State.kStringDelimitedNestedBracket: return ']';
            case State.kStringDelimitedNestedParen:   return ')';
            case State.kStringDelimitedNestedBrace:   return '}';
            case State.kStringDelimitedNestedAngle:   return '>';
            default: break;
        }
        assert(0);
    }

    static bool isCommentOrSpace(S)(int type, S text)
    {
        return (type == TokenCat.Comment || (type == TokenCat.Text && isWhite(text[0])));
    }

    /// Continue scanning a q"[...]" style string with nested brackets.
    static State scanNestedDelimiterString(S)(S text, ref size_t pos, State s, ref int nesting)
    {
        dchar open  = openingBracket(s);
        dchar close = closingBracket(s);

        while(pos < text.length)
        {
            dchar ch = decode(text, pos);
            if(ch == open)
                nesting++;
            else if(ch == close && nesting > 0)
                nesting--;
            else if(ch == '"' && nesting == 0)
                return scanStringPostFix(text, pos);
        }
        return s;
    }

    /// Continue scanning a q"ident ... ident" delimited string.
    State scanDelimitedString(S)(S text, ref size_t pos, ref int delim)
    {
        string delimiter = s_delimiters[delim];

        while(pos < text.length)
        {
            auto startpos = pos;
            dchar ch = decode(text, pos);
            if(isIdentifierChar(ch))
                scanIdentifier(text, startpos, pos);
            string ident = toUTF8(text[startpos .. pos]);
            if(ident == delimiter)
            {
                ch = trydecode(text, pos);
                if(ch == '"')
                {
                    delim = 0; // reset delimiter id, it shadows nesting
                    return scanStringPostFix(text, pos);
                }
            }
        }
        return State.kStringDelimited;
    }

    /// Scan one token starting at pos, resuming from and updating the packed
    /// scan state. Returns the TokenCat and sets id to the TOK_* token id.
    int scan(S)(ref int state, in S text, ref size_t pos, ref int id)
    {
        State s = scanState(state);
        int nesting = nestingLevel(state);
        int tokLevel = tokenStringLevel(state);
        int otherState = getOtherState(state);

        int type = TokenCat.Text;
        size_t startpos = pos;
        dchar ch;

        id = TOK_Space;

        switch(s)
        {
        case State.kWhite:
            ch = decode(text, pos);
            if(ch == 'r' || ch == 'x' || ch == 'q')
            {
                size_t prevpos = pos;
                dchar nch = trydecode(text, pos);
                if(nch == '"' && ch == 'q')
                {
                    s = startDelimiterString(text, pos, nesting);
                    if(s == State.kStringDelimited)
                        goto case State.kStringDelimited;
                    else if(s == State.kStringWysiwyg)
                        goto case State.kStringWysiwyg;
                    else
                        goto case State.kStringDelimitedNestedBracket;
                }
                else if(tokLevel == 0 && ch == 'q' && nch == '{')
                {
                    type = TokenCat.String;
                    id = TOK_StringLiteral;
                    if(mTokenizeTokenString)
                    {
                        pos = prevpos;
                        s = State.kStringTokenFirst;
                    }
                    else
                    {
                        tokLevel = 1;
                        s = scanTokenString(text, pos, tokLevel);
                    }
                    break;
                }
                else if(nch == '"')
                {
                    goto case State.kStringWysiwyg;
                }
                else
                {
                    pos = prevpos;
                    type = scanIdentifier(text, startpos, pos, id);
                }
            }
            else if(isIdentifierChar(ch))
                type = scanIdentifier(text, startpos, pos, id);
            else if(isDigit(ch))
                type = scanNumber(text, ch, pos, id);
            else if (ch == '.')
            {
                size_t nextpos = pos;
                ch = trydecode(text, nextpos);
                if(isDigit(ch))
                    type = scanNumber(text, '.', pos, id);
                else
                    type = scanOperator(text, startpos, pos, id);
            }
            else if (ch == '/')
            {
                size_t prevpos = pos;
                ch = trydecode(text, pos);
                if (ch == '/')
                {
                    // line comment
                    type = TokenCat.Comment;
                    id = TOK_Comment;
                    while(pos < text.length && decode(text, pos) != '\n') {}
                }
                else if (ch == '*')
                {
                    s = scanBlockComment(text, pos);
                    type = TokenCat.Comment;
                    id = TOK_Comment;
                }
                else if (ch == '+')
                {
                    nesting = 1;
                    s = scanNestedComment(text, startpos, pos, nesting);
                    type = TokenCat.Comment;
                    id = TOK_Comment;
                }
                else
                {
                    // step back to position after '/'
                    pos = prevpos;
                    type = scanOperator(text, startpos, pos, id);
                }
            }
            else if (ch == '"')
                goto case State.kStringCStyle;

            else if (ch == '`')
                goto case State.kStringAltWysiwyg;

            else if (ch == '\'')
            {
                s = scanStringCStyle(text, pos, '\'');
                id = TOK_CharacterLiteral;
                type = TokenCat.String;
            }
            else if (ch == '#')
            {
                // display #! or #line as line comment
                type = TokenCat.Comment;
                id = TOK_Comment;
                while(pos < text.length && decode(text, pos) != '\n') {}
            }
            else
            {
                if (tokLevel > 0)
                {
                    if(ch == '{')
                        tokLevel++;
                    else if (ch == '}')
                        tokLevel--;
                    if(!isWhite(ch))
                        type = scanOperator(text, startpos, pos, id);
                    id = TOK_StringLiteral;
                }
                else if(!isWhite(ch))
                    type = scanOperator(text, startpos, pos, id);
            }
            break;

        case State.kStringTokenFirst:
            ch = decode(text, pos);
            assert(ch == '{');

            tokLevel = 1;
            type = TokenCat.Operator;
            id = TOK_StringLiteral;
            s = State.kWhite;
            break;

        case State.kStringToken:
            type = TokenCat.String;
            id = TOK_StringLiteral;
            s = scanTokenString(text, pos, tokLevel);
            break;

        case State.kBlockComment:
            s = scanBlockComment(text, pos);
            type = TokenCat.Comment;
            id = TOK_Comment;
            break;

        case State.kNestedComment:
            s = scanNestedComment(text, pos, pos, nesting);
            type = TokenCat.Comment;
            id = TOK_Comment;
            break;

        case State.kStringCStyle:
            s = scanStringCStyle(text, pos, '"');
            type = TokenCat.String;
            id = TOK_StringLiteral;
            break;

        case State.kStringWysiwyg:
            s = scanStringWysiwyg(text, pos);
            type = TokenCat.String;
            id = TOK_StringLiteral;
            break;

        case State.kStringAltWysiwyg:
            s = scanStringAltWysiwyg(text, pos);
            type = TokenCat.String;
            id = TOK_StringLiteral;
            break;

        case State.kStringDelimited:
            s = scanDelimitedString(text, pos, nesting);
            type = TokenCat.String;
            id = TOK_StringLiteral;
            break;

        case State.kStringDelimitedNestedBracket:
        case State.kStringDelimitedNestedParen:
        case State.kStringDelimitedNestedBrace:
        case State.kStringDelimitedNestedAngle:
            s = scanNestedDelimiterString(text, pos, s, nesting);
            type = TokenCat.String;
            id = TOK_StringLiteral;
            break;

        default:
            break;
        }
        state = toState(s, nesting, tokLevel, otherState);

        if(tokLevel > 0)
            id = TOK_StringLiteral;
        return type;
    }

    int scan(S)(ref int state, in S text, ref size_t pos)
    {
        int id;
        return scan(state, text, pos, id);
    }

    ///////////////////////////////////////////////////////////////
    /// Tokenize a whole line, starting from scan state iState.
    TokenInfo[] ScanLine(S)(int iState, S text)
    {
        TokenInfo[] lineInfo;
        for(size_t pos = 0; pos < text.length; )
        {
            TokenInfo info;
            // explicit casts: pos is size_t, the indices are int (fix for
            // implicit narrowing that does not compile on 64-bit)
            info.StartIndex = cast(int) pos;
            info.type = cast(TokenCat) scan(iState, text, pos, info.tokid);
            info.EndIndex = cast(int) pos;
            lineInfo ~= info;
        }
        return lineInfo;
    }
}

///////////////////////////////////////////////////////////////

// converted int[string] to short[string] due to bug #2500
__gshared short[string] keywords_map; // maps to TOK enumerator
__gshared short[string] specials_map; // maps to TOK enumerator
alias AssociativeArray!(string, short) _wa1; // fully instantiate type info
alias AssociativeArray!(int, const(int)) _wa2; // fully instantiate type info
// Populate the keyword/special lookup maps once at startup.
shared static this()
{
    foreach(i, s; keywords)
        keywords_map[s] = cast(short) (TOK_begin_Keywords + i);

    foreach(i, s; specials)
        specials_map[s] = cast(short) i;
}

/// Look up ident among the D keywords; on success store its TOK_* id.
bool findKeyword(string ident, ref int id)
{
    if(__ctfe)
    {
        // slow, but compiles
        foreach(i, k; keywords)
            if(k == ident)
            {
                id = cast(int) (TOK_begin_Keywords + i);
                return true;
            }
    }
    else if(auto pident = ident in keywords_map)
    {
        id = *pident;
        return true;
    }
    return false;
}

bool isKeyword(string ident)
{
    int id;
    return findKeyword(ident, id);
}

/// Look up ident among the special tokens (__DATE__ etc.); they are
/// reported as string literals.
bool findSpecial(string ident, ref int id)
{
    if(__ctfe)
    {
        // slow, but compiles
        foreach(i, k; specials)
            if(k == ident)
            {
                id = TOK_StringLiteral;
                return true;
            }
    }
    else if(auto pident = ident in specials_map)
    {
        id = TOK_StringLiteral;
        return true;
    }
    return false;
}

const string[] keywords =
[
    "this",
    "super",
    "assert",
    "null",
    "true",
    "false",
    "cast",
    "new",
    "delete",
    "throw",
    "module",
    "pragma",
    "typeof",
    "typeid",
    "template",

    "void",
    "byte",
    "ubyte",
    "short",
    "ushort",
    "int",
    "uint",
    "long",
    "ulong",
    "cent",
    "ucent",
    "float",
    "double",
    "real",
    "bool",
    "char",
    "wchar",
    "dchar",
    "ifloat",
    "idouble",
    "ireal",

    "cfloat",
    "cdouble",
    "creal",

    "delegate",
    "function",

    "is",
    "if",
    "else",
    "while",
    "for",
    "do",
    "switch",
    "case",
    "default",
    "break",
    "continue",
    "synchronized",
    "return",
    "goto",
    "try",
    "catch",
    "finally",
    "with",
    "asm",
    "foreach",
    "foreach_reverse",
    "scope",

    "struct",
    "class",
    "interface",
    "union",
    "enum",
    "import",
    "mixin",
    "static",
    "final",
    "const",
    "typedef",
    "alias",
    "override",
    "abstract",
    "volatile",
    "debug",
    "deprecated",
    "in",
    "out",
    "inout",
    "lazy",
    "auto",

    "align",
    "extern",
    "private",
    "package",
    "protected",
    "public",
    "export",

    "body",
    "invariant",
    "unittest",
    "version",
    //{ "manifest", TOKmanifest },

    // Added after 1.0
    "ref",
    "macro",
    "pure",
    "nothrow",
    "__gshared",
    "__thread",
    "__traits",
    "__overloadset",
    "__parameters",
    "__argTypes",
    "__vector",

    "__FILE__",
    "__LINE__",
    "__FUNCTION__",
    "__PRETTY_FUNCTION__",
    "__MODULE__",

    "shared",
    "immutable",

    "@disable",
    "@property",
    "@nogc",
    "@safe",
    "@system",
    "@trusted",

];

// not listed as keywords, but "special tokens"
const string[] specials =
[
    "__DATE__",
    "__EOF__",
    "__TIME__",
    "__TIMESTAMP__",
    "__VENDOR__",
    "__VERSION__",
];

////////////////////////////////////////////////////////////////////////
// Generic (non-keyword, non-operator) token ids.
enum
{
    TOK_begin_Generic,
    TOK_Space = TOK_begin_Generic,
    TOK_Comment,
    TOK_Identifier,
    TOK_IntegerLiteral,
    TOK_FloatLiteral,
    TOK_StringLiteral,
    TOK_CharacterLiteral,
    TOK_EOF,
    TOK_RECOVER,
    TOK_end_Generic
}

/// CTFE helper: enumerator name for one keyword (strips a leading '@').
string genKeywordEnum(string kw)
{
    if(kw[0] == '@')
        kw = kw[1..$];
    return "TOK_" ~ kw;
}

/// CTFE helper: build the TOK_* enum declaration for all keywords.
string genKeywordsEnum(T)(const string[] kwords, T begin)
{
    string enums = "enum { TOK_begin_Keywords = " ~ to!string(begin) ~ ", ";
    bool first = true;
    foreach(kw; kwords)
    {
        enums ~= genKeywordEnum(kw);
        if(first)
        {
            first = false;
            enums ~= " = TOK_begin_Keywords";
        }
        enums ~= ",";
    }
    enums ~= "TOK_end_Keywords }";
    return enums;
}

mixin(genKeywordsEnum(keywords, "TOK_end_Generic"));

// [ enumerator name, operator text ] pairs; order defines token ids.
const string[2][] operators =
[
    [ "lcurly",   "{" ],
    [ "rcurly",   "}" ],
    [ "lparen",   "(" ],
    [ "rparen",   ")" ],
    [ "lbracket", "[" ],
    [ "rbracket", "]" ],
    [ "semicolon", ";" ],
    [ "colon",    ":" ],
    [ "comma",    "," ],
    [ "dot",      "." ],

    // binary operators
    [ "xor",      "^" ],
    [ "lt",       "<" ],
    [ "gt",       ">" ],
    [ "le",       "<=" ],
    [ "ge",       ">=" ],
    [ "equal",    "==" ],
    [ "notequal", "!=" ],
    [ "lambda",   "=>" ],

    [ "unord",    "!<>=" ],
    [ "ue",       "!<>" ],
    [ "lg",       "<>" ],
    [ "leg",      "<>=" ],
    [ "ule",      "!>" ],
    [ "ul",       "!>=" ],
    [ "uge",      "!<" ],
    [ "ug",       "!<=" ],
    [ "notcontains", "!in" ],
    [ "notidentity", "!is" ],

    [ "shl",      "<<" ],
    [ "shr",      ">>" ],
    [ "ushr",     ">>>" ],
    [ "add",      "+" ],
    [ "min",      "-" ],
    [ "mul",      "*" ],
    [ "div",      "/" ],
    [ "mod",      "%" ],
    [ "pow",      "^^" ],
    [ "and",      "&" ],
    [ "andand",   "&&" ],
    [ "or",       "|" ],
    [ "oror",     "||" ],
    [ "tilde",    "~" ],

    [ "assign",   "=" ],
    [ "xorass",   "^=" ],
    [ "addass",   "+=" ],
    [ "minass",   "-=" ],
    [ "mulass",   "*=" ],
    [ "divass",   "/=" ],
    [ "modass",   "%=" ],
    [ "powass",   "^^=" ],
    [ "shlass",   "<<=" ],
    [ "shrass",   ">>=" ],
    [ "ushrass",  ">>>=" ],
    [ "andass",   "&=" ],
    [ "orass",    "|=" ],
    [ "catass",   "~=" ],

    // end of binary operators

    [ "not",      "!" ],
    [ "dollar",   "$" ],
    [ "slice",    ".." ],
    [ "dotdotdot", "..." ],
    [ "plusplus", "++" ],
    [ "minusminus", "--" ],
    [ "question", "?" ],
/+
    [ "array",    "[]" ],
    // symbols with duplicate meaning
    [ "address",  "&" ],
    [ "star",     "*" ],
    [ "preplusplus", "++" ],
    [ "preminusminus", "--" ],
    [ "neg",      "-" ],
    [ "uadd",     "+" ],
    [ "cat",      "~" ],
    [ "identity", "is" ],
    [ "plus",     "++" ],
    [ "minus",    "--" ],
+/
];

/// CTFE helper: build the TOK_* enum declaration for all operators.
string genOperatorEnum(T)(const string[2][] ops, T begin)
{
    string enums = "enum { TOK_begin_Operators = " ~ to!string(begin) ~ ", ";
    bool first = true;
    for(int o = 0; o < ops.length; o++)
    {
        enums ~= "TOK_" ~ ops[o][0];
        if(first)
        {
            first = false;
            enums ~= " = TOK_begin_Operators";
        }
        enums ~= ",";
    }
    enums ~= "TOK_end_Operators }";
    return enums;
}

mixin(genOperatorEnum(operators, "TOK_end_Keywords"));

enum TOK_binaryOperatorFirst    = TOK_xor;
enum TOK_binaryOperatorLast     = TOK_catass;
enum TOK_assignOperatorFirst    = TOK_assign;
enum TOK_assignOperatorLast     = TOK_catass;
enum TOK_unorderedOperatorFirst = TOK_unord;
enum TOK_unorderedOperatorLast  = TOK_ug;

enum TOK_error = -1;

/// CTFE-friendly prefix comparison of the first `length` characters.
bool _stringEqual(string s1, string s2, int length)
{
    if(s1.length < length || s2.length < length)
        return false;
    for(int i = 0; i < length; i++)
        if(s1[i] != s2[i])
            return false;
    return true;
}

/// Indices into `operators`, sorted by operator text (insertion sort,
/// element-wise copy because array slicing is not available here in CTFE).
int[] sortedOperatorIndexArray()
{
    // create sorted list of operators
    int[] opIndex;
    for(int o = 0; o < operators.length; o++)
    {
        string op = operators[o][1];
        int p = 0;
        while(p < opIndex.length)
        {
            assert(op != operators[opIndex[p]][1], "duplicate operator " ~ op);
            if(op < operators[opIndex[p]][1])
                break;
            p++;
        }
        // array slicing does not work in CTFE?
        // opIndex ~= opIndex[0..p] ~ o ~ opIndex[p..$];
        int[] nIndex;
        for(int i = 0; i < p; i++)
            nIndex ~= opIndex[i];
        nIndex ~= o;
        for(int i = p; i < opIndex.length; i++)
            nIndex ~= opIndex[i];
        opIndex = nIndex;
    }
    return opIndex;
}

/// Operator texts in sorted order (debug/inspection helper).
string[] sortedOperatorArray()
{
    string[] array;
    foreach(o; sortedOperatorIndexArray())
        array ~= operators[o][1];
    return array;
}

/// CTFE code generator: emits a nested-switch longest-match parser over the
/// sorted operator table; `getch` is the expression used to read a character.
string genOperatorParser(string getch)
{
    int[] opIndex = sortedOperatorIndexArray();

    int matchlen = 0;
    string indent = "";
    string[] defaults = [ "error" ];
    string txt = indent ~ "dchar ch;\n";
    for(int o = 0; o < opIndex.length; o++)
    {
        string op = operators[opIndex[o]][1];
        string nextop;
        if(o + 1 < opIndex.length)
            nextop = operators[opIndex[o+1]][1];

        while(op.length > matchlen)
        {
            if(matchlen > 0)
                txt ~= indent ~ "case '" ~ op[matchlen-1] ~ "':\n";
            indent ~= "  ";
            txt ~= indent ~ "ch = " ~ getch ~ ";\n";
            txt ~= indent ~ "switch(ch)\n";
            txt ~= indent ~ "{\n";
            indent ~= "  ";
            int len = (matchlen > 0 ? matchlen - 1 : 0);
            while(len > 0 && defaults[len] == defaults[len+1])
                len--;
            txt ~= indent ~ "default: len = " ~ to!string(len) ~ "; return TOK_" ~ defaults[$-1] ~ ";\n";
            //txt ~= indent ~ "case '" ~ op[matchlen] ~ "':\n";
            defaults ~= defaults[$-1];
            matchlen++;
        }
        if(nextop.length > matchlen && nextop[0..matchlen] == op)
        {
            if(matchlen > 0)
                txt ~= indent ~ "case '" ~ op[matchlen-1] ~ "':\n";
            indent ~= "  ";
            txt ~= indent ~ "ch = " ~ getch ~ ";\n";
            txt ~= indent ~ "switch(ch)\n";
            txt ~= indent ~ "{\n";
            indent ~= "  ";
            txt ~= indent ~ "default: len = " ~ to!string(matchlen) ~ "; return TOK_" ~ operators[opIndex[o]][0] ~ "; // " ~ op ~ "\n";
            defaults ~= operators[opIndex[o]][0];
            matchlen++;
        }
        else
        {
            string case_txt = "case '" ~ op[matchlen-1] ~ "':";
            if(isAlphaNum(op[matchlen-1]))
                case_txt ~= " ch = getch(); if(isAlphaNum(ch) || ch == '_') goto default;\n" ~ indent ~ "  ";
            txt ~= indent ~ case_txt ~ " len = " ~ to!string(matchlen) ~ "; return TOK_" ~ operators[opIndex[o]][0] ~ "; // " ~ op ~ "\n";

            while(nextop.length < matchlen || (matchlen > 0 && !_stringEqual(op, nextop, matchlen-1)))
            {
                matchlen--;
                indent = indent[0..$-2];
                txt ~= indent ~ "}\n";
                indent = indent[0..$-2];
                defaults = defaults[0..$-1];
            }
        }
    }
    return txt;
}

/// Parse the longest operator at txt[pos..]; returns its TOK_* id (or
/// TOK_error) and stores the matched length in len.
int parseOperator(S)(S txt, size_t pos, ref size_t len)
{
    dchar getch()
    {
        if(pos >= txt.length)
            return 0;
        return decode(txt, pos);
    }

    mixin(genOperatorParser("getch()"));
}

////////////////////////////////////////////////////////////////////////
version(none)
{
    pragma(msg, genKeywordsEnum(keywords, "TOK_end_Generic"));
    pragma(msg, genOperatorEnum(operators, "TOK_end_Keywords"));
    pragma(msg, sortedOperatorArray());
    pragma(msg, genOperatorParser("getch()"));
}
/// Representative display string for a token id (keyword/operator text,
/// or a placeholder name for generic tokens).
string tokenString(int id)
{
    switch(id)
    {
        case TOK_Space:            return " ";
        case TOK_Comment:          return "/**/";
        case TOK_Identifier:       return "Identifier";
        case TOK_IntegerLiteral:   return "IntegerLiteral";
        case TOK_FloatLiteral:     return "FloatLiteral";
        case TOK_StringLiteral:    return "StringLiteral"; // fixed typo: was "StringtLiteral"
        case TOK_CharacterLiteral: return "CharacterLiteral";
        case TOK_EOF:              return "__EOF__";
        case TOK_RECOVER:          return "__RECOVER__";
        case TOK_begin_Keywords: .. case TOK_end_Keywords - 1:
            return keywords[id - TOK_begin_Keywords];
        case TOK_begin_Operators: .. case TOK_end_Operators - 1:
            return operators[id - TOK_begin_Operators][1];
        default:
            assert(false);
    }
}

/// Enumerator name of an operator token id (e.g. TOK_add -> "add").
string operatorName(int id)
{
    switch(id)
    {
        case TOK_begin_Operators: .. case TOK_end_Operators - 1:
            return operators[id - TOK_begin_Operators][0];
        default:
            assert(false);
    }
}

// Case-label list for all basic type keywords, for mixin into switches.
enum case_TOKs_BasicTypeX = q{
    case TOK_bool:
    case TOK_byte:
    case TOK_ubyte:
    case TOK_short:
    case TOK_ushort:
    case TOK_int:
    case TOK_uint:
    case TOK_long:
    case TOK_ulong:
    case TOK_char:
    case TOK_wchar:
    case TOK_dchar:
    case TOK_float:
    case TOK_double:
    case TOK_real:
    case TOK_ifloat:
    case TOK_idouble:
    case TOK_ireal:
    case TOK_cfloat:
    case TOK_cdouble:
    case TOK_creal:
    case TOK_void:
};

// Case-label list for tokens valid as a template single argument.
enum case_TOKs_TemplateSingleArgument = q{
    case TOK_Identifier:
    case TOK_CharacterLiteral:
    case TOK_StringLiteral:
    case TOK_IntegerLiteral:
    case TOK_FloatLiteral:
    case TOK_true:
    case TOK_false:
    case TOK_null:
    case TOK___FILE__:
    case TOK___LINE__:
}; // + case_TOKs_BasicTypeX;