1 /**
2  * Implements the lexical analyzer, which converts source code into lexical tokens.
3  *
4  * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
5  *
6  * Copyright:   Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved
7  * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
8  * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
10  * Documentation:  https://dlang.org/phobos/dmd_lexer.html
11  * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
12  */
13 
14 module dmd.lexer;
15 
16 import core.stdc.ctype;
17 import core.stdc.stdio;
18 import core.stdc.string;
19 
20 import dmd.entity;
21 import dmd.errorsink;
22 import dmd.id;
23 import dmd.identifier;
24 import dmd.location;
25 import dmd.root.array;
26 import dmd.root.ctfloat;
27 import dmd.common.outbuffer;
28 import dmd.root.port;
29 import dmd.root.rmem;
30 import dmd.root.utf;
31 import dmd.tokens;
32 
33 nothrow:
34 
// Builds that define the `DMDLIB` version identifier also get `LocOffset` defined.
// NOTE(review): the code conditioned on `LocOffset` is not in this part of the
// file — confirm its effect where it is used.
version (DMDLIB)
{
    version = LocOffset;
}
39 
/***********************************************************
 * Values the lexer substitutes for the magic identifiers
 * (`__VERSION__`, `__DATE__`, ...), plus a few compiler
 * switches carried alongside them.
 */
struct CompileEnv
{
    /// value substituted for `__VERSION__`
    uint versionNumber = 0;

    /// text substituted for `__DATE__`
    const(char)[] date = null;

    /// text substituted for `__TIME__`
    const(char)[] time = null;

    /// text substituted for `__VENDOR__`
    const(char)[] vendor = null;

    /// text substituted for `__TIMESTAMP__`
    const(char)[] timestamp = null;

    /// `in` means `[ref] scope const`, accepts rvalues
    bool previewIn = false;

    /// collect embedded documentation comments
    bool ddocOutput = false;

    /// use MASM inline asm syntax
    bool masm = false;
}
55 
56 /***********************************************************
57  */
58 class Lexer
59 {
    // Buffer shared by all Lexer instances (`__gshared`).
    // NOTE(review): its users are outside this part of the file — presumably
    // scratch space while scanning string literals; confirm.
    private __gshared OutBuffer stringbuffer;

    Loc scanloc;            // current scan location, used for error messages
    Loc prevloc;            // location of token before current (set by nextToken())

    const(char)* p;         // current character (read cursor into the source buffer)

    Token token;            // the current token; look-ahead tokens hang off token.next

    // For ImportC
    bool Ccompile;              /// true if compiling ImportC

    // The following are valid only if (Ccompile == true)
    ubyte boolsize;             /// size of a C _Bool, default 1
    ubyte shortsize;            /// size of a C short, default 2
    ubyte intsize;              /// size of a C int, default 4
    ubyte longsize;             /// size of C long, 4 or 8
    ubyte long_longsize;        /// size of a C long long, default 8
    ubyte long_doublesize;      /// size of C long double, 8 or D real.sizeof
    ubyte wchar_tsize;          /// size of C wchar_t, 2 or 4

    ErrorSink eSink;            /// send error messages through this interface
    CompileEnv compileEnv;      /// environment: values for __VERSION__, __DATE__, etc.

    private
    {
        const(char)* base;      // pointer to start of buffer
        const(char)* end;       // pointer to last element of buffer (the 0 / 0x1A terminator)
        const(char)* line;      // start of current line

        bool doDocComment;      // collect doc comment information
        bool anyToken;          // seen at least one token
        bool commentToken;      // comments are TOK.comment's
        bool tokenizeNewlines;  // newlines are turned into TOK.endOfLine's;
                                // scan() clears it again after emitting one

        bool whitespaceToken;   // tokenize whitespaces (only for DMDLIB)

        int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
        int lastDocLine;        // last line of previous doc comment

        Token* tokenFreelist;   // recycled tokens, linked through Token.next
                                // (see allocateToken / releaseToken)
    }
102 
103   nothrow:
104 
105     /*********************
106      * Creates a Lexer for the source code base[begoffset..endoffset+1].
107      * The last character, base[endoffset], must be null (0) or EOF (0x1A).
108      *
109      * Params:
110      *  filename = used for error messages
111      *  base = source code, must be terminated by a null (0) or EOF (0x1A) character
112      *  begoffset = starting offset into base[]
113      *  endoffset = the last offset to read into base[]
114      *  doDocComment = handle documentation comments
115      *  commentToken = comments become TOK.comment's
116      *  errorSink = where error messages go, must not be null
117      *  compileEnv = version, vendor, date, time, etc.
118      */
119     this(const(char)* filename, const(char)* base, size_t begoffset,
120         size_t endoffset, bool doDocComment, bool commentToken,
121         ErrorSink errorSink,
122         const CompileEnv* compileEnv) scope
123     {
124         scanloc = Loc(filename, 1, 1);
125         // debug printf("Lexer::Lexer(%p)\n", base);
126         // debug printf("lexer.filename = %s\n", filename);
127         token = Token.init;
128         this.base = base;
129         this.end = base + endoffset;
130         p = base + begoffset;
131         line = p;
132         this.doDocComment = doDocComment;
133         this.commentToken = commentToken;
134         this.tokenizeNewlines = false;
135         this.inTokenStringConstant = 0;
136         this.lastDocLine = 0;
137         this.eSink = errorSink;
138         assert(errorSink);
139         if (compileEnv)
140             this.compileEnv = *compileEnv;
141         else
142         {
143             this.compileEnv.versionNumber = 1;
144             this.compileEnv.vendor = "DLF";
145         }
146         //initKeywords();
147         /* If first line starts with '#!', ignore the line
148          */
149         if (p && p[0] == '#' && p[1] == '!')
150         {
151             p += 2;
152             for (;;p++)
153             {
154                 char c = *p;
155                 switch (c)
156                 {
157                 case '\n':
158                     p++;
159                     goto case;
160                 case 0:
161                 case 0x1A:
162                     break;
163 
164                 default:
165                     // Note: We do allow malformed UTF-8 on shebang line.
166                     // It could have a meaning if the native system
167                     // encoding is not Unicode. See test compilable/test13512.d
168                     // for example encoded in KOI-8.
169                     // We also allow bidirectional control characters.
170                     // We do not execute the shebang line, so it can't be used
171                     // to conceal code. It is up to the shell to sanitize it.
172                     continue;
173                 }
174                 break;
175             }
176             endOfLine();
177         }
178     }
179 
180     /***********************
181      * Alternative entry point for DMDLIB, adds `whitespaceToken`
182      */
183     this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset,
184         bool doDocComment, bool commentToken, bool whitespaceToken,
185         ErrorSink errorSink, const CompileEnv* compileEnv = null
186         )
187     {
188         this(filename, base, begoffset, endoffset, doDocComment, commentToken, errorSink, compileEnv);
189         this.whitespaceToken = whitespaceToken;
190     }
191 
192     /******************
193      * Used for unittests for a mock Lexer
194      */
195     this(ErrorSink errorSink) scope @safe { assert(errorSink); this.eSink = errorSink; }
196 
197     /**************************************
198      * Reset lexer to lex #define's
199      */
200     final void resetDefineLines(const(char)[] slice)
201     {
202         base = slice.ptr;
203         end = base + slice.length;
204         assert(*end == 0);
205         p = base;
206         line = p;
207         tokenizeNewlines = true;
208         inTokenStringConstant = 0;
209         lastDocLine = 0;
210         scanloc = Loc("#defines", 1, 1);
211     }
212 
    /**********************************
     * Set up for next #define line.
     * p should be at start of next line.
     *
     * Re-enables `tokenizeNewlines`, which `scan()` switches off again
     * each time it has produced a TOK.endOfLine.
     */
    final void nextDefineLine()
    {
        tokenizeNewlines = true;
    }
221 
222     /***************
223      * Range interface
224      */
225 
226     final bool empty() const pure @property @nogc @safe
227     {
228         return front() == TOK.endOfFile;
229     }
230 
    /// Returns: the kind (`TOK`) of the current token, i.e. `token.value`.
    final TOK front() const pure @property @nogc @safe
    {
        return token.value;
    }
235 
    /// Advances to the next token by calling `nextToken()`; the `TOK` it returns is discarded.
    final void popFront()
    {
        nextToken();
    }
240 
241     /// Returns: a newly allocated `Token`.
242     Token* allocateToken() pure nothrow @safe
243     {
244         if (tokenFreelist)
245         {
246             Token* t = tokenFreelist;
247             tokenFreelist = t.next;
248             t.next = null;
249             return t;
250         }
251         return new Token();
252     }
253 
    /// Frees the given token by returning it to the freelist.
    /// Params:
    ///  token = token to recycle; it becomes the new head of `tokenFreelist`
    private void releaseToken(Token* token) pure nothrow @nogc @safe
    {
        // Reset the token's contents only when the GC is in use.
        // NOTE(review): presumably so stale pointers in a pooled token do not
        // keep GC memory alive — confirm.
        if (mem.isGCEnabled)
            *token = Token.init;
        // Push onto the singly linked freelist (linked through `next`).
        token.next = tokenFreelist;
        tokenFreelist = token;
    }
262 
263     final TOK nextToken()
264     {
265         prevloc = token.loc;
266         if (token.next)
267         {
268             Token* t = token.next;
269             memcpy(&token, t, Token.sizeof);
270             releaseToken(t);
271         }
272         else
273         {
274             scan(&token);
275         }
276         //printf(token.toChars());
277         return token.value;
278     }
279 
280     /***********************
281      * Look ahead at next token's value.
282      */
283     final TOK peekNext()
284     {
285         return peek(&token).value;
286     }
287 
288     /***********************
289      * Look 2 tokens ahead at value.
290      */
291     final TOK peekNext2()
292     {
293         Token* t = peek(&token);
294         return peek(t).value;
295     }
296 
297     /****************************
298      * Turn next token in buffer into a token.
299      * Params:
300      *  t = the token to set the resulting Token to
301      */
302     final void scan(Token* t)
303     {
304         const lastLine = scanloc.linnum;
305         Loc startLoc;
306         t.blockComment = null;
307         t.lineComment = null;
308 
309         while (1)
310         {
311             t.ptr = p;
312             //printf("p = %p, *p = '%c'\n",p,*p);
313             t.loc = loc();
314             switch (*p)
315             {
316             case 0:
317             case 0x1A:
318                 t.value = TOK.endOfFile; // end of file
319                 // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
320                 return;
321             case ' ':
322                 // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
323                 while ((cast(size_t)p) % uint.sizeof)
324                 {
325                     if (*p != ' ')
326                         goto LendSkipFourSpaces;
327                     p++;
328                 }
329                 while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20
330                     p += 4;
331                 // Skip over any remaining space on the line.
332                 while (*p == ' ')
333                     p++;
334             LendSkipFourSpaces:
335                 version (DMDLIB)
336                 {
337                     if (whitespaceToken)
338                     {
339                         t.value = TOK.whitespace;
340                         return;
341                     }
342                 }
343                 continue; // skip white space
344             case '\t':
345             case '\v':
346             case '\f':
347                 p++;
348                 version (DMDLIB)
349                 {
350                     if (whitespaceToken)
351                     {
352                         t.value = TOK.whitespace;
353                         return;
354                     }
355                 }
356                 continue; // skip white space
357             case '\r':
358                 p++;
359                 if (*p != '\n') // if CR stands by itself
360                 {
361                     endOfLine();
362                     if (tokenizeNewlines)
363                     {
364                         t.value = TOK.endOfLine;
365                         tokenizeNewlines = false;
366                         return;
367                     }
368                 }
369                 version (DMDLIB)
370                 {
371                     if (whitespaceToken)
372                     {
373                         t.value = TOK.whitespace;
374                         return;
375                     }
376                 }
377                 continue; // skip white space
378             case '\n':
379                 p++;
380                 endOfLine();
381                 if (tokenizeNewlines)
382                 {
383                     t.value = TOK.endOfLine;
384                     tokenizeNewlines = false;
385                     return;
386                 }
387                 version (DMDLIB)
388                 {
389                     if (whitespaceToken)
390                     {
391                         t.value = TOK.whitespace;
392                         return;
393                     }
394                 }
395                 continue; // skip white space
396 
397             case '\\':
398                 if (Ccompile && (p[1] == '\r' || p[1] == '\n'))
399                 {
400                     ++p; // ignore \ followed by new line, like VC does
401                     continue;
402                 }
403                 goto default;
404 
405             case '0':
406                 if (!isZeroSecond(p[1]))        // if numeric literal does not continue
407                 {
408                     ++p;
409                     t.unsvalue = 0;
410                     t.value = TOK.int32Literal;
411                     return;
412                 }
413                 goto Lnumber;
414 
415             case '1': .. case '9':
416                 if (!isDigitSecond(p[1]))       // if numeric literal does not continue
417                 {
418                     t.unsvalue = *p - '0';
419                     ++p;
420                     t.value = TOK.int32Literal;
421                     return;
422                 }
423             Lnumber:
424                 t.value = number(t);
425                 return;
426 
427             case '\'':
428                 if (issinglechar(p[1]) && p[2] == '\'')
429                 {
430                     t.unsvalue = p[1];        // simple one character literal
431                     t.value = TOK.charLiteral;
432                     p += 3;
433                 }
434                 else if (Ccompile)
435                 {
436                     clexerCharConstant(*t, 0);
437                 }
438                 else
439                 {
440                     t.value = charConstant(t);
441                 }
442                 return;
443 
444             case 'u':
445             case 'U':
446             case 'L':
447                 if (!Ccompile)
448                     goto case_ident;
449                 if (p[1] == '\'')       // C wide character constant
450                 {
451                     char c = *p;
452                     if (c == 'L')       // convert L to u or U
453                         c = (wchar_tsize == 4) ? 'u' : 'U';
454                     ++p;
455                     clexerCharConstant(*t, c);
456                     return;
457                 }
458                 else if (p[1] == '\"')  // C wide string literal
459                 {
460                     const c = *p;
461                     ++p;
462                     escapeStringConstant(t);
463                     t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
464                                 c == 'u' ? 'w' :
465                                 'd';
466                     return;
467                 }
468                 else if (p[1] == '8' && p[2] == '\"') // C UTF-8 string literal
469                 {
470                     p += 2;
471                     escapeStringConstant(t);
472                     return;
473                 }
474                 goto case_ident;
475 
476             case 'r':
477                 if (Ccompile || p[1] != '"')
478                     goto case_ident;
479                 p++;
480                 goto case '`';
481             case '`':
482                 if (Ccompile)
483                     goto default;
484                 wysiwygStringConstant(t);
485                 return;
486             case 'x':
487                 if (p[1] != '"')
488                     goto case_ident;
489                 p++;
490                 t.value = hexStringConstant(t);
491                 return;
492             case 'q':
493                 if (Ccompile)
494                     goto case_ident;
495                 if (p[1] == '"')
496                 {
497                     p++;
498                     delimitedStringConstant(t);
499                     return;
500                 }
501                 else if (p[1] == '{')
502                 {
503                     p++;
504                     tokenStringConstant(t);
505                     return;
506                 }
507                 else
508                     goto case_ident;
509             case '"':
510                 escapeStringConstant(t);
511                 return;
512             case 'a':
513             case 'b':
514             case 'c':
515             case 'd':
516             case 'e':
517             case 'f':
518             case 'g':
519             case 'h':
520             case 'i':
521             case 'j':
522             case 'k':
523             case 'l':
524             case 'm':
525             case 'n':
526             case 'o':
527             case 'p':
528                 /*case 'q': case 'r':*/
529             case 's':
530             case 't':
531             //case 'u':
532             case 'v':
533             case 'w':
534                 /*case 'x':*/
535             case 'y':
536             case 'z':
537             case 'A':
538             case 'B':
539             case 'C':
540             case 'D':
541             case 'E':
542             case 'F':
543             case 'G':
544             case 'H':
545             case 'I':
546             case 'J':
547             case 'K':
548             //case 'L':
549             case 'M':
550             case 'N':
551             case 'O':
552             case 'P':
553             case 'Q':
554             case 'R':
555             case 'S':
556             case 'T':
557             //case 'U':
558             case 'V':
559             case 'W':
560             case 'X':
561             case 'Y':
562             case 'Z':
563             case '_':
564             case_ident:
565                 {
566                     while (1)
567                     {
568                         const c = *++p;
569                         if (isidchar(c))
570                             continue;
571                         else if (c & 0x80)
572                         {
573                             const s = p;
574                             const u = decodeUTF();
575                             if (isUniAlpha(u))
576                                 continue;
577                             error(t.loc, "char 0x%04x not allowed in identifier", u);
578                             p = s;
579                         }
580                         break;
581                     }
582                     Identifier id = Identifier.idPool((cast(char*)t.ptr)[0 .. p - t.ptr], false);
583                     t.ident = id;
584                     t.value = cast(TOK)id.getValue();
585 
586                     anyToken = 1;
587 
588                     /* Different keywords for C and D
589                      */
590                     if (Ccompile)
591                     {
592                         if (t.value != TOK.identifier)
593                         {
594                             t.value = Ckeywords[t.value];  // filter out D keywords
595                         }
596                     }
597                     else if (t.value >= FirstCKeyword)
598                         t.value = TOK.identifier;       // filter out C keywords
599 
600                     else if (*t.ptr == '_') // if special identifier token
601                     {
602                         void toToken(const(char)[] s)
603                         {
604                             t.value = TOK.string_;
605                             t.ustring = s.ptr;
606                             t.len = cast(uint)s.length;
607                             t.postfix = 0;
608                         }
609 
610                         if (id == Id.DATE)
611                             toToken(compileEnv.date);
612                         else if (id == Id.TIME)
613                             toToken(compileEnv.time);
614                         else if (id == Id.VENDOR)
615                             toToken(compileEnv.vendor);
616                         else if (id == Id.TIMESTAMP)
617                             toToken(compileEnv.timestamp);
618                         else if (id == Id.VERSIONX)
619                         {
620                             t.value = TOK.int64Literal;
621                             t.unsvalue = compileEnv.versionNumber;
622                         }
623                         else if (id == Id.EOFX)
624                         {
625                             t.value = TOK.endOfFile;
626                             // Advance scanner to end of file
627                             while (!(*p == 0 || *p == 0x1A))
628                                 p++;
629                         }
630                     }
631                     //printf("t.value = %d\n",t.value);
632                     return;
633                 }
634             case '/':
635                 p++;
636                 switch (*p)
637                 {
638                 case '=':
639                     p++;
640                     t.value = TOK.divAssign;
641                     return;
642                 case '*':
643                     p++;
644                     startLoc = loc();
645                     while (1)
646                     {
647                         while (1)
648                         {
649                             const c = *p;
650                             switch (c)
651                             {
652                             case '/':
653                                 break;
654                             case '\n':
655                                 endOfLine();
656                                 p++;
657                                 continue;
658                             case '\r':
659                                 p++;
660                                 if (*p != '\n')
661                                     endOfLine();
662                                 continue;
663                             case 0:
664                             case 0x1A:
665                                 error(t.loc, "unterminated /* */ comment");
666                                 p = end;
667                                 t.loc = loc();
668                                 t.value = TOK.endOfFile;
669                                 return;
670                             default:
671                                 if (c & 0x80)
672                                 {
673                                     const u = decodeUTF();
674                                     if (u == PS || u == LS)
675                                         endOfLine();
676                                 }
677                                 p++;
678                                 continue;
679                             }
680                             break;
681                         }
682                         p++;
683                         if (p[-2] == '*' && p - 3 != t.ptr)
684                             break;
685                     }
686                     if (commentToken)
687                     {
688                         t.loc = startLoc;
689                         t.value = TOK.comment;
690                         return;
691                     }
692                     else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
693                     {
694                         // if /** but not /**/
695                         getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
696                         lastDocLine = scanloc.linnum;
697                     }
698                     continue;
699                 case '/': // do // style comments
700                     startLoc = loc();
701                     while (1)
702                     {
703                         const c = *++p;
704                         switch (c)
705                         {
706                         case '\n':
707                             break;
708                         case '\r':
709                             if (p[1] == '\n')
710                                 p++;
711                             break;
712                         case 0:
713                         case 0x1A:
714                             if (commentToken)
715                             {
716                                 p = end;
717                                 t.loc = startLoc;
718                                 t.value = TOK.comment;
719                                 return;
720                             }
721                             if (doDocComment && t.ptr[2] == '/')
722                             {
723                                 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
724                                 lastDocLine = scanloc.linnum;
725                             }
726                             p = end;
727                             t.loc = loc();
728                             t.value = TOK.endOfFile;
729                             return;
730                         default:
731                             if (c & 0x80)
732                             {
733                                 const u = decodeUTF();
734                                 if (u == PS || u == LS)
735                                     break;
736                             }
737                             continue;
738                         }
739                         break;
740                     }
741                     if (commentToken)
742                     {
743                         version (DMDLIB) {}
744                         else
745                         {
746                             p++;
747                             endOfLine();
748                         }
749                         t.loc = startLoc;
750                         t.value = TOK.comment;
751                         return;
752                     }
753                     if (doDocComment && t.ptr[2] == '/')
754                     {
755                         getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
756                         lastDocLine = scanloc.linnum;
757                     }
758                     p++;
759                     endOfLine();
760                     continue;
761                 case '+':
762                     if (!Ccompile)
763                     {
764                         int nest;
765                         startLoc = loc();
766                         p++;
767                         nest = 1;
768                         while (1)
769                         {
770                             char c = *p;
771                             switch (c)
772                             {
773                             case '/':
774                                 p++;
775                                 if (*p == '+')
776                                 {
777                                     p++;
778                                     nest++;
779                                 }
780                                 continue;
781                             case '+':
782                                 p++;
783                                 if (*p == '/')
784                                 {
785                                     p++;
786                                     if (--nest == 0)
787                                         break;
788                                 }
789                                 continue;
790                             case '\r':
791                                 p++;
792                                 if (*p != '\n')
793                                     endOfLine();
794                                 continue;
795                             case '\n':
796                                 endOfLine();
797                                 p++;
798                                 continue;
799                             case 0:
800                             case 0x1A:
801                                 error(t.loc, "unterminated /+ +/ comment");
802                                 p = end;
803                                 t.loc = loc();
804                                 t.value = TOK.endOfFile;
805                                 return;
806                             default:
807                                 if (c & 0x80)
808                                 {
809                                     uint u = decodeUTF();
810                                     if (u == PS || u == LS)
811                                         endOfLine();
812                                 }
813                                 p++;
814                                 continue;
815                             }
816                             break;
817                         }
818                         if (commentToken)
819                         {
820                             t.loc = startLoc;
821                             t.value = TOK.comment;
822                             return;
823                         }
824                         if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
825                         {
826                             // if /++ but not /++/
827                             getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
828                             lastDocLine = scanloc.linnum;
829                         }
830                         continue;
831                     }
832                     break;
833                 default:
834                     break;
835                 }
836                 t.value = TOK.div;
837                 return;
838             case '.':
839                 p++;
840                 if (isdigit(*p))
841                 {
842                     /* Note that we don't allow ._1 and ._ as being
843                      * valid floating point numbers.
844                      */
845                     p--;
846                     t.value = inreal(t);
847                 }
848                 else if (p[0] == '.')
849                 {
850                     if (p[1] == '.')
851                     {
852                         p += 2;
853                         t.value = TOK.dotDotDot;
854                     }
855                     else
856                     {
857                         p++;
858                         t.value = TOK.slice;
859                     }
860                 }
861                 else
862                     t.value = TOK.dot;
863                 return;
864             case '&':
865                 p++;
866                 if (*p == '=')
867                 {
868                     p++;
869                     t.value = TOK.andAssign;
870                 }
871                 else if (*p == '&')
872                 {
873                     p++;
874                     t.value = TOK.andAnd;
875                 }
876                 else
877                     t.value = TOK.and;
878                 return;
879             case '|':
880                 p++;
881                 if (*p == '=')
882                 {
883                     p++;
884                     t.value = TOK.orAssign;
885                 }
886                 else if (*p == '|')
887                 {
888                     p++;
889                     t.value = TOK.orOr;
890                 }
891                 else
892                     t.value = TOK.or;
893                 return;
894             case '-':
895                 p++;
896                 if (*p == '=')
897                 {
898                     p++;
899                     t.value = TOK.minAssign;
900                 }
901                 else if (*p == '-')
902                 {
903                     p++;
904                     t.value = TOK.minusMinus;
905                 }
906                 else if (*p == '>')
907                 {
908                     ++p;
909                     t.value = TOK.arrow;
910                 }
911                 else
912                     t.value = TOK.min;
913                 return;
914             case '+':
915                 p++;
916                 if (*p == '=')
917                 {
918                     p++;
919                     t.value = TOK.addAssign;
920                 }
921                 else if (*p == '+')
922                 {
923                     p++;
924                     t.value = TOK.plusPlus;
925                 }
926                 else
927                     t.value = TOK.add;
928                 return;
929             case '<':
930                 p++;
931                 if (*p == '=')
932                 {
933                     p++;
934                     t.value = TOK.lessOrEqual; // <=
935                 }
936                 else if (*p == '<')
937                 {
938                     p++;
939                     if (*p == '=')
940                     {
941                         p++;
942                         t.value = TOK.leftShiftAssign; // <<=
943                     }
944                     else
945                         t.value = TOK.leftShift; // <<
946                 }
947                 else if (*p == ':' && Ccompile)
948                 {
949                     ++p;
950                     t.value = TOK.leftBracket;  // <:
951                 }
952                 else if (*p == '%' && Ccompile)
953                 {
954                     ++p;
955                     t.value = TOK.leftCurly;    // <%
956                 }
957                 else
958                     t.value = TOK.lessThan; // <
959                 return;
960             case '>':
961                 p++;
962                 if (*p == '=')
963                 {
964                     p++;
965                     t.value = TOK.greaterOrEqual; // >=
966                 }
967                 else if (*p == '>')
968                 {
969                     p++;
970                     if (*p == '=')
971                     {
972                         p++;
973                         t.value = TOK.rightShiftAssign; // >>=
974                     }
975                     else if (*p == '>')
976                     {
977                         p++;
978                         if (*p == '=')
979                         {
980                             p++;
981                             t.value = TOK.unsignedRightShiftAssign; // >>>=
982                         }
983                         else
984                             t.value = TOK.unsignedRightShift; // >>>
985                     }
986                     else
987                         t.value = TOK.rightShift; // >>
988                 }
989                 else
990                     t.value = TOK.greaterThan; // >
991                 return;
992             case '!':
993                 p++;
994                 if (*p == '=')
995                 {
996                     p++;
997                     t.value = TOK.notEqual; // !=
998                 }
999                 else
1000                     t.value = TOK.not; // !
1001                 return;
1002             case '=':
1003                 p++;
1004                 if (*p == '=')
1005                 {
1006                     p++;
1007                     t.value = TOK.equal; // ==
1008                 }
1009                 else if (*p == '>')
1010                 {
1011                     p++;
1012                     t.value = TOK.goesTo; // =>
1013                 }
1014                 else
1015                     t.value = TOK.assign; // =
1016                 return;
1017             case '~':
1018                 p++;
1019                 if (*p == '=')
1020                 {
1021                     p++;
1022                     t.value = TOK.concatenateAssign; // ~=
1023                 }
1024                 else
1025                     t.value = TOK.tilde; // ~
1026                 return;
1027             case '^':
1028                 p++;
1029                 if (*p == '^')
1030                 {
1031                     p++;
1032                     if (*p == '=')
1033                     {
1034                         p++;
1035                         t.value = TOK.powAssign; // ^^=
1036                     }
1037                     else
1038                         t.value = TOK.pow; // ^^
1039                 }
1040                 else if (*p == '=')
1041                 {
1042                     p++;
1043                     t.value = TOK.xorAssign; // ^=
1044                 }
1045                 else
1046                     t.value = TOK.xor; // ^
1047                 return;
1048             case '(':
1049                 p++;
1050                 t.value = TOK.leftParenthesis;
1051                 return;
1052             case ')':
1053                 p++;
1054                 t.value = TOK.rightParenthesis;
1055                 return;
1056             case '[':
1057                 p++;
1058                 t.value = TOK.leftBracket;
1059                 return;
1060             case ']':
1061                 p++;
1062                 t.value = TOK.rightBracket;
1063                 return;
1064             case '{':
1065                 p++;
1066                 t.value = TOK.leftCurly;
1067                 return;
1068             case '}':
1069                 p++;
1070                 t.value = TOK.rightCurly;
1071                 return;
1072             case '?':
1073                 p++;
1074                 t.value = TOK.question;
1075                 return;
1076             case ',':
1077                 p++;
1078                 t.value = TOK.comma;
1079                 return;
1080             case ';':
1081                 p++;
1082                 t.value = TOK.semicolon;
1083                 return;
1084             case ':':
1085                 p++;
1086                 if (*p == ':')
1087                 {
1088                     ++p;
1089                     t.value = TOK.colonColon;
1090                 }
1091                 else if (*p == '>' && Ccompile)
1092                 {
1093                     ++p;
1094                     t.value = TOK.rightBracket;
1095                 }
1096                 else
1097                     t.value = TOK.colon;
1098                 return;
1099             case '$':
1100                 p++;
1101                 t.value = TOK.dollar;
1102                 return;
1103             case '@':
1104                 p++;
1105                 t.value = TOK.at;
1106                 return;
1107             case '*':
1108                 p++;
1109                 if (*p == '=')
1110                 {
1111                     p++;
1112                     t.value = TOK.mulAssign;
1113                 }
1114                 else
1115                     t.value = TOK.mul;
1116                 return;
1117             case '%':
1118                 p++;
1119                 if (*p == '=')
1120                 {
1121                     p++;
1122                     t.value = TOK.modAssign;
1123                 }
1124                 else if (*p == '>' && Ccompile)
1125                 {
1126                     ++p;
1127                     t.value = TOK.rightCurly;
1128                 }
1129                 else if (*p == ':' && Ccompile)
1130                 {
1131                     goto case '#';      // %: means #
1132                 }
1133                 else
1134                     t.value = TOK.mod;
1135                 return;
1136             case '#':
1137                 {
1138                     // https://issues.dlang.org/show_bug.cgi?id=22825
1139                     // Special token sequences are terminated by newlines,
1140                     // and should not be skipped over.
1141                     this.tokenizeNewlines = true;
1142                     p++;
1143                     if (parseSpecialTokenSequence())
1144                         continue;
1145                     t.value = TOK.pound;
1146                     return;
1147                 }
1148             default:
1149                 {
1150                     dchar c = *p;
1151                     if (c & 0x80)
1152                     {
1153                         c = decodeUTF();
1154                         // Check for start of unicode identifier
1155                         if (isUniAlpha(c))
1156                             goto case_ident;
1157                         if (c == PS || c == LS)
1158                         {
1159                             endOfLine();
1160                             p++;
1161                             if (tokenizeNewlines)
1162                             {
1163                                 t.value = TOK.endOfLine;
1164                                 tokenizeNewlines = false;
1165                                 return;
1166                             }
1167                             continue;
1168                         }
1169                     }
1170                     if (c < 0x80 && isprint(c))
1171                         error(t.loc, "character '%c' is not a valid token", c);
1172                     else
1173                         error(t.loc, "character 0x%02x is not a valid token", c);
1174                     p++;
1175                     continue;
1176                     // assert(0);
1177                 }
1178             }
1179         }
1180     }
1181 
1182     final Token* peek(Token* ct)
1183     {
1184         Token* t;
1185         if (ct.next)
1186             t = ct.next;
1187         else
1188         {
1189             t = allocateToken();
1190             scan(t);
1191             ct.next = t;
1192         }
1193         return t;
1194     }
1195 
1196     /*********************************
1197      * tk is on the opening (.
1198      * Look ahead and return token that is past the closing ).
1199      */
1200     final Token* peekPastParen(Token* tk)
1201     {
1202         //printf("peekPastParen()\n");
1203         int parens = 1;
1204         int curlynest = 0;
1205         while (1)
1206         {
1207             tk = peek(tk);
1208             //tk.print();
1209             switch (tk.value)
1210             {
1211             case TOK.leftParenthesis:
1212                 parens++;
1213                 continue;
1214             case TOK.rightParenthesis:
1215                 --parens;
1216                 if (parens)
1217                     continue;
1218                 tk = peek(tk);
1219                 break;
1220             case TOK.leftCurly:
1221                 curlynest++;
1222                 continue;
1223             case TOK.rightCurly:
1224                 if (--curlynest >= 0)
1225                     continue;
1226                 break;
1227             case TOK.semicolon:
1228                 if (curlynest)
1229                     continue;
1230                 break;
1231             case TOK.endOfFile:
1232                 break;
1233             default:
1234                 continue;
1235             }
1236             return tk;
1237         }
1238     }
1239 
1240     /*******************************************
1241      * Parse escape sequence.
1242      */
1243     private uint escapeSequence(out dchar c2)
1244     {
1245         return Lexer.escapeSequence(token.loc, p, Ccompile, c2);
1246     }
1247 
1248     /********
1249      * Parse the given string literal escape sequence into a single character.
1250      * D https://dlang.org/spec/lex.html#escape_sequences
1251      * C11 6.4.4.4
1252      * Params:
1253      *  loc = location to use for error messages
1254      *  sequence = pointer to string with escape sequence to parse. Updated to
1255      *             point past the end of the escape sequence
1256      *  Ccompile = true for compile C11 escape sequences
1257      *  c2 = returns second `dchar` of html entity with 2 code units, otherwise stays `dchar.init`
1258      * Returns:
1259      *  the escape sequence as a single character
1260      */
1261     private dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile, out dchar c2)
1262     {
1263         const(char)* p = sequence; // cache sequence reference on stack
1264         scope(exit) sequence = p;
1265 
1266         uint c = *p;
1267         int ndigits;
1268         switch (c)
1269         {
1270         case '\'':
1271         case '"':
1272         case '?':
1273         case '\\':
1274         Lconsume:
1275             p++;
1276             break;
1277         case 'a':
1278             c = 7;
1279             goto Lconsume;
1280         case 'b':
1281             c = 8;
1282             goto Lconsume;
1283         case 'f':
1284             c = 12;
1285             goto Lconsume;
1286         case 'n':
1287             c = 10;
1288             goto Lconsume;
1289         case 'r':
1290             c = 13;
1291             goto Lconsume;
1292         case 't':
1293             c = 9;
1294             goto Lconsume;
1295         case 'v':
1296             c = 11;
1297             goto Lconsume;
1298         case 'u':
1299             ndigits = 4;
1300             goto Lhex;
1301         case 'U':
1302             ndigits = 8;
1303             goto Lhex;
1304         case 'x':
1305             ndigits = 2;
1306         Lhex:
1307             p++;
1308             c = *p;
1309             if (ishex(cast(char)c))
1310             {
1311                 uint v = 0;
1312                 int n = 0;
1313                 if (Ccompile && ndigits == 2)
1314                 {
1315                     /* C11 6.4.4.4-7 one to infinity hex digits
1316                      */
1317                     do
1318                     {
1319                         if (isdigit(cast(char)c))
1320                             c -= '0';
1321                         else if (islower(c))
1322                             c -= 'a' - 10;
1323                         else
1324                             c -= 'A' - 10;
1325                         v = v * 16 + c;
1326                         c = *++p;
1327                     } while (ishex(cast(char)c));
1328                 }
1329                 else
1330                 {
1331                     while (1)
1332                     {
1333                         if (isdigit(cast(char)c))
1334                             c -= '0';
1335                         else if (islower(c))
1336                             c -= 'a' - 10;
1337                         else
1338                             c -= 'A' - 10;
1339                         v = v * 16 + c;
1340                         c = *++p;
1341                         if (++n == ndigits)
1342                             break;
1343                         if (!ishex(cast(char)c))
1344                         {
1345                             error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
1346                             break;
1347                         }
1348                     }
1349                     if (ndigits != 2 && !utf_isValidDchar(v))
1350                     {
1351                         error(loc, "invalid UTF character \\U%08x", v);
1352                         v = '?'; // recover with valid UTF character
1353                     }
1354                 }
1355                 c = v;
1356             }
1357             else
1358             {
1359                 error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
1360                 p++;
1361             }
1362             break;
1363         case '&':
1364             if (Ccompile)
1365                 goto default;
1366 
1367             // named character entity
1368             for (const idstart = ++p; 1; p++)
1369             {
1370                 switch (*p)
1371                 {
1372                 case ';':
1373                     auto entity = HtmlNamedEntity(idstart[0 .. p - idstart]);
1374                     c = entity[0];
1375                     if (entity == entity.init)
1376                     {
1377                         error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
1378                         c = '?';
1379                     }
1380                     if (entity[1] != entity.init[1])
1381                         c2 = entity[1];
1382 
1383                     p++;
1384                     break;
1385                 default:
1386                     if (isalpha(*p) || (p != idstart && isdigit(*p)))
1387                         continue;
1388                     error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
1389                     c = '?';
1390                     break;
1391                 }
1392                 break;
1393             }
1394             break;
1395         case 0:
1396         case 0x1A:
1397             // end of file
1398             c = '\\';
1399             break;
1400         default:
1401             if (isoctal(cast(char)c))
1402             {
1403                 uint v = 0;
1404                 int n = 0;
1405                 do
1406                 {
1407                     v = v * 8 + (c - '0');
1408                     c = *++p;
1409                 }
1410                 while (++n < 3 && isoctal(cast(char)c));
1411                 c = v;
1412                 if (c > 0xFF)
1413                     error(loc, "escape octal sequence \\%03o is larger than \\377", c);
1414             }
1415             else
1416             {
1417                 error(loc, "undefined escape sequence \\%c", c);
1418                 p++;
1419             }
1420             break;
1421         }
1422         return c;
1423     }
1424 
1425     /**
1426     Lex a wysiwyg string. `p` must be pointing to the first character before the
1427     contents of the string literal. The character pointed to by `p` will be used as
1428     the terminating character (i.e. backtick or double-quote).
1429     Params:
1430         result = pointer to the token that accepts the result
1431     */
1432     private void wysiwygStringConstant(Token* result)
1433     {
1434         result.value = TOK.string_;
1435         Loc start = loc();
1436         auto terminator = p[0];
1437         p++;
1438         stringbuffer.setsize(0);
1439         while (1)
1440         {
1441             dchar c = p[0];
1442             p++;
1443             switch (c)
1444             {
1445             case '\n':
1446                 endOfLine();
1447                 break;
1448             case '\r':
1449                 if (p[0] == '\n')
1450                     continue; // ignore
1451                 c = '\n'; // treat EndOfLine as \n character
1452                 endOfLine();
1453                 break;
1454             case 0:
1455             case 0x1A:
1456                 error("unterminated string constant starting at %s", start.toChars());
1457                 result.setString();
1458                 // rewind `p` so it points to the EOF character
1459                 p--;
1460                 return;
1461             default:
1462                 if (c == terminator)
1463                 {
1464                     result.setString(stringbuffer);
1465                     stringPostfix(result);
1466                     return;
1467                 }
1468                 else if (c & 0x80)
1469                 {
1470                     p--;
1471                     const u = decodeUTF();
1472                     p++;
1473                     if (u == PS || u == LS)
1474                         endOfLine();
1475                     stringbuffer.writeUTF8(u);
1476                     continue;
1477                 }
1478                 break;
1479             }
1480             stringbuffer.writeByte(c);
1481         }
1482     }
1483 
1484     /**************************************
1485      * Lex hex strings:
1486      *      x"0A ae 34FE BD"
1487      */
1488     final TOK hexStringConstant(Token* t)
1489     {
1490         Loc start = loc();
1491         uint n = 0;
1492         uint v = ~0; // dead assignment, needed to suppress warning
1493         p++;
1494         stringbuffer.setsize(0);
1495         while (1)
1496         {
1497             dchar c = *p++;
1498             switch (c)
1499             {
1500             case ' ':
1501             case '\t':
1502             case '\v':
1503             case '\f':
1504                 continue; // skip white space
1505             case '\r':
1506                 if (*p == '\n')
1507                     continue; // ignore '\r' if followed by '\n'
1508                 // Treat isolated '\r' as if it were a '\n'
1509                 goto case '\n';
1510             case '\n':
1511                 endOfLine();
1512                 continue;
1513             case 0:
1514             case 0x1A:
1515                 error("unterminated string constant starting at %s", start.toChars());
1516                 t.setString();
1517                 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1518                 p--;
1519                 return TOK.hexadecimalString;
1520             case '"':
1521                 if (n & 1)
1522                 {
1523                     error("odd number (%d) of hex characters in hex string", n);
1524                     stringbuffer.writeByte(v);
1525                 }
1526                 t.setString(stringbuffer);
1527                 t.postfix = 'h';
1528                 stringPostfix(t);
1529                 return TOK.hexadecimalString;
1530             default:
1531                 if (c >= '0' && c <= '9')
1532                     c -= '0';
1533                 else if (c >= 'a' && c <= 'f')
1534                     c -= 'a' - 10;
1535                 else if (c >= 'A' && c <= 'F')
1536                     c -= 'A' - 10;
1537                 else if (c & 0x80)
1538                 {
1539                     p--;
1540                     const u = decodeUTF();
1541                     p++;
1542                     if (u == PS || u == LS)
1543                         endOfLine();
1544                     else
1545                         error("non-hex character \\u%04x in hex string", u);
1546                 }
1547                 else
1548                     error("non-hex character '%c' in hex string", c);
1549                 if (n & 1)
1550                 {
1551                     v = (v << 4) | c;
1552                     stringbuffer.writeByte(v);
1553                 }
1554                 else
1555                     v = c;
1556                 n++;
1557                 break;
1558             }
1559         }
1560         assert(0); // see bug 15731
1561     }
1562 
1563     /**
1564     Lex a delimited string. Some examples of delimited strings are:
1565     ---
1566     q"(foo(xxx))"      // "foo(xxx)"
1567     q"[foo$(LPAREN)]"  // "foo$(LPAREN)"
1568     q"/foo]/"          // "foo]"
1569     q"HERE
1570     foo
1571     HERE"              // "foo\n"
1572     ---
1573     It is assumed that `p` points to the opening double-quote '"'.
1574     Params:
1575         result = pointer to the token that accepts the result
1576     */
1577     private void delimitedStringConstant(Token* result)
1578     {
1579         result.value = TOK.string_;
1580         Loc start = loc();
1581         dchar delimleft = 0;
1582         dchar delimright = 0;
1583         uint nest = 1;
1584         uint nestcount = ~0; // dead assignment, needed to suppress warning
1585         Identifier hereid = null;
1586         uint blankrol = 0;
1587         uint startline = 0;
1588         p++;
1589         stringbuffer.setsize(0);
1590         while (1)
1591         {
1592             const s = p;
1593             dchar c = *p++;
1594             //printf("c = '%c'\n", c);
1595             switch (c)
1596             {
1597             case '\n':
1598             Lnextline:
1599                 endOfLine();
1600                 startline = 1;
1601                 if (blankrol)
1602                 {
1603                     blankrol = 0;
1604                     continue;
1605                 }
1606                 if (hereid)
1607                 {
1608                     stringbuffer.writeUTF8(c);
1609                     continue;
1610                 }
1611                 break;
1612             case '\r':
1613                 if (*p == '\n')
1614                     continue; // ignore
1615                 c = '\n'; // treat EndOfLine as \n character
1616                 goto Lnextline;
1617             case 0:
1618             case 0x1A:
1619                 error("unterminated delimited string constant starting at %s", start.toChars());
1620                 result.setString();
1621                 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1622                 p--;
1623                 return;
1624             default:
1625                 if (c & 0x80)
1626                 {
1627                     p--;
1628                     c = decodeUTF();
1629                     p++;
1630                     if (c == PS || c == LS)
1631                         goto Lnextline;
1632                 }
1633                 break;
1634             }
1635             if (delimleft == 0)
1636             {
1637                 delimleft = c;
1638                 nest = 1;
1639                 nestcount = 1;
1640                 if (c == '(')
1641                     delimright = ')';
1642                 else if (c == '{')
1643                     delimright = '}';
1644                 else if (c == '[')
1645                     delimright = ']';
1646                 else if (c == '<')
1647                     delimright = '>';
1648                 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1649                 {
1650                     // Start of identifier; must be a heredoc
1651                     Token tok;
1652                     p = s;
1653                     scan(&tok); // read in heredoc identifier
1654                     if (tok.value != TOK.identifier)
1655                     {
1656                         error("identifier expected for heredoc, not %s", tok.toChars());
1657                         delimright = c;
1658                     }
1659                     else
1660                     {
1661                         hereid = tok.ident;
1662                         //printf("hereid = '%s'\n", hereid.toChars());
1663                         blankrol = 1;
1664                     }
1665                     nest = 0;
1666                 }
1667                 else
1668                 {
1669                     delimright = c;
1670                     nest = 0;
1671                     if (isspace(c))
1672                         error("delimiter cannot be whitespace");
1673                 }
1674             }
1675             else
1676             {
1677                 if (blankrol)
1678                 {
1679                     error("heredoc rest of line should be blank");
1680                     blankrol = 0;
1681                     continue;
1682                 }
1683                 if (nest == 1)
1684                 {
1685                     if (c == delimleft)
1686                         nestcount++;
1687                     else if (c == delimright)
1688                     {
1689                         nestcount--;
1690                         if (nestcount == 0)
1691                             goto Ldone;
1692                     }
1693                 }
1694                 else if (c == delimright)
1695                     goto Ldone;
1696                 if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
1697                 {
1698                     Token tok;
1699                     auto psave = p;
1700                     p = s;
1701                     scan(&tok); // read in possible heredoc identifier
1702                     //printf("endid = '%s'\n", tok.ident.toChars());
1703                     if (tok.value == TOK.identifier && tok.ident is hereid)
1704                     {
1705                         /* should check that rest of line is blank
1706                          */
1707                         goto Ldone;
1708                     }
1709                     p = psave;
1710                 }
1711                 stringbuffer.writeUTF8(c);
1712                 startline = 0;
1713             }
1714         }
1715     Ldone:
1716         if (*p == '"')
1717             p++;
1718         else if (hereid)
1719             error("delimited string must end in `%s\"`", hereid.toChars());
1720         else if (isspace(delimright))
1721             error("delimited string must end in `\"`");
1722         else
1723             error(token.loc, "delimited string must end in `%c\"`", delimright);
1724         result.setString(stringbuffer);
1725         stringPostfix(result);
1726     }
1727 
1728     /**
1729     Lex a token string. Some examples of token strings are:
1730     ---
1731     q{ foo(xxx) }    // " foo(xxx) "
1732     q{foo$(LPAREN)}  // "foo$(LPAREN)"
1733     q{{foo}"}"}      // "{foo}"}""
1734     ---
1735     It is assumed that `p` points to the opening curly-brace.
1736     Params:
1737         result = pointer to the token that accepts the result
1738     */
1739     private void tokenStringConstant(Token* result)
1740     {
1741         result.value = TOK.string_;
1742 
1743         uint nest = 1;
1744         const start = loc();
1745         const pstart = ++p;
1746         inTokenStringConstant++;
1747         scope(exit) inTokenStringConstant--;
1748         while (1)
1749         {
1750             Token tok;
1751             scan(&tok);
1752             switch (tok.value)
1753             {
1754             case TOK.leftCurly:
1755                 nest++;
1756                 continue;
1757             case TOK.rightCurly:
1758                 if (--nest == 0)
1759                 {
1760                     result.setString(pstart, p - 1 - pstart);
1761                     stringPostfix(result);
1762                     return;
1763                 }
1764                 continue;
1765             case TOK.endOfFile:
1766                 error("unterminated token string constant starting at %s", start.toChars());
1767                 result.setString();
1768                 return;
1769             default:
1770                 continue;
1771             }
1772         }
1773     }
1774 
1775     /**
1776     Scan a quoted string while building the processed string value by
1777     handling escape sequences. The result is returned in the given `t` token.
1778     This function assumes that `p` currently points to the opening quote
1779     of the string.
1780     Params:
1781         t = the token to set the resulting string to
1782     * References:
1783     *   D https://dlang.org/spec/lex.html#double_quoted_strings
1784     *   ImportC C11 6.4.5
1785     */
    private void escapeStringConstant(Token* t)
    {
        // The decoded contents are accumulated in `stringbuffer` and handed to
        // `t` once the closing quote (the same character as the opening one) is seen.
        t.value = TOK.string_;

        const start = loc();    // kept for the "unterminated string" diagnostic
        const tc = *p++;        // opening quote
        stringbuffer.setsize(0);
        while (1)
        {
            dchar c = *p++;
            dchar c2;           // optional second code point filled in by `escapeSequence`
            switch (c)
            {
            case '\\':
                // `p` now points at the character following the backslash
                switch (*p)
                {
                case '&':
                    // When lexing C, `\&` gets no special treatment: use the generic escape path
                    if (Ccompile)
                        goto default;

                    c = escapeSequence(c2);
                    stringbuffer.writeUTF8(c);
                    // a second code point is appended only if `escapeSequence` set one
                    if (c2 != dchar.init)
                        stringbuffer.writeUTF8(c2);
                    continue;
                case 'u':
                case 'U':
                    // `\u` / `\U` escapes are stored UTF-8 encoded
                    c = escapeSequence(c2);
                    stringbuffer.writeUTF8(c);
                    continue;
                default:
                    // all remaining escapes are written as a single byte at the bottom of the loop
                    c = escapeSequence(c2);
                    break;
                }
                break;
            case '\n':
                endOfLine();
                // a line break inside the literal ends it with an error when lexing C
                if (Ccompile)
                    goto Lunterminated;
                break;
            case '\r':
                // For CR LF, drop the CR; the LF is handled by the next iteration
                if (*p == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                endOfLine();
                if (Ccompile)
                    goto Lunterminated;
                break;
            case '\'':
            case '"':
                // A quote that differs from the opening one is ordinary content
                if (c != tc)
                    goto default;
                t.setString(stringbuffer);
                // string postfixes are only looked for when not lexing C
                if (!Ccompile)
                    stringPostfix(t);
                return;
            case 0:
            case 0x1A:
                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
                p--;
            Lunterminated:
                error("unterminated string constant starting at %s", start.toChars());
                t.setString();
                return;
            default:
                if (c & 0x80)
                {
                    // Non-ASCII lead byte: step back and decode the whole code point
                    p--;
                    c = decodeUTF();
                    // LS and PS are handled like an end of line
                    if (c == LS || c == PS)
                    {
                        c = '\n';
                        endOfLine();
                        if (Ccompile)
                            goto Lunterminated;
                    }
                    p++;
                    stringbuffer.writeUTF8(c);
                    continue;
                }
                break;
            }
            // Reached for characters that need only a single byte
            stringbuffer.writeByte(c);
        }
    }
1871 
1872     /**************************************
1873      * Reference:
1874      *    https://dlang.org/spec/lex.html#characterliteral
1875      */
    private TOK charConstant(Token* t)
    {
        // Lexes one character literal, stores its value in `t.unsvalue` and
        // returns the literal kind (char, wchar or dchar). On any error the
        // value is replaced by '?'.
        TOK tk = TOK.charLiteral;
        //printf("Lexer::charConstant\n");
        p++;                // step over the character at `p` (presumably the opening `'` -- confirm with caller)
        dchar c = *p++;
        dchar c2;           // second code point that `escapeSequence` may produce
        switch (c)
        {
        case '\\':
            // The kind of escape decides the literal type
            switch (*p)
            {
            case 'u':
                tk = TOK.wcharLiteral;
                goto default;
            case 'U':
            case '&':
                tk = TOK.dcharLiteral;
                goto default;
            default:
                t.unsvalue = escapeSequence(c2);
                // an escape that yields two code points cannot be a single character
                if (c2 != c2.init)
                {
                    error("html entity requires 2 code units, use a string instead of a character");
                    t.unsvalue = '?';
                }
                break;
            }
            break;
        case '\n':
        L1:
            // Line ending directly after the opening quote: account for the new
            // line, then report the literal as unterminated via the cases below
            endOfLine();
            goto case;
        case '\r':
            goto case '\'';
        case 0:
        case 0x1A:
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
            goto case;
        case '\'':
            // Also reached directly for `''`, i.e. a literal with no content
            error("unterminated character constant");
            t.unsvalue = '?';
            return tk;
        default:
            if (c & 0x80)
            {
                // Non-ASCII lead byte: step back and decode the whole code point
                p--;
                c = decodeUTF();
                p++;
                // LS and PS are treated like a line ending
                if (c == LS || c == PS)
                    goto L1;
                // values below the surrogate range or in 0xE000 .. 0xFFFD become wchar literals
                if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
                    tk = TOK.wcharLiteral;
                else
                    tk = TOK.dcharLiteral;
            }
            t.unsvalue = c;
            break;
        }
        if (*p != '\'')
        {
            // Closing quote is missing. Skip ahead until a closing quote, end of
            // file, end of line or one of `;` `)` `]` `}` so the error is reported once.
            while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
                    *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
            {
                if (*p & 0x80)
                {
                    const s = p;
                    c = decodeUTF();
                    // stop in front of a Unicode line/paragraph separator
                    if (c == LS || c == PS)
                    {
                        p = s;
                        break;
                    }
                }
                p++;
            }

            if (*p == '\'')
            {
                error("character constant has multiple characters");
                p++;
            }
            else
                error("unterminated character constant");
            t.unsvalue = '?';
            return tk;
        }
        p++;                // consume the closing quote
        return tk;
    }
1967 
1968     /***************************************
1969      * Lex C character constant.
1970      * Parser is on the opening quote.
1971      * Params:
1972      *  t = token to fill in
1973      *  prefix = one of `u`, `U` or 0.
1974      * Reference:
1975      *  C11 6.4.4.4
1976      */
1977     private void clexerCharConstant(ref Token t, char prefix)
1978     {
1979         escapeStringConstant(&t);
1980         const(char)[] str = t.ustring[0 .. t.len];
1981         const n = str.length;
1982         const loc = t.loc;
1983         if (n == 0)
1984         {
1985             error(loc, "empty character constant");
1986             t.value = TOK.semicolon;
1987             return;
1988         }
1989 
1990         uint u;
1991         switch (prefix)
1992         {
1993             case 0:
1994                 if (n == 1) // fast case
1995                 {
1996                     u = str[0];
1997                 }
1998                 else if (n > 4)
1999                     error(loc, "max number of chars in character literal is 4, had %d",
2000                         cast(int)n);
2001                 else
2002                 {
2003                     foreach (i, c; str)
2004                         (cast(char*)&u)[n - 1 - i] = c;
2005                 }
2006                 break;
2007 
2008             case 'u':
2009                 dchar d1;
2010                 size_t idx;
2011                 auto msg = utf_decodeChar(str, idx, d1);
2012                 dchar d2 = 0;
2013                 if (idx < n && !msg)
2014                     msg = utf_decodeChar(str, idx, d2);
2015                 if (msg)
2016                     error(loc, "%.*s", cast(int)msg.length, msg.ptr);
2017                 else if (idx < n)
2018                     error(loc, "max number of chars in 16 bit character literal is 2, had %d",
2019                         cast(int)((n + 1) >> 1));
2020                 else if (d1 > 0x1_0000)
2021                     error(loc, "%d does not fit in 16 bits", d1);
2022                 else if (d2 > 0x1_0000)
2023                     error(loc, "%d does not fit in 16 bits", d2);
2024                 u = d1;
2025                 if (d2)
2026                     u = (d1 << 16) | d2;
2027                 break;
2028 
2029             case 'U':
2030                 dchar d;
2031                 size_t idx;
2032                 auto msg = utf_decodeChar(str, idx, d);
2033                 if (msg)
2034                     error(loc, "%.*s", cast(int)msg.length, msg.ptr);
2035                 else if (idx < n)
2036                     error(loc, "max number of chars in 32 bit character literal is 1, had %d",
2037                         cast(int)((n + 3) >> 2));
2038                 u = d;
2039                 break;
2040 
2041             default:
2042                 assert(0);
2043         }
2044         t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal;
2045         t.unsvalue = u;
2046     }
2047 
2048     /***************************************
2049      * Get postfix of string literal.
2050      */
2051     private void stringPostfix(Token* t) pure @nogc
2052     {
2053         switch (*p)
2054         {
2055         case 'c':
2056         case 'w':
2057         case 'd':
2058             t.postfix = *p;
2059             p++;
2060             break;
2061         default:
2062             t.postfix = 0;
2063             break;
2064         }
2065     }
2066 
    /**************************************
     * Read in a number.
     * If it's an integer, store its value in `t.unsvalue`.
     *      Integers can be decimal, octal, hexadecimal or binary.
     *      Handle the suffixes U, UL, LU, L, etc.
     * If it's a floating point literal, lexing is handed over to `inreal`.
     * Params:
     *      t = token to fill in
     * Returns:
     *      the `TOK` describing the literal: one of the integer literal tokens
     *      (e.g. `TOK.int32Literal`, `TOK.uns64Literal`), or the result of `inreal`
     */
    private TOK number(Token* t)
    {
        // Overall flow:
        //  1. a leading `0` selects the base (octal, hex, binary) or reveals a float
        //  2. digits are accumulated into `n`; anything that looks like a floating
        //     point literal rewinds to `start` and is handed to `inreal`
        //  3. at `Ldone` errors are reported, suffixes are parsed and the token
        //     type is picked from the suffixes and the magnitude of `n`
        int base = 10;
        const start = p;    // start of the literal, needed to rewind for `inreal` and for diagnostics
        ulong n = 0; // unsigned >=64 bit integer type
        int d;              // value of the digit just read
        bool err = false;   // an error was already reported; used to suppress some follow-up errors
        bool overflow = false;
        // Both flags are set as soon as any digit is consumed by the main loop;
        // they are used to reject `0b` / `0x` without digits
        bool anyBinaryDigitsNoSingleUS = false;
        bool anyHexDigitsNoSingleUS = false;
        char errorDigit = 0;    // first digit not valid in `base`, 0 if none
        dchar c = *p;
        if (c == '0')
        {
            // Look at the character after the leading zero
            ++p;
            c = *p;
            switch (c)
            {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
                base = 8;
                break;

            case '8':
            case '9':
                // still treated as octal, but remember the offending digit
                errorDigit = cast(char) c;
                base = 8;
                break;
            case 'x':
            case 'X':
                ++p;
                base = 16;
                break;
            case 'b':
            case 'B':
                ++p;
                base = 2;
                break;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
                {
                    if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                        goto Lreal;  // if `0.f` or `0.L`
                    goto Ldone; // if ".identifier" or ".unicode"
                }
                goto Lreal; // '.' is part of current token
            case 'i':
            case 'f':
            case 'F':
                goto Lreal;
            case '_':
                if (Ccompile)
                    error("embedded `_` not allowed");
                ++p;
                base = 8;
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                break;
            default:
                break;
            }
        }
        // Main digit loop
        while (1)
        {
            c = *p;
            switch (c)
            {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                ++p;
                d = c - '0';
                break;
            case 'a':
            case 'b':
            case 'c':
            case 'd':
            case 'e':
            case 'f':
            case 'A':
            case 'B':
            case 'C':
            case 'D':
            case 'E':
            case 'F':
                ++p;
                if (base != 16)
                {
                    // outside of hex, `e`/`E`/`f`/`F` mark a floating point literal
                    if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
                        goto Lreal;
                }
                if (c >= 'a')
                    d = c + 10 - 'a';
                else
                    d = c + 10 - 'A';
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                goto Ldone;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (base <= 10 && n > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
                {
                    if (Ccompile && base == 10 &&
                        (p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                        goto Lreal;  // if `1.e6` or `1.f` or `1.L`
                    goto Ldone; // if ".identifier" or ".unicode"
                }
                if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
                    goto Ldone; // if ".identifier" or ".unicode"
                if (base == 2)
                    goto Ldone; // if ".identifier" or ".unicode"
                goto Lreal; // otherwise as part of a floating point literal

            case 'i':
                if (Ccompile)
                    goto Ldone;
                goto Lreal;

            case 'p':
            case 'P':
            Lreal:
                // Floating point literal: rewind and let `inreal` lex it from the start
                p = start;
                return inreal(t);
            case '_':
                // `_` separators are skipped, except when lexing C where they end the number
                if (Ccompile)
                    goto default;
                ++p;
                continue;
            default:
                goto Ldone;
            }
            // got a digit here, set any necessary flags, check for errors
            anyHexDigitsNoSingleUS = true;
            anyBinaryDigitsNoSingleUS = true;
            if (!errorDigit && d >= base)
            {
                errorDigit = cast(char) c;
            }
            // Avoid expensive overflow check if we aren't at risk of overflow
            if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
                n = n * base + d;
            else
            {
                import core.checkedint : mulu, addu;

                n = mulu(n, base, overflow);
                n = addu(n, d, overflow);
            }
        }
    Ldone:
        // Report problems found while scanning the digits
        if (errorDigit)
        {
            error(token.loc, "%s digit expected, not `%c`", base == 2 ? "binary".ptr :
                                                 base == 8 ? "octal".ptr :
                                                 "decimal".ptr, errorDigit);
            err = true;
        }
        if (overflow && !err)
        {
            error("integer overflow");
            err = true;
        }
        // `0b` or `0x` without any digit
        if ((base == 2 && !anyBinaryDigitsNoSingleUS) ||
            (base == 16 && !anyHexDigitsNoSingleUS))
            error(token.loc, "`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);

        t.unsvalue = n;

        // C integer suffixes follow different rules and are handled separately
        if (Ccompile)
            return cnumber(base, n);

        enum FLAGS : int
        {
            none = 0,
            decimal = 1, // decimal
            unsigned = 2, // u or U suffix
            long_ = 4, // L suffix
        }

        FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
        // Parse trailing 'u', 'U', 'l' or 'L' in any combination
        const psuffix = p;  // start of the suffix, used in the octal diagnostic below
        while (1)
        {
            FLAGS f;
            switch (*p)
            {
            case 'U':
            case 'u':
                f = FLAGS.unsigned;
                goto L1;
            case 'l':
                // accepted as a long suffix for recovery, but reported as an error
                f = FLAGS.long_;
                error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
                goto L1;
            case 'L':
                f = FLAGS.long_;
            L1:
                p++;
                if ((flags & f) && !err)
                {
                    error("repeated integer suffix `%c`", p[-1]);
                    err = true;
                }
                flags = cast(FLAGS)(flags | f);
                continue;
            default:
                break;
            }
            break;
        }
        // Octal literals other than 0 .. 7 are rejected
        if (base == 8 && n >= 8)
        {
            if (err)
                // can't translate invalid octal value, just show a generic message
                error("octal literals larger than 7 are no longer supported");
            else
                error(token.loc, "octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
                    n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
        }
        // Pick the token type from the suffix flags and the magnitude of `n`.
        // All eight combinations of the three flag bits are covered below.
        TOK result;
        switch (flags)
        {
        case FLAGS.none:
            /* Octal or Hexadecimal constant.
             * First that fits: int, uint, long, ulong
             */
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else if (n & 0xFFFFFFFF00000000L)
                result = TOK.int64Literal;
            else if (n & 0x80000000)
                result = TOK.uns32Literal;
            else
                result = TOK.int32Literal;
            break;
        case FLAGS.decimal:
            /* First that fits: int, long, long long
             */
            if (n & 0x8000000000000000L)
            {
                result = TOK.uns64Literal;
            }
            else if (n & 0xFFFFFFFF80000000L)
                result = TOK.int64Literal;
            else
                result = TOK.int32Literal;
            break;
        case FLAGS.unsigned:
        case FLAGS.decimal | FLAGS.unsigned:
            /* First that fits: uint, ulong
             */
            if (n & 0xFFFFFFFF00000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.uns32Literal;
            break;
        case FLAGS.decimal | FLAGS.long_:
            // a decimal literal with `L` suffix must fit in a signed 64 bit value
            if (n & 0x8000000000000000L)
            {
                if (!err)
                {
                    error("signed integer overflow");
                    err = true;
                }
                result = TOK.uns64Literal;
            }
            else
                result = TOK.int64Literal;
            break;
        case FLAGS.long_:
            // non-decimal literal with `L` suffix: long if it fits, ulong otherwise
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.int64Literal;
            break;
        case FLAGS.unsigned | FLAGS.long_:
        case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
            result = TOK.uns64Literal;
            break;
        default:
            debug
            {
                printf("%x\n", flags);
            }
            assert(0);
        }
        return result;
    }
2386 
2387     /**************************************
2388      * Lex C integer-suffix
2389      * Params:
2390      *  base = number base
2391      *  n = raw integer value
2392      * Returns:
2393      *  token value
2394      */
2395     private TOK cnumber(int base, ulong n)
2396     {
2397         /* C11 6.4.4.1
2398          * Parse trailing suffixes:
2399          *   u or U
2400          *   l or L
2401          *   ll or LL
2402          */
2403         enum FLAGS : uint
2404         {
2405             octalhex = 1, // octal or hexadecimal
2406             decimal  = 2, // decimal
2407             unsigned = 4, // u or U suffix
2408             long_    = 8, // l or L suffix
2409             llong    = 0x10, // ll or LL
2410 
2411             // Microsoft extensions
2412             i8       = 0x20,
2413             i16      = 0x40,
2414             i32      = 0x80,
2415             i64      = 0x100,
2416         }
2417         FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex;
2418         bool err;
2419     Lsuffixes:
2420         while (1)
2421         {
2422             FLAGS f;
2423             const cs = *p;
2424             switch (cs)
2425             {
2426                 case 'U':
2427                 case 'u':
2428                     f = FLAGS.unsigned;
2429                     break;
2430 
2431                 case 'l':
2432                 case 'L':
2433                     f = FLAGS.long_;
2434                     if (cs == p[1])
2435                     {
2436                         f = FLAGS.long_ | FLAGS.llong;
2437                         ++p;
2438                     }
2439                     break;
2440 
2441                 case 'i':
2442                 case 'I':
2443                     if (p[1] == '8')
2444                     {
2445                         f = FLAGS.i8;
2446                         ++p;
2447                     }
2448                     else if (p[1] == '1' && p[2] == '6')
2449                     {
2450                         f = FLAGS.i16;
2451                         p += 2;
2452                     }
2453                     else if (p[1] == '3' && p[2] == '2')
2454                     {
2455                         f = FLAGS.i32;
2456                         p += 2;
2457                     }
2458                     else if (p[1] == '6' && p[2] == '4')
2459                     {
2460                         f = FLAGS.i64;
2461                         p += 2;
2462                     }
2463                     else
2464                         break Lsuffixes;
2465                     if (p[1] >= '0' && p[1] <= '9' && !err)
2466                     {
2467                         error("invalid integer suffix");
2468                         err = true;
2469                     }
2470                     break;
2471 
2472                 default:
2473                     break Lsuffixes;
2474             }
2475             ++p;
2476             if ((flags & f) && !err)
2477             {
2478                 error("duplicate integer suffixes");
2479                 err = true;
2480             }
2481             flags = cast(FLAGS)(flags | f);
2482         }
2483 
2484         TOK result = TOK.int32Literal;     // default
2485         switch (flags)
2486         {
2487             /* Since D doesn't have a variable sized `long` or `unsigned long` type,
2488              * this code deviates from C by picking D int, uint, long, or ulong instead
2489              */
2490 
2491             case FLAGS.octalhex:
2492                 /* Octal or Hexadecimal constant.
2493                  * First that fits: int, unsigned, long, unsigned long,
2494                  * long long, unsigned long long
2495                  */
2496                 if (n & 0x8000000000000000L)
2497                     result = TOK.uns64Literal;      // unsigned long
2498                 else if (n & 0xFFFFFFFF00000000L)
2499                     result = TOK.int64Literal;      // long
2500                 else if (n & 0x80000000)
2501                     result = TOK.uns32Literal;
2502                 else
2503                     result = TOK.int32Literal;
2504                 break;
2505 
2506             case FLAGS.decimal:
2507                 /* First that fits: int, long, long long
2508                  */
2509                 if (n & 0x8000000000000000L)
2510                     result = TOK.uns64Literal;      // unsigned long
2511                 else if (n & 0xFFFFFFFF80000000L)
2512                     result = TOK.int64Literal;      // long
2513                 else
2514                     result = TOK.int32Literal;
2515                 break;
2516 
2517             case FLAGS.octalhex | FLAGS.unsigned:
2518             case FLAGS.decimal | FLAGS.unsigned:
2519                 /* First that fits: unsigned, unsigned long, unsigned long long
2520                  */
2521                 if (n & 0xFFFFFFFF00000000L)
2522                     result = TOK.uns64Literal;      // unsigned long
2523                 else
2524                     result = TOK.uns32Literal;
2525                 break;
2526 
2527             case FLAGS.decimal | FLAGS.long_:
2528                 /* First that fits: long, long long
2529                  */
2530                 if (longsize == 4 || long_longsize == 4)
2531                 {
2532                     if (n & 0xFFFFFFFF_80000000L)
2533                         result = TOK.int64Literal;
2534                     else
2535                         result = TOK.int32Literal;  // long
2536                 }
2537                 else
2538                 {
2539                     result = TOK.int64Literal;      // long
2540                 }
2541                 break;
2542 
2543             case FLAGS.octalhex | FLAGS.long_:
2544                 /* First that fits: long, unsigned long, long long,
2545                  * unsigned long long
2546                  */
2547                 if (longsize == 4 || long_longsize == 4)
2548                 {
2549                     if (n & 0x8000000000000000L)
2550                         result = TOK.uns64Literal;
2551                     else if (n & 0xFFFFFFFF00000000L)
2552                         result = TOK.int64Literal;
2553                     else if (n & 0x80000000)
2554                         result = TOK.uns32Literal;      // unsigned long
2555                     else
2556                         result = TOK.int32Literal;      // long
2557                 }
2558                 else
2559                 {
2560                     if (n & 0x80000000_00000000L)
2561                         result = TOK.uns64Literal;      // unsigned long
2562                     else
2563                         result = TOK.int64Literal;      // long
2564                 }
2565                 break;
2566 
2567             case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_:
2568             case FLAGS.decimal  | FLAGS.unsigned | FLAGS.long_:
2569                 /* First that fits: unsigned long, unsigned long long
2570                  */
2571                 if (longsize == 4 || long_longsize == 4)
2572                 {
2573                     if (n & 0xFFFFFFFF00000000L)
2574                         result = TOK.uns64Literal;
2575                     else
2576                         result = TOK.uns32Literal;      // unsigned long
2577                 }
2578                 else
2579                 {
2580                     result = TOK.uns64Literal;  // unsigned long
2581                 }
2582                 break;
2583 
2584             case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong:
2585                 /* First that fits: long long, unsigned long long
2586                  */
2587                 if (n & 0x8000000000000000L)
2588                     result = TOK.uns64Literal;
2589                 else
2590                     result = TOK.int64Literal;
2591                 break;
2592 
2593             case FLAGS.decimal | FLAGS.long_ | FLAGS.llong:
2594                 /* long long
2595                  */
2596                 result = TOK.int64Literal;
2597                 break;
2598 
2599             case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
2600             case FLAGS.decimal  | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
2601                 result = TOK.uns64Literal;
2602                 break;
2603 
2604             case FLAGS.octalhex | FLAGS.i8:
2605             case FLAGS.octalhex | FLAGS.i16:
2606             case FLAGS.octalhex | FLAGS.i32:
2607             case FLAGS.octalhex | FLAGS.unsigned | FLAGS.i8:
2608             case FLAGS.octalhex | FLAGS.unsigned | FLAGS.i16:
2609             case FLAGS.octalhex | FLAGS.unsigned | FLAGS.i32:
2610             case FLAGS.decimal  | FLAGS.unsigned | FLAGS.i8:
2611             case FLAGS.decimal  | FLAGS.unsigned | FLAGS.i16:
2612             case FLAGS.decimal  | FLAGS.unsigned | FLAGS.i32:
2613                 result = TOK.uns32Literal;
2614                 break;
2615 
2616             case FLAGS.decimal | FLAGS.i8:
2617             case FLAGS.decimal | FLAGS.i16:
2618             case FLAGS.decimal | FLAGS.i32:
2619                 result = TOK.int32Literal;
2620                 break;
2621 
2622             case FLAGS.octalhex | FLAGS.i64:
2623             case FLAGS.octalhex | FLAGS.unsigned | FLAGS.i64:
2624             case FLAGS.decimal  | FLAGS.unsigned | FLAGS.i64:
2625                 result = TOK.uns64Literal;
2626                 break;
2627 
2628             case FLAGS.decimal | FLAGS.i64:
2629                 result = TOK.int64Literal;
2630                 break;
2631 
2632             default:
2633                 debug printf("%x\n",flags);
2634                 assert(0);
2635         }
2636         return result;
2637     }
2638 
2639     /**************************************
2640      * Read in characters, converting them to real.
2641      * Bugs:
2642      *      Exponent overflow not detected.
2643      *      Too much requested precision is not detected.
2644      */
2645     private TOK inreal(Token* t)
2646     {
2647         //printf("Lexer::inreal()\n");
2648         debug
2649         {
2650             assert(*p == '.' || isdigit(*p));
2651         }
2652         bool isWellformedString = true;
2653         stringbuffer.setsize(0);
2654         auto pstart = p;
2655         bool hex = false;
2656         dchar c = *p++;
2657         // Leading '0x'
2658         if (c == '0')
2659         {
2660             c = *p++;
2661             if (c == 'x' || c == 'X')
2662             {
2663                 hex = true;
2664                 c = *p++;
2665             }
2666         }
2667         // Digits to left of '.'
2668         while (1)
2669         {
2670             if (c == '.')
2671             {
2672                 c = *p++;
2673                 break;
2674             }
2675             if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2676             {
2677                 c = *p++;
2678                 continue;
2679             }
2680             break;
2681         }
2682         // Digits to right of '.'
2683         while (1)
2684         {
2685             if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2686             {
2687                 c = *p++;
2688                 continue;
2689             }
2690             break;
2691         }
2692         if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
2693         {
2694             c = *p++;
2695             if (c == '-' || c == '+')
2696             {
2697                 c = *p++;
2698             }
2699             bool anyexp = false;
2700             while (1)
2701             {
2702                 if (isdigit(c))
2703                 {
2704                     anyexp = true;
2705                     c = *p++;
2706                     continue;
2707                 }
2708                 if (c == '_')
2709                 {
2710                     if (Ccompile)
2711                         error("embedded `_` in numeric literals not allowed");
2712                     c = *p++;
2713                     continue;
2714                 }
2715                 if (!anyexp)
2716                 {
2717                     error("missing exponent");
2718                     isWellformedString = false;
2719                 }
2720                 break;
2721             }
2722         }
2723         else if (hex)
2724         {
2725             error("exponent required for hex float");
2726             isWellformedString = false;
2727         }
2728         --p;
2729         while (pstart < p)
2730         {
2731             if (*pstart != '_')
2732                 stringbuffer.writeByte(*pstart);
2733             ++pstart;
2734         }
2735         stringbuffer.writeByte(0);
2736         auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
2737         TOK result;
2738         bool isOutOfRange = false;
2739         t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, isOutOfRange) : CTFloat.zero);
2740 
2741         bool imaginary = false;
2742         if (*p == 'i' && Ccompile)
2743         {
2744             ++p;
2745             imaginary = true;
2746         }
2747 
2748         switch (*p)
2749         {
2750         case 'F':
2751         case 'f':
2752             if (isWellformedString && !isOutOfRange)
2753                 isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
2754             result = TOK.float32Literal;
2755             p++;
2756             break;
2757         default:
2758             if (isWellformedString && !isOutOfRange)
2759                 isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
2760             result = TOK.float64Literal;
2761             break;
2762         case 'l':
2763             if (!Ccompile)
2764                 error("use 'L' suffix instead of 'l'");
2765             goto case 'L';
2766         case 'L':
2767             ++p;
2768             if (Ccompile && long_doublesize == 8)
2769                 goto default;
2770             result = TOK.float80Literal;
2771             break;
2772         }
2773 
2774         if ((*p == 'i' || *p == 'I') && !Ccompile)
2775         {
2776             if (*p == 'I')
2777                 error("use 'i' suffix instead of 'I'");
2778             p++;
2779             imaginary = true;
2780         }
2781 
2782         if (imaginary)
2783         {
2784             switch (result)
2785             {
2786             case TOK.float32Literal:
2787                 result = TOK.imaginary32Literal;
2788                 break;
2789             case TOK.float64Literal:
2790                 result = TOK.imaginary64Literal;
2791                 break;
2792             case TOK.float80Literal:
2793                 result = TOK.imaginary80Literal;
2794                 break;
2795             default:
2796                 break;
2797             }
2798         }
2799         const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
2800         if (isOutOfRange && !isLong && (!Ccompile || hex))
2801         {
2802             /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex
2803              */
2804             const char* suffix = result == TOK.float32Literal ? "f" : result == TOK.float80Literal ? "L" : "";
2805             const char* type = [TOK.float32Literal: "`float`".ptr,
2806                                 TOK.float64Literal: "`double`".ptr,
2807                                 TOK.float80Literal: "`real` for the current target".ptr][result];
2808             error(scanloc, "number `%s%s` is not representable as a %s", sbufptr, suffix, type);
2809             const char* extra = result == TOK.float64Literal ? "`real` literals can be written using the `L` suffix. " : "";
2810             eSink.errorSupplemental(scanloc, "%shttps://dlang.org/spec/lex.html#floatliteral", extra);
2811         }
2812         debug
2813         {
2814             switch (result)
2815             {
2816             case TOK.float32Literal:
2817             case TOK.float64Literal:
2818             case TOK.float80Literal:
2819             case TOK.imaginary32Literal:
2820             case TOK.imaginary64Literal:
2821             case TOK.imaginary80Literal:
2822                 break;
2823             default:
2824                 assert(0);
2825             }
2826         }
2827         return result;
2828     }
2829 
2830     final Loc loc() @nogc
2831     {
2832         scanloc.charnum = cast(ushort)(1 + p - line);
2833         version (LocOffset)
2834             scanloc.fileOffset = cast(uint)(p - base);
2835         return scanloc;
2836     }
2837 
2838     void error(T...)(const(char)* format, T args)
2839     {
2840         eSink.error(token.loc, format, args);
2841     }
2842 
2843     void error(T...)(const ref Loc loc, const(char)* format, T args)
2844     {
2845         eSink.error(loc, format, args);
2846     }
2847 
2848     void deprecation(T...)(const ref Loc loc, const(char)* format, T args)
2849     {
2850         eSink.deprecation(loc, format, args);
2851     }
2852 
2853     void deprecation(T...)(const(char)* format, T args)
2854     {
2855         eSink.deprecation(token.loc, format, args);
2856     }
2857 
2858     void deprecationSupplemental(T...)(const(char)* format, T args)
2859     {
2860         eSink.deprecationSupplemental(token.loc, format, args);
2861     }
2862 
2863     /***************************************
2864      * Parse special token sequence:
2865      * Returns:
2866      *  true if the special token sequence was handled
2867      * References:
2868      *  https://dlang.org/spec/lex.html#special-token-sequence
2869      */
2870     bool parseSpecialTokenSequence()
2871     {
2872         Token n;
2873         scan(&n);
2874         if (n.value == TOK.identifier)
2875         {
2876             if (n.ident == Id.line)
2877             {
2878                 poundLine(n, false);
2879                 return true;
2880             }
2881             else
2882             {
2883                 const locx = loc();
2884                 // @@@DEPRECATED_2.103@@@
2885                 // Turn into an error in 2.113
2886                 if (inTokenStringConstant)
2887                     deprecation(locx, "token string requires valid D tokens, not `#%s`", n.ident.toChars());
2888                 else
2889                     error(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars());
2890             }
2891         }
2892         else if (n.value == TOK.if_)
2893         {
2894             const locx = loc();
2895             if (inTokenStringConstant)
2896                 error(locx, "token string requires valid D tokens, not `#if`");
2897             else
2898                 error(locx, "C preprocessor directive `#if` is not supported, use `version` or `static if`");
2899         }
2900         return false;
2901     }
2902 
2903     /*********************************************
2904      * Parse line/file preprocessor directive:
2905      *    #line linnum [filespec]
2906      * Allow __LINE__ for linnum, and __FILE__ for filespec.
2907      * Accept linemarker format:
2908      *    # linnum [filespec] {flags}
2909      * There can be zero or more flags, which are one of the digits 1..4, and
2910      * must be in ascending order. The flags are ignored.
2911      * Params:
2912      *  tok = token we're on, which is linnum of linemarker
2913      *  linemarker = true if line marker format and lexer is on linnum
2914      * References:
2915      *  linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
2916      */
2917     final void poundLine(ref Token tok, bool linemarker)
2918     {
2919         auto linnum = this.scanloc.linnum;
2920         const(char)* filespec = null;
2921         bool flags;
2922 
2923         if (!linemarker)
2924             scan(&tok);
2925         if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal)
2926         {
2927             const lin = cast(int)(tok.unsvalue);
2928             if (lin != tok.unsvalue)
2929             {
2930                 error(tok.loc, "line number `%lld` out of range", cast(ulong)tok.unsvalue);
2931                 skipToNextLine();
2932                 return;
2933             }
2934             else
2935                 linnum = lin;
2936         }
2937         else if (tok.value == TOK.line)  // #line __LINE__
2938         {
2939         }
2940         else
2941         {
2942             error(tok.loc, "positive integer argument expected following `#line`");
2943             if (tok.value != TOK.endOfLine)
2944                 skipToNextLine();
2945             return;
2946         }
2947         while (1)
2948         {
2949             scan(&tok);
2950             switch (tok.value)
2951             {
2952             case TOK.endOfFile:
2953             case TOK.endOfLine:
2954                 if (!inTokenStringConstant)
2955                 {
2956                     this.scanloc.linnum = linnum;
2957                     if (filespec)
2958                         this.scanloc.filename = filespec;
2959                 }
2960                 return;
2961             case TOK.file:
2962                 if (filespec || flags)
2963                     goto Lerr;
2964                 filespec = mem.xstrdup(scanloc.filename);
2965                 continue;
2966             case TOK.string_:
2967                 if (filespec || flags)
2968                     goto Lerr;
2969                 if (tok.ptr[0] != '"' || tok.postfix != 0)
2970                     goto Lerr;
2971                 filespec = tok.ustring;
2972                 continue;
2973             case TOK.int32Literal:
2974                 if (!filespec)
2975                     goto Lerr;
2976                 if (linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4)
2977                 {
2978                     flags = true;   // linemarker flags seen
2979                     continue;
2980                 }
2981                 goto Lerr;
2982             default:
2983                 goto Lerr;
2984             }
2985         }
2986     Lerr:
2987         if (filespec is null)
2988             error(tok.loc, "invalid filename for `#line` directive");
2989         else if (linemarker)
2990             error(tok.loc, "invalid flag for line marker directive");
2991         else if (!Ccompile)
2992             error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars());
2993         if (tok.value != TOK.endOfLine)
2994             skipToNextLine();
2995     }
2996 
2997     /***************************************
2998      * Scan forward to start of next line.
2999      * Params:
3000      *    defines = send characters to `defines`
3001      */
3002     final void skipToNextLine(OutBuffer* defines = null)
3003     {
3004         while (1)
3005         {
3006             switch (*p)
3007             {
3008             case 0:
3009             case 0x1A:
3010                 return; // do not advance p
3011 
3012             case '\n':
3013                 ++p;
3014                 break;
3015 
3016             case '\r':
3017                 ++p;
3018                 if (p[0] == '\n')
3019                    ++p;
3020                 break;
3021 
3022             default:
3023                 if (defines)
3024                     defines.writeByte(*p); // don't care about Unicode line endings for C
3025                 else if (*p & 0x80)
3026                 {
3027                     const u = decodeUTF();
3028                     if (u == PS || u == LS)
3029                     {
3030                         ++p;
3031                         break;
3032                     }
3033                 }
3034                 ++p;
3035                 continue;
3036             }
3037             break;
3038         }
3039         endOfLine();
3040         tokenizeNewlines = false;
3041     }
3042 
3043     /********************************************
3044      * Decode UTF character.
3045      * Issue error messages for invalid sequences.
3046      * Return decoded character, advance p to last character in UTF sequence.
3047      */
3048     private uint decodeUTF()
3049     {
3050         string msg;
3051         auto result = decodeUTFpure(msg);
3052 
3053         if (msg)
3054             error(token.loc, "%.*s", cast(int)msg.length, msg.ptr);
3055         return result;
3056     }
3057 
3058     /********************************************
3059      * Same as above, but the potential error message is stored to the
3060      * msg parameter instead of being issued.
3061      */
3062     private pure uint decodeUTFpure(out string msg)
3063     {
3064         const s = p;
3065         assert(*s & 0x80);
3066         // Check length of remaining string up to 4 UTF-8 characters
3067         size_t len;
3068         for (len = 1; len < 4 && s[len]; len++)
3069         {
3070         }
3071         size_t idx = 0;
3072         dchar u;
3073         msg = utf_decodeChar(s[0 .. len], idx, u);
3074         p += idx - 1;
3075         if (!msg && isBidiControl(u))
3076             msg = "Bidirectional control characters are disallowed for security reasons.";
3077         return u;
3078     }
3079 
    /***************************************************
     * Parse doc comment embedded between t.ptr and p.
     * Remove trailing blanks and tabs from lines.
     * Replace all newlines with \n.
     * Remove leading comment character from each line.
     * Decide if it's a lineComment or a blockComment.
     * Append to previous one for this token.
     *
     * If newParagraph is true, an extra newline will be
     * added between adjoining doc comments.
     *
     * Params:
     *  t = token whose `ptr` is the start of the comment and whose
     *      `lineComment` or `blockComment` field receives the text
     *  lineComment = if non-zero (and `anyToken` is set) the text is stored
     *      in `t.lineComment`, otherwise in `t.blockComment`
     *  newParagraph = passed on to `combineComments` when the token already
     *      has a comment of the selected kind
     */
    private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
    {
        /* ct tells us which kind of comment it is: '/', '*', or '+'
         */
        const ct = t.ptr[2];
        /* Start of comment text skips over / * *, / + +, or / / /
         */
        const(char)* q = t.ptr + 3; // start of comment text
        const(char)* qend = p;
        // Block comments end in a two character terminator which is not comment text
        if (ct == '*' || ct == '+')
            qend -= 2;
        /* Scan over initial row of ****'s or ++++'s or ////'s
         */
        for (; q < qend; q++)
        {
            if (*q != ct)
                break;
        }
        /* Remove leading spaces until start of the comment
         */
        // linestart is 1 while only whitespace has been seen since the last newline
        int linestart = 0;
        if (ct == '/')
        {
            while (q < qend && (*q == ' ' || *q == '\t'))
                ++q;
        }
        else if (q < qend)
        {
            // For block comments, drop a newline directly following the opening row
            if (*q == '\r')
            {
                ++q;
                if (q < qend && *q == '\n')
                    ++q;
                linestart = 1;
            }
            else if (*q == '\n')
            {
                ++q;
                linestart = 1;
            }
        }
        /* Remove trailing row of ****'s or ++++'s
         */
        if (ct != '/')
        {
            for (; q < qend; qend--)
            {
                if (qend[-1] != ct)
                    break;
            }
        }
        /* Comment is now [q .. qend].
         * Canonicalize it into buf[].
         */
        OutBuffer buf;

        // Drop blanks and tabs from the end of what has been written to buf so far
        void trimTrailingWhitespace()
        {
            const s = buf[];
            auto len = s.length;
            while (len && (s[len - 1] == ' ' || s[len - 1] == '\t'))
                --len;
            buf.setsize(len);
        }

        for (; q < qend; q++)
        {
            char c = *q;
            switch (c)
            {
            case '*':
            case '+':
                // The first comment character on a line is the leading decoration: skip it
                if (linestart && c == ct)
                {
                    linestart = 0;
                    /* Trim preceding whitespace up to preceding \n
                     */
                    trimTrailingWhitespace();
                    continue;
                }
                break;
            case ' ':
            case '\t':
                // Whitespace does not end the "start of line" state
                break;
            case '\r':
                if (q[1] == '\n')
                    continue; // skip the \r
                goto Lnewline;
            default:
                // 226 128 168 and 226 128 169 are the UTF-8 encodings of U+2028 (LS) and U+2029 (PS)
                if (c == 226)
                {
                    // If LS or PS
                    if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
                    {
                        q += 2;
                        goto Lnewline;
                    }
                }
                linestart = 0;
                break;
            Lnewline:
                c = '\n'; // replace all newlines with \n
                goto case;
            case '\n':
                linestart = 1;
                /* Trim trailing whitespace
                 */
                trimTrailingWhitespace();
                break;
            }
            buf.writeByte(c);
        }
        /* Trim trailing whitespace (if the last line does not have newline)
         */
        trimTrailingWhitespace();

        // Always end with a newline
        const s = buf[];
        if (s.length == 0 || s[$ - 1] != '\n')
            buf.writeByte('\n');

        // It's a line comment if the start of the doc comment comes
        // after other non-whitespace on the same line.
        auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
        // Combine with previous doc comment, if any
        if (*dc)
        {
            // Note: this local `p` is the combined, NUL-terminated text, not the scan pointer
            auto p = combineComments(*dc, buf[], newParagraph);
            *dc = p ? p[0 .. strlen(p)] : null;
        }
        else
            *dc = buf.extractSlice(true);
    }
3224 
3225     /********************************************
3226      * Combine two document comments into one,
3227      * separated by an extra newline if newParagraph is true.
3228      */
3229     static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
3230     {
3231         //debug printf("Lexer::combineComments('%*.s', '%*.s', '%i')\n", cast(int) c1.length, c1.ptr, cast(int) c2.length, c2.ptr, newParagraph);
3232         const(int) newParagraphSize = newParagraph ? 1 : 0; // Size of the combining '\n'
3233         if (!c1)
3234             return c2.ptr;
3235         if (!c2)
3236             return c1.ptr;
3237 
3238         int insertNewLine = 0;
3239         if (c1.length && c1[$ - 1] != '\n')
3240             insertNewLine = 1;
3241         const retSize = c1.length + insertNewLine + newParagraphSize + c2.length;
3242         auto p = cast(char*)mem.xmalloc_noscan(retSize + 1);
3243         p[0 .. c1.length] = c1[];
3244         if (insertNewLine)
3245             p[c1.length] = '\n';
3246         if (newParagraph)
3247             p[c1.length + insertNewLine] = '\n';
3248         p[retSize - c2.length .. retSize] = c2[];
3249         p[retSize] = 0;
3250         return p;
3251     }
3252 
3253     /**************************
3254      * `p` should be at start of next line
3255      */
3256     private void endOfLine() @nogc @safe
3257     {
3258         scanloc.linnum = scanloc.linnum + 1;
3259         line = p;
3260     }
3261 }
3262 
3263 
3264 /******************************* Private *****************************************/
3265 
3266 private:
3267 
/// Code points of the Unicode line terminators
private enum
{
    LS = 0x2028,       // Unicode line separator
    PS = 0x2029,       // Unicode paragraph separator
}
3270 
/********************************************
 * Do our own char maps
 *
 * One byte of `CM*` flags per character value, computed at compile time.
 */
private static immutable cmtable = ()
{
    ubyte[256] table;
    foreach (const c; 0 .. table.length)
    {
        // Digit and identifier classes
        if (c >= '0' && c <= '7')
            table[c] |= CMoctal;
        if (c_isxdigit(c))
            table[c] |= CMhex;
        if (c == '_' || c_isalnum(c))
            table[c] |= CMidchar;

        // Characters flagged CMzerosecond and/or CMdigitsecond
        switch (c)
        {
            case 'x', 'X', 'b', 'B':
                table[c] |= CMzerosecond;
                break;

            case '0': .. case '9':
            case 'e', 'E', 'f', 'F', 'l', 'L', 'p', 'P', 'u', 'U', 'i', '.', '_':
                table[c] |= CMzerosecond | CMdigitsecond;
                break;

            default:
                break;
        }

        // CMsinglechar: every ASCII character apart from the ones listed here
        const bool excluded = c == '\\' || c == '\n' || c == '\r' ||
                              c == 0 || c == 0x1A || c == '\'';
        if (!excluded && !(c & 0x80))
            table[c] |= CMsinglechar;
    }
    return table;
}();
3326 
/// Bit flags stored in `cmtable`, one bit per character class
private enum
{
    CMoctal       = 1 << 0,
    CMhex         = 1 << 1,
    CMidchar      = 1 << 2,
    CMzerosecond  = 1 << 3,
    CMdigitsecond = 1 << 4,
    CMsinglechar  = 1 << 5,
}
3336 
/// Returns: true if `c` is flagged `CMoctal` in `cmtable`
private bool isoctal(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMoctal) != 0;
}
3341 
/// Returns: true if `c` is flagged `CMhex` in `cmtable`
private bool ishex(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMhex) != 0;
}
3346 
/// Returns: true if `c` is flagged `CMidchar` in `cmtable`
private bool isidchar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMidchar) != 0;
}
3351 
/// Returns: true if `c` is flagged `CMzerosecond` in `cmtable`
private bool isZeroSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMzerosecond) != 0;
}
3356 
/// Returns: true if `c` is flagged `CMdigitsecond` in `cmtable`
private bool isDigitSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMdigitsecond) != 0;
}
3361 
/// Returns: true if `c` is flagged `CMsinglechar` in `cmtable`
private bool issinglechar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMsinglechar) != 0;
}
3366 
/// Returns: true if `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`)
private bool c_isxdigit(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            return true;

        default:
            return false;
    }
}
3373 
/// Returns: true if `c` is an ASCII letter or decimal digit
private bool c_isalnum(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'z':
        case 'A': .. case 'Z':
            return true;

        default:
            return false;
    }
}
3380 
3381 /******************************* Unittest *****************************************/
3382 
// Escape sequences that are valid: check the decoded value and that the
// whole sequence was consumed.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    ErrorSink errorSink = new ErrorSinkStderr;

    // `sequence` is the text following the backslash
    void test(T)(string sequence, T expected, bool Ccompile = false)
    {
        auto cursor = cast(const(char)*)sequence.ptr;
        const finish = sequence.ptr + sequence.length;
        dchar c2;
        Lexer lexer = new Lexer(errorSink);
        const actual = lexer.escapeSequence(Loc.initial, cursor, Ccompile, c2);
        assert(expected == actual);
        assert(cursor == finish);
    }

    // Simple escapes
    test(`'`, '\'');
    test(`"`, '"');
    test(`?`, '?');
    test(`\`, '\\');
    test(`0`, '\0');
    test(`a`, '\a');
    test(`b`, '\b');
    test(`f`, '\f');
    test(`n`, '\n');
    test(`r`, '\r');
    test(`t`, '\t');
    test(`v`, '\v');

    // Two digit hex
    test(`x00`, 0x00);
    test(`xff`, 0xff);
    test(`xFF`, 0xff);
    test(`xa7`, 0xa7);
    test(`x3c`, 0x3c);
    test(`xe2`, 0xe2);

    // Octal
    test(`1`, '\1');
    test(`42`, '\42');
    test(`357`, '\357');

    // Four and eight digit hex
    test(`u1234`, '\u1234');
    test(`uf0e4`, '\uf0e4');

    test(`U0001f603`, '\U0001f603');

    // Named character entities
    test(`&quot;`, '"');
    test(`&lt;`, '<');
    test(`&gt;`, '>');
}
3431 
// Escape sequences that are invalid: check the error message, the value
// returned for error recovery, and how many characters were consumed.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    // Error sink that compares each reported error with the expected text
    static class ErrorSinkTest : ErrorSinkNull
    {
      nothrow:
      extern (C++):
      override:

        import core.stdc.stdio;
        import core.stdc.stdarg;

        string expected;
        bool gotError;

        void error(const ref Loc loc, const(char)* format, ...)
        {
            gotError = true;
            char[100] buffer = void;
            va_list ap;
            va_start(ap, format);
            const written = vsnprintf(buffer.ptr, buffer.length, format, ap);
            va_end(ap);
            /* vsnprintf returns a negative value on failure, and the length the
             * complete message would have had when the buffer is too small.
             * Neither is usable as a slice bound.
             */
            assert(written >= 0 && written < buffer.length, "error message does not fit into buffer");
            auto actual = buffer[0 .. written];
            assert(expected == actual);
        }
    }

    ErrorSinkTest errorSink = new ErrorSinkTest;

    // `sequence` is the text following the backslash
    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
    {
        errorSink.expected = expectedError;
        errorSink.gotError = false;
        auto p = cast(const(char)*)sequence.ptr;
        Lexer lexer = new Lexer(errorSink);
        dchar c2;
        auto actualReturnValue = lexer.escapeSequence(Loc.initial, p, Ccompile, c2);
        assert(errorSink.gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
    }

    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);
    test("&quot;", `undefined escape sequence \&`, '&', 1, true);

    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);
}
3509 
// Lex a single keyword, then check that the lexer keeps returning
// TOK.endOfFile once the input is exhausted.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);
    /* Not much here, just trying things out.
     */
    string text = "int"; // We rely on the implicit null-terminator
    ErrorSink errorSink = new ErrorSinkStderr;
    scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, false, false, errorSink, null);

    const first = lex1.nextToken();
    //printf("tok == %s, %d, %d\n", Token::toChars(first), first, TOK.int32);
    assert(first == TOK.int32);

    foreach (_; 0 .. 3)
    {
        const next = lex1.nextToken();
        assert(next == TOK.endOfFile);
    }
}
3529 
// Malformed input must still end in TOK.endOfFile, and stay there.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    // We don't want to see Lexer error output during these tests.
    ErrorSink errorSink = new ErrorSinkNull;

    // Test malformed input: even malformed input should end in a TOK.endOfFile.
    static immutable char[][] testcases =
    [   // Testcase must end with 0 or 0x1A.
        [0], // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        [0xFF, 0xFF, 0],
        ['x', '"', 0x1A],
    ];

    foreach (testcase; testcases)
    {
        scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, false, false, errorSink, null);

        // Never ask for more tokens than the test case has characters
        TOK tok = lex2.nextToken();
        for (size_t calls = 1; tok != TOK.endOfFile && calls < testcase.length; ++calls)
            tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);

        // Asking again must still give endOfFile
        tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);
    }
}