1 /**
2  * Implements the lexical analyzer, which converts source code into lexical tokens.
3  *
4  * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
5  *
6  * Copyright:   Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved
7  * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
8  * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
10  * Documentation:  https://dlang.org/phobos/dmd_lexer.html
11  * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
12  */
13 
14 module dmd.lexer;
15 
16 import core.stdc.ctype;
17 import core.stdc.stdio;
18 import core.stdc.string;
19 
20 import dmd.entity;
21 import dmd.errorsink;
22 import dmd.id;
23 import dmd.identifier;
24 import dmd.location;
25 import dmd.root.array;
26 import dmd.root.ctfloat;
27 import dmd.common.outbuffer;
28 import dmd.root.port;
29 import dmd.root.rmem;
30 import dmd.root.utf;
31 import dmd.tokens;
32 
33 nothrow:
34 
35 version (DMDLIB)
36 {
37     version = LocOffset;
38 }
39 
/***********************************************************
 * Per-compilation settings handed to the `Lexer`: the values substituted
 * for the magic identifiers (`__VERSION__`, `__DATE__`, ...) followed by a
 * few feature switches.
 */
struct CompileEnv
{
    // Replacement values for the special `__XXX__` identifiers.
    uint versionNumber;      /// __VERSION__
    const(char)[] date;      /// __DATE__
    const(char)[] time;      /// __TIME__
    const(char)[] vendor;    /// __VENDOR__
    const(char)[] timestamp; /// __TIMESTAMP__

    // Feature switches.
    bool previewIn;          /// `in` means `[ref] scope const`, accepts rvalues
    bool ddocOutput;         /// collect embedded documentation comments
    bool shortenedMethods = true;   /// allow => in normal function declarations
}
55 
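/***********************************************************
 * The lexical analyzer: reads source text and hands it out one `Token`
 * at a time through `nextToken()` (or through the range interface
 * `empty` / `front` / `popFront`). When `Ccompile` is set it lexes
 * ImportC source instead of D.
 */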
58 class Lexer
59 {
    /// Declared `__gshared`, so this is one buffer for the whole process
    /// rather than one per Lexer instance.
    private __gshared OutBuffer stringbuffer;

    Loc scanloc;            // for error messages
    Loc prevloc;            // location of token before current

    const(char)* p;         // current character

    /// The current token. `token.next`, when non-null, is the first
    /// already-scanned lookahead token (consumed by `nextToken`).
    Token token;

    // For ImportC
    bool Ccompile;              /// true if compiling ImportC

    // The following are valid only if (Ccompile == true)
    ubyte boolsize;             /// size of a C _Bool, default 1
    ubyte shortsize;            /// size of a C short, default 2
    ubyte intsize;              /// size of a C int, default 4
    ubyte longsize;             /// size of C long, 4 or 8
    ubyte long_longsize;        /// size of a C long long, default 8
    ubyte long_doublesize;      /// size of C long double, 8 or D real.sizeof
    ubyte wchar_tsize;          /// size of C wchar_t, 2 or 4

    ErrorSink eSink;            /// send error messages through this interface
    CompileEnv compileEnv;      /// environment

    private
    {
        const(char)* base;      // pointer to start of buffer
        const(char)* end;       // pointer to last element of buffer (the 0 / 0x1A terminator)
        const(char)* line;      // start of current line

        bool doDocComment;      // collect doc comment information
        bool anyToken;          // seen at least one token
        bool commentToken;      // comments are TOK.comment's
        bool tokenizeNewlines;  // newlines are turned into TOK.endOfLine's
                                // (cleared again after each TOK.endOfLine is produced)

        bool whitespaceToken;   // tokenize whitespaces (only for DMDLIB)

        int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
        int lastDocLine;        // last line of previous doc comment

        Token* tokenFreelist;   // singly linked (via Token.next) list of tokens available for reuse
    }
102 
103   nothrow:
104 
105     /*********************
106      * Creates a Lexer for the source code base[begoffset..endoffset+1].
107      * The last character, base[endoffset], must be null (0) or EOF (0x1A).
108      *
109      * Params:
110      *  filename = used for error messages
111      *  base = source code, must be terminated by a null (0) or EOF (0x1A) character
112      *  begoffset = starting offset into base[]
113      *  endoffset = the last offset to read into base[]
114      *  doDocComment = handle documentation comments
115      *  commentToken = comments become TOK.comment's
116      *  errorSink = where error messages go, must not be null
117      *  compileEnv = version, vendor, date, time, etc.
118      */
119     this(const(char)* filename, const(char)* base, size_t begoffset,
120         size_t endoffset, bool doDocComment, bool commentToken,
121         ErrorSink errorSink,
122         const CompileEnv* compileEnv) pure scope
123     {
124         scanloc = Loc(filename, 1, 1);
125         // debug printf("Lexer::Lexer(%p)\n", base);
126         // debug printf("lexer.filename = %s\n", filename);
127         token = Token.init;
128         this.base = base;
129         this.end = base + endoffset;
130         p = base + begoffset;
131         line = p;
132         this.doDocComment = doDocComment;
133         this.commentToken = commentToken;
134         this.tokenizeNewlines = false;
135         this.inTokenStringConstant = 0;
136         this.lastDocLine = 0;
137         this.eSink = errorSink;
138         assert(errorSink);
139         if (compileEnv)
140             this.compileEnv = *compileEnv;
141         else
142         {
143             this.compileEnv.versionNumber = 1;
144             this.compileEnv.vendor = "DLF";
145         }
146         //initKeywords();
147         /* If first line starts with '#!', ignore the line
148          */
149         if (p && p[0] == '#' && p[1] == '!')
150         {
151             p += 2;
152             for (;;p++)
153             {
154                 char c = *p;
155                 switch (c)
156                 {
157                 case '\n':
158                     p++;
159                     goto case;
160                 case 0:
161                 case 0x1A:
162                     break;
163 
164                 default:
165                     // Note: We do allow malformed UTF-8 on shebang line.
166                     // It could have a meaning if the native system
167                     // encoding is not Unicode. See test compilable/test13512.d
168                     // for example encoded in KOI-8.
169                     // We also allow bidirectional control characters.
170                     // We do not execute the shebang line, so it can't be used
171                     // to conceal code. It is up to the shell to sanitize it.
172                     continue;
173                 }
174                 break;
175             }
176             endOfLine();
177         }
178     }
179 
180     /***********************
181      * Alternative entry point for DMDLIB, adds `whitespaceToken`
182      */
183     this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset,
184         bool doDocComment, bool commentToken, bool whitespaceToken,
185         ErrorSink errorSink, const CompileEnv* compileEnv = null
186         )
187     {
188         this(filename, base, begoffset, endoffset, doDocComment, commentToken, errorSink, compileEnv);
189         this.whitespaceToken = whitespaceToken;
190     }
191 
192     /******************
193      * Used for unittests for a mock Lexer
194      */
195     this(ErrorSink errorSink) scope { assert(errorSink); this.eSink = errorSink; }
196 
197     /**************************************
198      * Reset lexer to lex #define's
199      */
200     final void resetDefineLines(const(char)[] slice)
201     {
202         base = slice.ptr;
203         end = base + slice.length;
204         assert(*end == 0);
205         p = base;
206         line = p;
207         tokenizeNewlines = true;
208         inTokenStringConstant = 0;
209         lastDocLine = 0;
210         scanloc = Loc("#defines", 1, 1);
211     }
212 
    /**********************************
     * Set up for next #define line.
     * p should be at start of next line.
     *
     * `scan` switches `tokenizeNewlines` off each time it produces a
     * `TOK.endOfLine`, so it has to be switched back on before the
     * following line is lexed.
     */
    final void nextDefineLine()
    {
        tokenizeNewlines = true;
    }
221 
222     /***************
223      * Range interface
224      */
225 
226     final bool empty() const pure @property @nogc @safe
227     {
228         return front() == TOK.endOfFile;
229     }
230 
    /// Range primitive.
    /// Returns: the kind (`TOK`) of the current token.
    final TOK front() const pure @property @nogc @safe
    {
        return token.value;
    }
235 
    /// Range primitive: advances to the next token by calling `nextToken`.
    final void popFront()
    {
        nextToken();
    }
240 
241     /// Returns: a newly allocated `Token`.
242     Token* allocateToken() pure nothrow @safe
243     {
244         if (tokenFreelist)
245         {
246             Token* t = tokenFreelist;
247             tokenFreelist = t.next;
248             t.next = null;
249             return t;
250         }
251         return new Token();
252     }
253 
    /// Frees the given token by returning it to the freelist.
    /// Params:
    ///  token = token to recycle; it becomes the new head of `tokenFreelist`
    private void releaseToken(Token* token) pure nothrow @nogc @safe
    {
        // Wipe the token only when the GC is in use.
        // NOTE(review): presumably so that pointers left in a recycled token
        // do not keep otherwise unreachable memory alive - confirm.
        if (mem.isGCEnabled)
            *token = Token.init;
        // Push onto the front of the freelist.
        token.next = tokenFreelist;
        tokenFreelist = token;
    }
262 
263     final TOK nextToken()
264     {
265         prevloc = token.loc;
266         if (token.next)
267         {
268             Token* t = token.next;
269             memcpy(&token, t, Token.sizeof);
270             releaseToken(t);
271         }
272         else
273         {
274             scan(&token);
275         }
276         //printf(token.toChars());
277         return token.value;
278     }
279 
280     /***********************
281      * Look ahead at next token's value.
282      */
283     final TOK peekNext()
284     {
285         return peek(&token).value;
286     }
287 
288     /***********************
289      * Look 2 tokens ahead at value.
290      */
291     final TOK peekNext2()
292     {
293         Token* t = peek(&token);
294         return peek(t).value;
295     }
296 
297     /****************************
298      * Turn next token in buffer into a token.
299      * Params:
300      *  t = the token to set the resulting Token to
301      */
302     final void scan(Token* t)
303     {
304         const lastLine = scanloc.linnum;
305         Loc startLoc;
306         t.blockComment = null;
307         t.lineComment = null;
308 
309         while (1)
310         {
311             t.ptr = p;
312             //printf("p = %p, *p = '%c'\n",p,*p);
313             t.loc = loc();
314             switch (*p)
315             {
316             case 0:
317             case 0x1A:
318                 t.value = TOK.endOfFile; // end of file
319                 // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
320                 return;
321             case ' ':
322                 // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
323                 while ((cast(size_t)p) % uint.sizeof)
324                 {
325                     if (*p != ' ')
326                         goto LendSkipFourSpaces;
327                     p++;
328                 }
329                 while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20
330                     p += 4;
331                 // Skip over any remaining space on the line.
332                 while (*p == ' ')
333                     p++;
334             LendSkipFourSpaces:
335                 version (DMDLIB)
336                 {
337                     if (whitespaceToken)
338                     {
339                         t.value = TOK.whitespace;
340                         return;
341                     }
342                 }
343                 continue; // skip white space
344             case '\t':
345             case '\v':
346             case '\f':
347                 p++;
348                 version (DMDLIB)
349                 {
350                     if (whitespaceToken)
351                     {
352                         t.value = TOK.whitespace;
353                         return;
354                     }
355                 }
356                 continue; // skip white space
357             case '\r':
358                 p++;
359                 if (*p != '\n') // if CR stands by itself
360                 {
361                     endOfLine();
362                     if (tokenizeNewlines)
363                     {
364                         t.value = TOK.endOfLine;
365                         tokenizeNewlines = false;
366                         return;
367                     }
368                 }
369                 version (DMDLIB)
370                 {
371                     if (whitespaceToken)
372                     {
373                         t.value = TOK.whitespace;
374                         return;
375                     }
376                 }
377                 continue; // skip white space
378             case '\n':
379                 p++;
380                 endOfLine();
381                 if (tokenizeNewlines)
382                 {
383                     t.value = TOK.endOfLine;
384                     tokenizeNewlines = false;
385                     return;
386                 }
387                 version (DMDLIB)
388                 {
389                     if (whitespaceToken)
390                     {
391                         t.value = TOK.whitespace;
392                         return;
393                     }
394                 }
395                 continue; // skip white space
396             case '0':
397                 if (!isZeroSecond(p[1]))        // if numeric literal does not continue
398                 {
399                     ++p;
400                     t.unsvalue = 0;
401                     t.value = TOK.int32Literal;
402                     return;
403                 }
404                 goto Lnumber;
405 
406             case '1': .. case '9':
407                 if (!isDigitSecond(p[1]))       // if numeric literal does not continue
408                 {
409                     t.unsvalue = *p - '0';
410                     ++p;
411                     t.value = TOK.int32Literal;
412                     return;
413                 }
414             Lnumber:
415                 t.value = number(t);
416                 return;
417 
418             case '\'':
419                 if (issinglechar(p[1]) && p[2] == '\'')
420                 {
421                     t.unsvalue = p[1];        // simple one character literal
422                     t.value = TOK.charLiteral;
423                     p += 3;
424                 }
425                 else if (Ccompile)
426                 {
427                     clexerCharConstant(*t, 0);
428                 }
429                 else
430                 {
431                     t.value = charConstant(t);
432                 }
433                 return;
434 
435             case 'u':
436             case 'U':
437             case 'L':
438                 if (!Ccompile)
439                     goto case_ident;
440                 if (p[1] == '\'')       // C wide character constant
441                 {
442                     char c = *p;
443                     if (c == 'L')       // convert L to u or U
444                         c = (wchar_tsize == 4) ? 'u' : 'U';
445                     ++p;
446                     clexerCharConstant(*t, c);
447                     return;
448                 }
449                 else if (p[1] == '\"')  // C wide string literal
450                 {
451                     const c = *p;
452                     ++p;
453                     escapeStringConstant(t);
454                     t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
455                                 c == 'u' ? 'w' :
456                                 'd';
457                     return;
458                 }
459                 else if (p[1] == '8' && p[2] == '\"') // C UTF-8 string literal
460                 {
461                     p += 2;
462                     escapeStringConstant(t);
463                     return;
464                 }
465                 goto case_ident;
466 
467             case 'r':
468                 if (Ccompile || p[1] != '"')
469                     goto case_ident;
470                 p++;
471                 goto case '`';
472             case '`':
473                 if (Ccompile)
474                     goto default;
475                 wysiwygStringConstant(t);
476                 return;
477             case 'q':
478                 if (Ccompile)
479                     goto case_ident;
480                 if (p[1] == '"')
481                 {
482                     p++;
483                     delimitedStringConstant(t);
484                     return;
485                 }
486                 else if (p[1] == '{')
487                 {
488                     p++;
489                     tokenStringConstant(t);
490                     return;
491                 }
492                 else
493                     goto case_ident;
494             case '"':
495                 escapeStringConstant(t);
496                 return;
497             case 'a':
498             case 'b':
499             case 'c':
500             case 'd':
501             case 'e':
502             case 'f':
503             case 'g':
504             case 'h':
505             case 'i':
506             case 'j':
507             case 'k':
508             case 'l':
509             case 'm':
510             case 'n':
511             case 'o':
512             case 'p':
513                 /*case 'q': case 'r':*/
514             case 's':
515             case 't':
516             //case 'u':
517             case 'v':
518             case 'w':
519             case 'x':
520             case 'y':
521             case 'z':
522             case 'A':
523             case 'B':
524             case 'C':
525             case 'D':
526             case 'E':
527             case 'F':
528             case 'G':
529             case 'H':
530             case 'I':
531             case 'J':
532             case 'K':
533             //case 'L':
534             case 'M':
535             case 'N':
536             case 'O':
537             case 'P':
538             case 'Q':
539             case 'R':
540             case 'S':
541             case 'T':
542             //case 'U':
543             case 'V':
544             case 'W':
545             case 'X':
546             case 'Y':
547             case 'Z':
548             case '_':
549             case_ident:
550                 {
551                     while (1)
552                     {
553                         const c = *++p;
554                         if (isidchar(c))
555                             continue;
556                         else if (c & 0x80)
557                         {
558                             const s = p;
559                             const u = decodeUTF();
560                             if (isUniAlpha(u))
561                                 continue;
562                             error(t.loc, "char 0x%04x not allowed in identifier", u);
563                             p = s;
564                         }
565                         break;
566                     }
567                     Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr));
568                     t.ident = id;
569                     t.value = cast(TOK)id.getValue();
570 
571                     anyToken = 1;
572 
573                     /* Different keywords for C and D
574                      */
575                     if (Ccompile)
576                     {
577                         if (t.value != TOK.identifier)
578                         {
579                             t.value = Ckeywords[t.value];  // filter out D keywords
580                         }
581                     }
582                     else if (t.value >= FirstCKeyword)
583                         t.value = TOK.identifier;       // filter out C keywords
584 
585                     else if (*t.ptr == '_') // if special identifier token
586                     {
587                         void toToken(const(char)[] s)
588                         {
589                             t.value = TOK.string_;
590                             t.ustring = s.ptr;
591                             t.len = cast(uint)s.length;
592                             t.postfix = 0;
593                         }
594 
595                         if (id == Id.DATE)
596                             toToken(compileEnv.date);
597                         else if (id == Id.TIME)
598                             toToken(compileEnv.time);
599                         else if (id == Id.VENDOR)
600                             toToken(compileEnv.vendor);
601                         else if (id == Id.TIMESTAMP)
602                             toToken(compileEnv.timestamp);
603                         else if (id == Id.VERSIONX)
604                         {
605                             t.value = TOK.int64Literal;
606                             t.unsvalue = compileEnv.versionNumber;
607                         }
608                         else if (id == Id.EOFX)
609                         {
610                             t.value = TOK.endOfFile;
611                             // Advance scanner to end of file
612                             while (!(*p == 0 || *p == 0x1A))
613                                 p++;
614                         }
615                     }
616                     //printf("t.value = %d\n",t.value);
617                     return;
618                 }
619             case '/':
620                 p++;
621                 switch (*p)
622                 {
623                 case '=':
624                     p++;
625                     t.value = TOK.divAssign;
626                     return;
627                 case '*':
628                     p++;
629                     startLoc = loc();
630                     while (1)
631                     {
632                         while (1)
633                         {
634                             const c = *p;
635                             switch (c)
636                             {
637                             case '/':
638                                 break;
639                             case '\n':
640                                 endOfLine();
641                                 p++;
642                                 continue;
643                             case '\r':
644                                 p++;
645                                 if (*p != '\n')
646                                     endOfLine();
647                                 continue;
648                             case 0:
649                             case 0x1A:
650                                 error(t.loc, "unterminated /* */ comment");
651                                 p = end;
652                                 t.loc = loc();
653                                 t.value = TOK.endOfFile;
654                                 return;
655                             default:
656                                 if (c & 0x80)
657                                 {
658                                     const u = decodeUTF();
659                                     if (u == PS || u == LS)
660                                         endOfLine();
661                                 }
662                                 p++;
663                                 continue;
664                             }
665                             break;
666                         }
667                         p++;
668                         if (p[-2] == '*' && p - 3 != t.ptr)
669                             break;
670                     }
671                     if (commentToken)
672                     {
673                         t.loc = startLoc;
674                         t.value = TOK.comment;
675                         return;
676                     }
677                     else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
678                     {
679                         // if /** but not /**/
680                         getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
681                         lastDocLine = scanloc.linnum;
682                     }
683                     continue;
684                 case '/': // do // style comments
685                     startLoc = loc();
686                     while (1)
687                     {
688                         const c = *++p;
689                         switch (c)
690                         {
691                         case '\n':
692                             break;
693                         case '\r':
694                             if (p[1] == '\n')
695                                 p++;
696                             break;
697                         case 0:
698                         case 0x1A:
699                             if (commentToken)
700                             {
701                                 p = end;
702                                 t.loc = startLoc;
703                                 t.value = TOK.comment;
704                                 return;
705                             }
706                             if (doDocComment && t.ptr[2] == '/')
707                             {
708                                 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
709                                 lastDocLine = scanloc.linnum;
710                             }
711                             p = end;
712                             t.loc = loc();
713                             t.value = TOK.endOfFile;
714                             return;
715                         default:
716                             if (c & 0x80)
717                             {
718                                 const u = decodeUTF();
719                                 if (u == PS || u == LS)
720                                     break;
721                             }
722                             continue;
723                         }
724                         break;
725                     }
726                     if (commentToken)
727                     {
728                         version (DMDLIB) {}
729                         else
730                         {
731                             p++;
732                             endOfLine();
733                         }
734                         t.loc = startLoc;
735                         t.value = TOK.comment;
736                         return;
737                     }
738                     if (doDocComment && t.ptr[2] == '/')
739                     {
740                         getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
741                         lastDocLine = scanloc.linnum;
742                     }
743                     p++;
744                     endOfLine();
745                     continue;
746                 case '+':
747                     if (!Ccompile)
748                     {
749                         int nest;
750                         startLoc = loc();
751                         p++;
752                         nest = 1;
753                         while (1)
754                         {
755                             char c = *p;
756                             switch (c)
757                             {
758                             case '/':
759                                 p++;
760                                 if (*p == '+')
761                                 {
762                                     p++;
763                                     nest++;
764                                 }
765                                 continue;
766                             case '+':
767                                 p++;
768                                 if (*p == '/')
769                                 {
770                                     p++;
771                                     if (--nest == 0)
772                                         break;
773                                 }
774                                 continue;
775                             case '\r':
776                                 p++;
777                                 if (*p != '\n')
778                                     endOfLine();
779                                 continue;
780                             case '\n':
781                                 endOfLine();
782                                 p++;
783                                 continue;
784                             case 0:
785                             case 0x1A:
786                                 error(t.loc, "unterminated /+ +/ comment");
787                                 p = end;
788                                 t.loc = loc();
789                                 t.value = TOK.endOfFile;
790                                 return;
791                             default:
792                                 if (c & 0x80)
793                                 {
794                                     uint u = decodeUTF();
795                                     if (u == PS || u == LS)
796                                         endOfLine();
797                                 }
798                                 p++;
799                                 continue;
800                             }
801                             break;
802                         }
803                         if (commentToken)
804                         {
805                             t.loc = startLoc;
806                             t.value = TOK.comment;
807                             return;
808                         }
809                         if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
810                         {
811                             // if /++ but not /++/
812                             getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
813                             lastDocLine = scanloc.linnum;
814                         }
815                         continue;
816                     }
817                     break;
818                 default:
819                     break;
820                 }
821                 t.value = TOK.div;
822                 return;
823             case '.':
824                 p++;
825                 if (isdigit(*p))
826                 {
827                     /* Note that we don't allow ._1 and ._ as being
828                      * valid floating point numbers.
829                      */
830                     p--;
831                     t.value = inreal(t);
832                 }
833                 else if (p[0] == '.')
834                 {
835                     if (p[1] == '.')
836                     {
837                         p += 2;
838                         t.value = TOK.dotDotDot;
839                     }
840                     else
841                     {
842                         p++;
843                         t.value = TOK.slice;
844                     }
845                 }
846                 else
847                     t.value = TOK.dot;
848                 return;
849             case '&':
850                 p++;
851                 if (*p == '=')
852                 {
853                     p++;
854                     t.value = TOK.andAssign;
855                 }
856                 else if (*p == '&')
857                 {
858                     p++;
859                     t.value = TOK.andAnd;
860                 }
861                 else
862                     t.value = TOK.and;
863                 return;
864             case '|':
865                 p++;
866                 if (*p == '=')
867                 {
868                     p++;
869                     t.value = TOK.orAssign;
870                 }
871                 else if (*p == '|')
872                 {
873                     p++;
874                     t.value = TOK.orOr;
875                 }
876                 else
877                     t.value = TOK.or;
878                 return;
879             case '-':
880                 p++;
881                 if (*p == '=')
882                 {
883                     p++;
884                     t.value = TOK.minAssign;
885                 }
886                 else if (*p == '-')
887                 {
888                     p++;
889                     t.value = TOK.minusMinus;
890                 }
891                 else if (*p == '>')
892                 {
893                     ++p;
894                     t.value = TOK.arrow;
895                 }
896                 else
897                     t.value = TOK.min;
898                 return;
899             case '+':
900                 p++;
901                 if (*p == '=')
902                 {
903                     p++;
904                     t.value = TOK.addAssign;
905                 }
906                 else if (*p == '+')
907                 {
908                     p++;
909                     t.value = TOK.plusPlus;
910                 }
911                 else
912                     t.value = TOK.add;
913                 return;
914             case '<':
915                 p++;
916                 if (*p == '=')
917                 {
918                     p++;
919                     t.value = TOK.lessOrEqual; // <=
920                 }
921                 else if (*p == '<')
922                 {
923                     p++;
924                     if (*p == '=')
925                     {
926                         p++;
927                         t.value = TOK.leftShiftAssign; // <<=
928                     }
929                     else
930                         t.value = TOK.leftShift; // <<
931                 }
932                 else if (*p == ':' && Ccompile)
933                 {
934                     ++p;
935                     t.value = TOK.leftBracket;  // <:
936                 }
937                 else if (*p == '%' && Ccompile)
938                 {
939                     ++p;
940                     t.value = TOK.leftCurly;    // <%
941                 }
942                 else
943                     t.value = TOK.lessThan; // <
944                 return;
945             case '>':
946                 p++;
947                 if (*p == '=')
948                 {
949                     p++;
950                     t.value = TOK.greaterOrEqual; // >=
951                 }
952                 else if (*p == '>')
953                 {
954                     p++;
955                     if (*p == '=')
956                     {
957                         p++;
958                         t.value = TOK.rightShiftAssign; // >>=
959                     }
960                     else if (*p == '>')
961                     {
962                         p++;
963                         if (*p == '=')
964                         {
965                             p++;
966                             t.value = TOK.unsignedRightShiftAssign; // >>>=
967                         }
968                         else
969                             t.value = TOK.unsignedRightShift; // >>>
970                     }
971                     else
972                         t.value = TOK.rightShift; // >>
973                 }
974                 else
975                     t.value = TOK.greaterThan; // >
976                 return;
977             case '!':
978                 p++;
979                 if (*p == '=')
980                 {
981                     p++;
982                     t.value = TOK.notEqual; // !=
983                 }
984                 else
985                     t.value = TOK.not; // !
986                 return;
987             case '=':
988                 p++;
989                 if (*p == '=')
990                 {
991                     p++;
992                     t.value = TOK.equal; // ==
993                 }
994                 else if (*p == '>')
995                 {
996                     p++;
997                     t.value = TOK.goesTo; // =>
998                 }
999                 else
1000                     t.value = TOK.assign; // =
1001                 return;
1002             case '~':
1003                 p++;
1004                 if (*p == '=')
1005                 {
1006                     p++;
1007                     t.value = TOK.concatenateAssign; // ~=
1008                 }
1009                 else
1010                     t.value = TOK.tilde; // ~
1011                 return;
1012             case '^':
1013                 p++;
1014                 if (*p == '^')
1015                 {
1016                     p++;
1017                     if (*p == '=')
1018                     {
1019                         p++;
1020                         t.value = TOK.powAssign; // ^^=
1021                     }
1022                     else
1023                         t.value = TOK.pow; // ^^
1024                 }
1025                 else if (*p == '=')
1026                 {
1027                     p++;
1028                     t.value = TOK.xorAssign; // ^=
1029                 }
1030                 else
1031                     t.value = TOK.xor; // ^
1032                 return;
1033             case '(':
1034                 p++;
1035                 t.value = TOK.leftParenthesis;
1036                 return;
1037             case ')':
1038                 p++;
1039                 t.value = TOK.rightParenthesis;
1040                 return;
1041             case '[':
1042                 p++;
1043                 t.value = TOK.leftBracket;
1044                 return;
1045             case ']':
1046                 p++;
1047                 t.value = TOK.rightBracket;
1048                 return;
1049             case '{':
1050                 p++;
1051                 t.value = TOK.leftCurly;
1052                 return;
1053             case '}':
1054                 p++;
1055                 t.value = TOK.rightCurly;
1056                 return;
1057             case '?':
1058                 p++;
1059                 t.value = TOK.question;
1060                 return;
1061             case ',':
1062                 p++;
1063                 t.value = TOK.comma;
1064                 return;
1065             case ';':
1066                 p++;
1067                 t.value = TOK.semicolon;
1068                 return;
1069             case ':':
1070                 p++;
1071                 if (*p == ':')
1072                 {
1073                     ++p;
1074                     t.value = TOK.colonColon;
1075                 }
1076                 else if (*p == '>' && Ccompile)
1077                 {
1078                     ++p;
1079                     t.value = TOK.rightBracket;
1080                 }
1081                 else
1082                     t.value = TOK.colon;
1083                 return;
1084             case '$':
1085                 p++;
1086                 t.value = TOK.dollar;
1087                 return;
1088             case '@':
1089                 p++;
1090                 t.value = TOK.at;
1091                 return;
1092             case '*':
1093                 p++;
1094                 if (*p == '=')
1095                 {
1096                     p++;
1097                     t.value = TOK.mulAssign;
1098                 }
1099                 else
1100                     t.value = TOK.mul;
1101                 return;
1102             case '%':
1103                 p++;
1104                 if (*p == '=')
1105                 {
1106                     p++;
1107                     t.value = TOK.modAssign;
1108                 }
1109                 else if (*p == '>' && Ccompile)
1110                 {
1111                     ++p;
1112                     t.value = TOK.rightCurly;
1113                 }
1114                 else if (*p == ':' && Ccompile)
1115                 {
1116                     goto case '#';      // %: means #
1117                 }
1118                 else
1119                     t.value = TOK.mod;
1120                 return;
1121             case '#':
1122                 {
1123                     // https://issues.dlang.org/show_bug.cgi?id=22825
1124                     // Special token sequences are terminated by newlines,
1125                     // and should not be skipped over.
1126                     this.tokenizeNewlines = true;
1127                     p++;
1128                     if (parseSpecialTokenSequence())
1129                         continue;
1130                     t.value = TOK.pound;
1131                     return;
1132                 }
1133             default:
1134                 {
1135                     dchar c = *p;
1136                     if (c & 0x80)
1137                     {
1138                         c = decodeUTF();
1139                         // Check for start of unicode identifier
1140                         if (isUniAlpha(c))
1141                             goto case_ident;
1142                         if (c == PS || c == LS)
1143                         {
1144                             endOfLine();
1145                             p++;
1146                             if (tokenizeNewlines)
1147                             {
1148                                 t.value = TOK.endOfLine;
1149                                 tokenizeNewlines = false;
1150                                 return;
1151                             }
1152                             continue;
1153                         }
1154                     }
1155                     if (c < 0x80 && isprint(c))
1156                         error(t.loc, "character '%c' is not a valid token", c);
1157                     else
1158                         error(t.loc, "character 0x%02x is not a valid token", c);
1159                     p++;
1160                     continue;
1161                     // assert(0);
1162                 }
1163             }
1164         }
1165     }
1166 
1167     final Token* peek(Token* ct)
1168     {
1169         Token* t;
1170         if (ct.next)
1171             t = ct.next;
1172         else
1173         {
1174             t = allocateToken();
1175             scan(t);
1176             ct.next = t;
1177         }
1178         return t;
1179     }
1180 
1181     /*********************************
1182      * tk is on the opening (.
1183      * Look ahead and return token that is past the closing ).
1184      */
1185     final Token* peekPastParen(Token* tk)
1186     {
1187         //printf("peekPastParen()\n");
1188         int parens = 1;
1189         int curlynest = 0;
1190         while (1)
1191         {
1192             tk = peek(tk);
1193             //tk.print();
1194             switch (tk.value)
1195             {
1196             case TOK.leftParenthesis:
1197                 parens++;
1198                 continue;
1199             case TOK.rightParenthesis:
1200                 --parens;
1201                 if (parens)
1202                     continue;
1203                 tk = peek(tk);
1204                 break;
1205             case TOK.leftCurly:
1206                 curlynest++;
1207                 continue;
1208             case TOK.rightCurly:
1209                 if (--curlynest >= 0)
1210                     continue;
1211                 break;
1212             case TOK.semicolon:
1213                 if (curlynest)
1214                     continue;
1215                 break;
1216             case TOK.endOfFile:
1217                 break;
1218             default:
1219                 continue;
1220             }
1221             return tk;
1222         }
1223     }
1224 
1225     /*******************************************
1226      * Parse escape sequence.
1227      */
1228     private uint escapeSequence(out dchar c2)
1229     {
1230         return Lexer.escapeSequence(token.loc, p, Ccompile, c2);
1231     }
1232 
1233     /********
1234      * Parse the given string literal escape sequence into a single character.
1235      * D https://dlang.org/spec/lex.html#escape_sequences
1236      * C11 6.4.4.4
1237      * Params:
1238      *  loc = location to use for error messages
1239      *  sequence = pointer to string with escape sequence to parse. Updated to
1240      *             point past the end of the escape sequence
1241      *  Ccompile = true for compile C11 escape sequences
1242      *  c2 = returns second `dchar` of html entity with 2 code units, otherwise stays `dchar.init`
1243      * Returns:
1244      *  the escape sequence as a single character
1245      */
1246     private dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile, out dchar c2)
1247     {
1248         const(char)* p = sequence; // cache sequence reference on stack
1249         scope(exit) sequence = p;
1250 
1251         uint c = *p;
1252         int ndigits;
1253         switch (c)
1254         {
1255         case '\'':
1256         case '"':
1257         case '?':
1258         case '\\':
1259         Lconsume:
1260             p++;
1261             break;
1262         case 'a':
1263             c = 7;
1264             goto Lconsume;
1265         case 'b':
1266             c = 8;
1267             goto Lconsume;
1268         case 'f':
1269             c = 12;
1270             goto Lconsume;
1271         case 'n':
1272             c = 10;
1273             goto Lconsume;
1274         case 'r':
1275             c = 13;
1276             goto Lconsume;
1277         case 't':
1278             c = 9;
1279             goto Lconsume;
1280         case 'v':
1281             c = 11;
1282             goto Lconsume;
1283         case 'u':
1284             ndigits = 4;
1285             goto Lhex;
1286         case 'U':
1287             ndigits = 8;
1288             goto Lhex;
1289         case 'x':
1290             ndigits = 2;
1291         Lhex:
1292             p++;
1293             c = *p;
1294             if (ishex(cast(char)c))
1295             {
1296                 uint v = 0;
1297                 int n = 0;
1298                 if (Ccompile && ndigits == 2)
1299                 {
1300                     /* C11 6.4.4.4-7 one to infinity hex digits
1301                      */
1302                     do
1303                     {
1304                         if (isdigit(cast(char)c))
1305                             c -= '0';
1306                         else if (islower(c))
1307                             c -= 'a' - 10;
1308                         else
1309                             c -= 'A' - 10;
1310                         v = v * 16 + c;
1311                         c = *++p;
1312                     } while (ishex(cast(char)c));
1313                 }
1314                 else
1315                 {
1316                     while (1)
1317                     {
1318                         if (isdigit(cast(char)c))
1319                             c -= '0';
1320                         else if (islower(c))
1321                             c -= 'a' - 10;
1322                         else
1323                             c -= 'A' - 10;
1324                         v = v * 16 + c;
1325                         c = *++p;
1326                         if (++n == ndigits)
1327                             break;
1328                         if (!ishex(cast(char)c))
1329                         {
1330                             error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
1331                             break;
1332                         }
1333                     }
1334                     if (ndigits != 2 && !utf_isValidDchar(v))
1335                     {
1336                         error(loc, "invalid UTF character \\U%08x", v);
1337                         v = '?'; // recover with valid UTF character
1338                     }
1339                 }
1340                 c = v;
1341             }
1342             else
1343             {
1344                 error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
1345                 p++;
1346             }
1347             break;
1348         case '&':
1349             if (Ccompile)
1350                 goto default;
1351 
1352             // named character entity
1353             for (const idstart = ++p; 1; p++)
1354             {
1355                 switch (*p)
1356                 {
1357                 case ';':
1358                     auto entity = HtmlNamedEntity(idstart[0 .. p - idstart]);
1359                     c = entity[0];
1360                     if (entity == entity.init)
1361                     {
1362                         error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
1363                         c = '?';
1364                     }
1365                     if (entity[1] != entity.init[1])
1366                         c2 = entity[1];
1367 
1368                     p++;
1369                     break;
1370                 default:
1371                     if (isalpha(*p) || (p != idstart && isdigit(*p)))
1372                         continue;
1373                     error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
1374                     c = '?';
1375                     break;
1376                 }
1377                 break;
1378             }
1379             break;
1380         case 0:
1381         case 0x1A:
1382             // end of file
1383             c = '\\';
1384             break;
1385         default:
1386             if (isoctal(cast(char)c))
1387             {
1388                 uint v = 0;
1389                 int n = 0;
1390                 do
1391                 {
1392                     v = v * 8 + (c - '0');
1393                     c = *++p;
1394                 }
1395                 while (++n < 3 && isoctal(cast(char)c));
1396                 c = v;
1397                 if (c > 0xFF)
1398                     error(loc, "escape octal sequence \\%03o is larger than \\377", c);
1399             }
1400             else
1401             {
1402                 error(loc, "undefined escape sequence \\%c", c);
1403                 p++;
1404             }
1405             break;
1406         }
1407         return c;
1408     }
1409 
1410     /**
1411     Lex a wysiwyg string. `p` must be pointing to the first character before the
1412     contents of the string literal. The character pointed to by `p` will be used as
1413     the terminating character (i.e. backtick or double-quote).
1414     Params:
1415         result = pointer to the token that accepts the result
1416     */
1417     private void wysiwygStringConstant(Token* result)
1418     {
1419         result.value = TOK.string_;
1420         Loc start = loc();
1421         auto terminator = p[0];
1422         p++;
1423         stringbuffer.setsize(0);
1424         while (1)
1425         {
1426             dchar c = p[0];
1427             p++;
1428             switch (c)
1429             {
1430             case '\n':
1431                 endOfLine();
1432                 break;
1433             case '\r':
1434                 if (p[0] == '\n')
1435                     continue; // ignore
1436                 c = '\n'; // treat EndOfLine as \n character
1437                 endOfLine();
1438                 break;
1439             case 0:
1440             case 0x1A:
1441                 error("unterminated string constant starting at %s", start.toChars());
1442                 result.setString();
1443                 // rewind `p` so it points to the EOF character
1444                 p--;
1445                 return;
1446             default:
1447                 if (c == terminator)
1448                 {
1449                     result.setString(stringbuffer);
1450                     stringPostfix(result);
1451                     return;
1452                 }
1453                 else if (c & 0x80)
1454                 {
1455                     p--;
1456                     const u = decodeUTF();
1457                     p++;
1458                     if (u == PS || u == LS)
1459                         endOfLine();
1460                     stringbuffer.writeUTF8(u);
1461                     continue;
1462                 }
1463                 break;
1464             }
1465             stringbuffer.writeByte(c);
1466         }
1467     }
1468 
    /**
    Lex a delimited string. Some examples of delimited strings are:
    ---
    q"(foo(xxx))"      // "foo(xxx)"
    q"[foo$(LPAREN)]"  // "foo$(LPAREN)"
    q"/foo]/"          // "foo]"
    q"HERE
    foo
    HERE"              // "foo\n"
    ---
    It is assumed that `p` points to the opening double-quote '"'.

    The first character after the quote selects the delimiter:
    one of `(`, `{`, `[`, `<` is closed by its counterpart and may nest;
    the start of an identifier introduces a heredoc that is closed by the same
    identifier at the start of a line; any other character closes itself.
    Reaching the end-of-file character (0 or 0x1A) reports an error and leaves
    `p` on that character.
    Params:
        result = pointer to the token that accepts the result
    */
    private void delimitedStringConstant(Token* result)
    {
        result.value = TOK.string_;
        Loc start = loc();
        dchar delimleft = 0;    // opening delimiter, 0 until the first character has been seen
        dchar delimright = 0;   // closing delimiter (stays 0 for a heredoc)
        uint nest = 1;          // 1 if the delimiters are a nesting bracket pair
        uint nestcount = ~0; // dead assignment, needed to suppress warning
        Identifier hereid = null;   // heredoc identifier, null if this is not a heredoc
        uint blankrol = 0;      // 1 while the rest of the heredoc identifier's line must be blank
        uint startline = 0;     // 1 if the next character is the first one on its line
        p++;
        stringbuffer.setsize(0);
        while (1)
        {
            const s = p;        // start of the current character, used to rewind before calling scan()
            dchar c = *p++;
            //printf("c = '%c'\n", c);
            switch (c)
            {
            case '\n':
            Lnextline:
                endOfLine();
                startline = 1;
                if (blankrol)
                {
                    // end of the line holding the heredoc identifier; not part of the string
                    blankrol = 0;
                    continue;
                }
                if (hereid)
                {
                    // inside a heredoc every line ending is stored
                    stringbuffer.writeUTF8(c);
                    continue;
                }
                // not a heredoc: the line ending is processed below like any other character
                break;
            case '\r':
                if (*p == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                goto Lnextline;
            case 0:
            case 0x1A:
                error("unterminated delimited string constant starting at %s", start.toChars());
                result.setString();
                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
                p--;
                return;
            default:
                if (c & 0x80)
                {
                    // multi-byte character: decode it starting from its first byte
                    p--;
                    c = decodeUTF();
                    p++;
                    if (c == PS || c == LS)
                        goto Lnextline;
                }
                break;
            }
            if (delimleft == 0)
            {
                // first character after the opening quote: work out the delimiters
                delimleft = c;
                nest = 1;
                nestcount = 1;
                if (c == '(')
                    delimright = ')';
                else if (c == '{')
                    delimright = '}';
                else if (c == '[')
                    delimright = ']';
                else if (c == '<')
                    delimright = '>';
                else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
                {
                    // Start of identifier; must be a heredoc
                    Token tok;
                    p = s;      // rewind so that scan() sees the whole identifier
                    scan(&tok); // read in heredoc identifier
                    if (tok.value != TOK.identifier)
                    {
                        error("identifier expected for heredoc, not %s", tok.toChars());
                        // recover by using the first character as the closing delimiter
                        delimright = c;
                    }
                    else
                    {
                        hereid = tok.ident;
                        //printf("hereid = '%s'\n", hereid.toChars());
                        blankrol = 1;
                    }
                    nest = 0;
                }
                else
                {
                    // any other character closes itself and does not nest
                    delimright = c;
                    nest = 0;
                    if (isspace(c))
                        error("delimiter cannot be whitespace");
                }
            }
            else
            {
                if (blankrol)
                {
                    // something other than a line ending followed the heredoc identifier;
                    // report it once and drop this character
                    error("heredoc rest of line should be blank");
                    blankrol = 0;
                    continue;
                }
                if (nest == 1)
                {
                    // bracket delimiters: finished when the opening bracket is balanced
                    if (c == delimleft)
                        nestcount++;
                    else if (c == delimright)
                    {
                        nestcount--;
                        if (nestcount == 0)
                            goto Ldone;
                    }
                }
                else if (c == delimright)
                    goto Ldone;
                if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
                {
                    // an identifier at the start of a line may be the closing heredoc identifier
                    Token tok;
                    auto psave = p;
                    p = s;
                    scan(&tok); // read in possible heredoc identifier
                    //printf("endid = '%s'\n", tok.ident.toChars());
                    if (tok.value == TOK.identifier && tok.ident is hereid)
                    {
                        /* should check that rest of line is blank
                         */
                        goto Ldone;
                    }
                    // not the terminator: resume right after the current character
                    p = psave;
                }
                stringbuffer.writeUTF8(c);
                startline = 0;
            }
        }
    Ldone:
        // the closing delimiter must be followed directly by the closing double-quote
        if (*p == '"')
            p++;
        else if (hereid)
            error("delimited string must end in `%s\"`", hereid.toChars());
        else if (isspace(delimright))
            error("delimited string must end in `\"`");
        else
            error(token.loc, "delimited string must end in `%c\"`", delimright);
        result.setString(stringbuffer);
        stringPostfix(result);
    }
1633 
1634     /**
1635     Lex a token string. Some examples of token strings are:
1636     ---
1637     q{ foo(xxx) }    // " foo(xxx) "
1638     q{foo$(LPAREN)}  // "foo$(LPAREN)"
1639     q{{foo}"}"}      // "{foo}"}""
1640     ---
1641     It is assumed that `p` points to the opening curly-brace.
1642     Params:
1643         result = pointer to the token that accepts the result
1644     */
1645     private void tokenStringConstant(Token* result)
1646     {
1647         result.value = TOK.string_;
1648 
1649         uint nest = 1;
1650         const start = loc();
1651         const pstart = ++p;
1652         inTokenStringConstant++;
1653         scope(exit) inTokenStringConstant--;
1654         while (1)
1655         {
1656             Token tok;
1657             scan(&tok);
1658             switch (tok.value)
1659             {
1660             case TOK.leftCurly:
1661                 nest++;
1662                 continue;
1663             case TOK.rightCurly:
1664                 if (--nest == 0)
1665                 {
1666                     result.setString(pstart, p - 1 - pstart);
1667                     stringPostfix(result);
1668                     return;
1669                 }
1670                 continue;
1671             case TOK.endOfFile:
1672                 error("unterminated token string constant starting at %s", start.toChars());
1673                 result.setString();
1674                 return;
1675             default:
1676                 continue;
1677             }
1678         }
1679     }
1680 
1681     /**
1682     Scan a quoted string while building the processed string value by
1683     handling escape sequences. The result is returned in the given `t` token.
1684     This function assumes that `p` currently points to the opening quote
1685     of the string.
1686     Params:
1687         t = the token to set the resulting string to
1688     * References:
1689     *   D https://dlang.org/spec/lex.html#double_quoted_strings
1690     *   ImportC C11 6.4.5
1691     */
1692     private void escapeStringConstant(Token* t)
1693     {
1694         t.value = TOK.string_;
1695 
1696         const start = loc();
1697         const tc = *p++;        // opening quote
1698         stringbuffer.setsize(0);
1699         while (1)
1700         {
1701             dchar c = *p++;
1702             dchar c2;
1703             switch (c)
1704             {
1705             case '\\':
1706                 switch (*p)
1707                 {
1708                 case '&':
1709                     if (Ccompile)
1710                         goto default;
1711 
1712                     c = escapeSequence(c2);
1713                     stringbuffer.writeUTF8(c);
1714                     if (c2 != dchar.init)
1715                         stringbuffer.writeUTF8(c2);
1716                     continue;
1717                 case 'u':
1718                 case 'U':
1719                     c = escapeSequence(c2);
1720                     stringbuffer.writeUTF8(c);
1721                     continue;
1722                 default:
1723                     c = escapeSequence(c2);
1724                     break;
1725                 }
1726                 break;
1727             case '\n':
1728                 endOfLine();
1729                 if (Ccompile)
1730                     goto Lunterminated;
1731                 break;
1732             case '\r':
1733                 if (*p == '\n')
1734                     continue; // ignore
1735                 c = '\n'; // treat EndOfLine as \n character
1736                 endOfLine();
1737                 if (Ccompile)
1738                     goto Lunterminated;
1739                 break;
1740             case '\'':
1741             case '"':
1742                 if (c != tc)
1743                     goto default;
1744                 t.setString(stringbuffer);
1745                 if (!Ccompile)
1746                     stringPostfix(t);
1747                 return;
1748             case 0:
1749             case 0x1A:
1750                 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1751                 p--;
1752             Lunterminated:
1753                 error("unterminated string constant starting at %s", start.toChars());
1754                 t.setString();
1755                 return;
1756             default:
1757                 if (c & 0x80)
1758                 {
1759                     p--;
1760                     c = decodeUTF();
1761                     if (c == LS || c == PS)
1762                     {
1763                         c = '\n';
1764                         endOfLine();
1765                         if (Ccompile)
1766                             goto Lunterminated;
1767                     }
1768                     p++;
1769                     stringbuffer.writeUTF8(c);
1770                     continue;
1771                 }
1772                 break;
1773             }
1774             stringbuffer.writeByte(c);
1775         }
1776     }
1777 
1778     /**************************************
1779      * Reference:
1780      *    https://dlang.org/spec/lex.html#characterliteral
1781      */
1782     private TOK charConstant(Token* t)
1783     {
1784         TOK tk = TOK.charLiteral;
1785         //printf("Lexer::charConstant\n");
1786         p++;
1787         dchar c = *p++;
1788         dchar c2;
1789         switch (c)
1790         {
1791         case '\\':
1792             switch (*p)
1793             {
1794             case 'u':
1795                 tk = TOK.wcharLiteral;
1796                 goto default;
1797             case 'U':
1798             case '&':
1799                 tk = TOK.dcharLiteral;
1800                 goto default;
1801             default:
1802                 t.unsvalue = escapeSequence(c2);
1803                 if (c2 != c2.init)
1804                 {
1805                     error("html entity requires 2 code units, use a string instead of a character");
1806                     t.unsvalue = '?';
1807                 }
1808                 break;
1809             }
1810             break;
1811         case '\n':
1812         L1:
1813             endOfLine();
1814             goto case;
1815         case '\r':
1816             goto case '\'';
1817         case 0:
1818         case 0x1A:
1819             // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1820             p--;
1821             goto case;
1822         case '\'':
1823             error("unterminated character constant");
1824             t.unsvalue = '?';
1825             return tk;
1826         default:
1827             if (c & 0x80)
1828             {
1829                 p--;
1830                 c = decodeUTF();
1831                 p++;
1832                 if (c == LS || c == PS)
1833                     goto L1;
1834                 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1835                     tk = TOK.wcharLiteral;
1836                 else
1837                     tk = TOK.dcharLiteral;
1838             }
1839             t.unsvalue = c;
1840             break;
1841         }
1842         if (*p != '\'')
1843         {
1844             while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
1845                     *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
1846             {
1847                 if (*p & 0x80)
1848                 {
1849                     const s = p;
1850                     c = decodeUTF();
1851                     if (c == LS || c == PS)
1852                     {
1853                         p = s;
1854                         break;
1855                     }
1856                 }
1857                 p++;
1858             }
1859 
1860             if (*p == '\'')
1861             {
1862                 error("character constant has multiple characters");
1863                 p++;
1864             }
1865             else
1866                 error("unterminated character constant");
1867             t.unsvalue = '?';
1868             return tk;
1869         }
1870         p++;
1871         return tk;
1872     }
1873 
1874     /***************************************
1875      * Lex C character constant.
1876      * Parser is on the opening quote.
1877      * Params:
1878      *  t = token to fill in
1879      *  prefix = one of `u`, `U` or 0.
1880      * Reference:
1881      *  C11 6.4.4.4
1882      */
1883     private void clexerCharConstant(ref Token t, char prefix)
1884     {
1885         escapeStringConstant(&t);
1886         const(char)[] str = t.ustring[0 .. t.len];
1887         const n = str.length;
1888         const loc = t.loc;
1889         if (n == 0)
1890         {
1891             error(loc, "empty character constant");
1892             t.value = TOK.semicolon;
1893             return;
1894         }
1895 
1896         uint u;
1897         switch (prefix)
1898         {
1899             case 0:
1900                 if (n == 1) // fast case
1901                 {
1902                     u = str[0];
1903                 }
1904                 else if (n > 4)
1905                     error(loc, "max number of chars in character literal is 4, had %d",
1906                         cast(int)n);
1907                 else
1908                 {
1909                     foreach (i, c; str)
1910                         (cast(char*)&u)[n - 1 - i] = c;
1911                 }
1912                 break;
1913 
1914             case 'u':
1915                 dchar d1;
1916                 size_t idx;
1917                 auto msg = utf_decodeChar(str, idx, d1);
1918                 dchar d2 = 0;
1919                 if (idx < n && !msg)
1920                     msg = utf_decodeChar(str, idx, d2);
1921                 if (msg)
1922                     error(loc, "%.*s", cast(int)msg.length, msg.ptr);
1923                 else if (idx < n)
1924                     error(loc, "max number of chars in 16 bit character literal is 2, had %d",
1925                         cast(int)((n + 1) >> 1));
1926                 else if (d1 > 0x1_0000)
1927                     error(loc, "%d does not fit in 16 bits", d1);
1928                 else if (d2 > 0x1_0000)
1929                     error(loc, "%d does not fit in 16 bits", d2);
1930                 u = d1;
1931                 if (d2)
1932                     u = (d1 << 16) | d2;
1933                 break;
1934 
1935             case 'U':
1936                 dchar d;
1937                 size_t idx;
1938                 auto msg = utf_decodeChar(str, idx, d);
1939                 if (msg)
1940                     error(loc, "%.*s", cast(int)msg.length, msg.ptr);
1941                 else if (idx < n)
1942                     error(loc, "max number of chars in 32 bit character literal is 1, had %d",
1943                         cast(int)((n + 3) >> 2));
1944                 u = d;
1945                 break;
1946 
1947             default:
1948                 assert(0);
1949         }
1950         t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal;
1951         t.unsvalue = u;
1952     }
1953 
1954     /***************************************
1955      * Get postfix of string literal.
1956      */
1957     private void stringPostfix(Token* t) pure @nogc
1958     {
1959         switch (*p)
1960         {
1961         case 'c':
1962         case 'w':
1963         case 'd':
1964             t.postfix = *p;
1965             p++;
1966             break;
1967         default:
1968             t.postfix = 0;
1969             break;
1970         }
1971     }
1972 
    /**************************************
     * Lex a numeric literal starting at `p`.
     *
     * Integers can be decimal, octal (leading `0`), hexadecimal (`0x`/`0X`)
     * or binary (`0b`/`0B`); their value is accumulated and stored in
     * `t.unsvalue`. As soon as the text turns out to be a floating point
     * literal, `p` is rewound to the start and the work is handed to `inreal`.
     * When `Ccompile` is set, the integer suffix is lexed by `cnumber`;
     * otherwise the suffixes `u`, `U`, `l` and `L` are handled here.
     * Params:
     *  t = token to fill in
     * Returns:
     *  the kind of literal lexed: an integer literal `TOK`, or whatever
     *  `inreal` / `cnumber` returned
     */
    private TOK number(Token* t)
    {
        int base = 10;
        const start = p;
        ulong n = 0; // unsigned >=64 bit integer type
        int d;
        bool err = false;
        bool overflow = false;
        bool anyBinaryDigitsNoSingleUS = false;
        bool anyHexDigitsNoSingleUS = false;
        char errorDigit = 0; // first digit seen that is not valid for `base`, 0 if none
        dchar c = *p;
        // A leading '0' selects the base, or introduces a floating point literal
        if (c == '0')
        {
            ++p;
            c = *p;
            switch (c)
            {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
                base = 8;
                break;

            case '8':
            case '9':
                // not an octal digit: remember it for the error reported at Ldone
                errorDigit = cast(char) c;
                base = 8;
                break;
            case 'x':
            case 'X':
                ++p;
                base = 16;
                break;
            case 'b':
            case 'B':
                ++p;
                base = 2;
                break;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
                {
                    if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                        goto Lreal;  // if `0.f` or `0.L`
                    goto Ldone; // if ".identifier" or ".unicode"
                }
                goto Lreal; // '.' is part of current token
            case 'i':
            case 'f':
            case 'F':
                goto Lreal;
            case '_':
                if (Ccompile)
                    error("embedded `_` not allowed");
                ++p;
                base = 8;
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                break;
            default:
                break;
            }
        }
        // Accumulate digits into `n` until something that is not part of an integer shows up
        while (1)
        {
            c = *p;
            switch (c)
            {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                ++p;
                d = c - '0';
                break;
            case 'a':
            case 'b':
            case 'c':
            case 'd':
            case 'e':
            case 'f':
            case 'A':
            case 'B':
            case 'C':
            case 'D':
            case 'E':
            case 'F':
                ++p;
                if (base != 16)
                {
                    // outside of hex, an exponent marker or `f` suffix means floating point
                    if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
                        goto Lreal;
                }
                if (c >= 'a')
                    d = c + 10 - 'a';
                else
                    d = c + 10 - 'A';
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                goto Ldone;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (base <= 10 && n > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
                {
                    if (Ccompile && base == 10 &&
                        (p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                        goto Lreal;  // if `1.e6` or `1.f` or `1.L`
                    goto Ldone; // if ".identifier" or ".unicode"
                }
                if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
                    goto Ldone; // if ".identifier" or ".unicode"
                if (base == 2)
                    goto Ldone; // a '.' never continues a binary literal
                goto Lreal; // otherwise as part of a floating point literal
            case 'p':
            case 'P':
            case 'i':
            Lreal:
                // floating point literal: rewind and let inreal() lex it from the start
                p = start;
                return inreal(t);
            case '_':
                if (Ccompile)
                    goto default;
                ++p;
                continue;
            default:
                goto Ldone;
            }
            // got a digit here, set any necessary flags, check for errors
            anyHexDigitsNoSingleUS = true;
            anyBinaryDigitsNoSingleUS = true;
            if (!errorDigit && d >= base)
            {
                errorDigit = cast(char) c;
            }
            // Avoid expensive overflow check if we aren't at risk of overflow
            if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
                n = n * base + d;
            else
            {
                import core.checkedint : mulu, addu;

                n = mulu(n, base, overflow);
                n = addu(n, d, overflow);
            }
        }
    Ldone:
        // Report problems collected while scanning the digits
        if (errorDigit)
        {
            error(token.loc, "%s digit expected, not `%c`", base == 2 ? "binary".ptr :
                                                 base == 8 ? "octal".ptr :
                                                 "decimal".ptr, errorDigit);
            err = true;
        }
        if (overflow && !err)
        {
            error("integer overflow");
            err = true;
        }
        // A `0b` or `0x` prefix that was not followed by any digit
        if ((base == 2 && !anyBinaryDigitsNoSingleUS) ||
            (base == 16 && !anyHexDigitsNoSingleUS))
            error(token.loc, "`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);

        t.unsvalue = n;

        // C sources have their own suffix rules
        if (Ccompile)
            return cnumber(base, n);

        enum FLAGS : int
        {
            none = 0,
            decimal = 1, // decimal
            unsigned = 2, // u or U suffix
            long_ = 4, // L suffix
        }

        FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
        // Parse trailing 'u', 'U', 'l' or 'L' in any combination
        const psuffix = p;
        while (1)
        {
            FLAGS f;
            switch (*p)
            {
            case 'U':
            case 'u':
                f = FLAGS.unsigned;
                goto L1;
            case 'l':
                // diagnosed, but still treated as a long suffix
                f = FLAGS.long_;
                error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
                goto L1;
            case 'L':
                f = FLAGS.long_;
            L1:
                p++;
                // the same kind of suffix given twice
                if ((flags & f) && !err)
                {
                    error("unrecognized token");
                    err = true;
                }
                flags = cast(FLAGS)(flags | f);
                continue;
            default:
                break;
            }
            break;
        }
        // Octal literals are only accepted for values below 8
        if (base == 8 && n >= 8)
        {
            if (err)
                // can't translate invalid octal value, just show a generic message
                error("octal literals larger than 7 are no longer supported");
            else
                error(token.loc, "octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
                    n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
        }
        // Pick the literal's type from its base, suffixes and magnitude
        TOK result;
        switch (flags)
        {
        case FLAGS.none:
            /* Octal or Hexadecimal constant.
             * First that fits: int, uint, long, ulong
             */
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else if (n & 0xFFFFFFFF00000000L)
                result = TOK.int64Literal;
            else if (n & 0x80000000)
                result = TOK.uns32Literal;
            else
                result = TOK.int32Literal;
            break;
        case FLAGS.decimal:
            /* First that fits: int, long, long long
             */
            if (n & 0x8000000000000000L)
            {
                result = TOK.uns64Literal;
            }
            else if (n & 0xFFFFFFFF80000000L)
                result = TOK.int64Literal;
            else
                result = TOK.int32Literal;
            break;
        case FLAGS.unsigned:
        case FLAGS.decimal | FLAGS.unsigned:
            /* First that fits: uint, ulong
             */
            if (n & 0xFFFFFFFF00000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.uns32Literal;
            break;
        case FLAGS.decimal | FLAGS.long_:
            // a decimal `L` literal with the top bit set is reported as an overflow
            if (n & 0x8000000000000000L)
            {
                if (!err)
                {
                    error("signed integer overflow");
                    err = true;
                }
                result = TOK.uns64Literal;
            }
            else
                result = TOK.int64Literal;
            break;
        case FLAGS.long_:
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.int64Literal;
            break;
        case FLAGS.unsigned | FLAGS.long_:
        case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
            result = TOK.uns64Literal;
            break;
        default:
            debug
            {
                printf("%x\n", flags);
            }
            assert(0);
        }
        return result;
    }
2287 
2288     /**************************************
2289      * Lex C integer-suffix
2290      * Params:
2291      *  base = number base
2292      *  n = raw integer value
2293      * Returns:
2294      *  token value
2295      */
2296     private TOK cnumber(int base, ulong n)
2297     {
2298         /* C11 6.4.4.1
2299          * Parse trailing suffixes:
2300          *   u or U
2301          *   l or L
2302          *   ll or LL
2303          */
2304         enum FLAGS : uint
2305         {
2306             octalhex = 1, // octal or hexadecimal
2307             decimal  = 2, // decimal
2308             unsigned = 4, // u or U suffix
2309             long_    = 8, // l or L suffix
2310             llong    = 0x10 // ll or LL
2311         }
2312         FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex;
2313         bool err;
2314     Lsuffixes:
2315         while (1)
2316         {
2317             FLAGS f;
2318             const cs = *p;
2319             switch (cs)
2320             {
2321                 case 'U':
2322                 case 'u':
2323                     f = FLAGS.unsigned;
2324                     break;
2325 
2326                 case 'l':
2327                 case 'L':
2328                     f = FLAGS.long_;
2329                     if (cs == p[1])
2330                     {
2331                         f = FLAGS.long_ | FLAGS.llong;
2332                         ++p;
2333                     }
2334                     break;
2335 
2336                 default:
2337                     break Lsuffixes;
2338             }
2339             ++p;
2340             if ((flags & f) && !err)
2341             {
2342                 error("duplicate integer suffixes");
2343                 err = true;
2344             }
2345             flags = cast(FLAGS)(flags | f);
2346         }
2347 
2348         TOK result = TOK.int32Literal;     // default
2349         switch (flags)
2350         {
2351             /* Since D doesn't have a variable sized `long` or `unsigned long` type,
2352              * this code deviates from C by picking D int, uint, long, or ulong instead
2353              */
2354 
2355             case FLAGS.octalhex:
2356                 /* Octal or Hexadecimal constant.
2357                  * First that fits: int, unsigned, long, unsigned long,
2358                  * long long, unsigned long long
2359                  */
2360                 if (n & 0x8000000000000000L)
2361                     result = TOK.uns64Literal;      // unsigned long
2362                 else if (n & 0xFFFFFFFF00000000L)
2363                     result = TOK.int64Literal;      // long
2364                 else if (n & 0x80000000)
2365                     result = TOK.uns32Literal;
2366                 else
2367                     result = TOK.int32Literal;
2368                 break;
2369 
2370             case FLAGS.decimal:
2371                 /* First that fits: int, long, long long
2372                  */
2373                 if (n & 0x8000000000000000L)
2374                     result = TOK.uns64Literal;      // unsigned long
2375                 else if (n & 0xFFFFFFFF80000000L)
2376                     result = TOK.int64Literal;      // long
2377                 else
2378                     result = TOK.int32Literal;
2379                 break;
2380 
2381             case FLAGS.octalhex | FLAGS.unsigned:
2382             case FLAGS.decimal | FLAGS.unsigned:
2383                 /* First that fits: unsigned, unsigned long, unsigned long long
2384                  */
2385                 if (n & 0xFFFFFFFF00000000L)
2386                     result = TOK.uns64Literal;      // unsigned long
2387                 else
2388                     result = TOK.uns32Literal;
2389                 break;
2390 
2391             case FLAGS.decimal | FLAGS.long_:
2392                 /* First that fits: long, long long
2393                  */
2394                 if (longsize == 4 || long_longsize == 4)
2395                 {
2396                     if (n & 0xFFFFFFFF_80000000L)
2397                         result = TOK.int64Literal;
2398                     else
2399                         result = TOK.int32Literal;  // long
2400                 }
2401                 else
2402                 {
2403                     result = TOK.int64Literal;      // long
2404                 }
2405                 break;
2406 
2407             case FLAGS.octalhex | FLAGS.long_:
2408                 /* First that fits: long, unsigned long, long long,
2409                  * unsigned long long
2410                  */
2411                 if (longsize == 4 || long_longsize == 4)
2412                 {
2413                     if (n & 0x8000000000000000L)
2414                         result = TOK.uns64Literal;
2415                     else if (n & 0xFFFFFFFF00000000L)
2416                         result = TOK.int64Literal;
2417                     else if (n & 0x80000000)
2418                         result = TOK.uns32Literal;      // unsigned long
2419                     else
2420                         result = TOK.int32Literal;      // long
2421                 }
2422                 else
2423                 {
2424                     if (n & 0x80000000_00000000L)
2425                         result = TOK.uns64Literal;      // unsigned long
2426                     else
2427                         result = TOK.int64Literal;      // long
2428                 }
2429                 break;
2430 
2431             case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_:
2432             case FLAGS.decimal  | FLAGS.unsigned | FLAGS.long_:
2433                 /* First that fits: unsigned long, unsigned long long
2434                  */
2435                 if (longsize == 4 || long_longsize == 4)
2436                 {
2437                     if (n & 0xFFFFFFFF00000000L)
2438                         result = TOK.uns64Literal;
2439                     else
2440                         result = TOK.uns32Literal;      // unsigned long
2441                 }
2442                 else
2443                 {
2444                     result = TOK.uns64Literal;  // unsigned long
2445                 }
2446                 break;
2447 
2448             case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong:
2449                 /* First that fits: long long, unsigned long long
2450                  */
2451                 if (n & 0x8000000000000000L)
2452                     result = TOK.uns64Literal;
2453                 else
2454                     result = TOK.int64Literal;
2455                 break;
2456 
2457             case FLAGS.decimal | FLAGS.long_ | FLAGS.llong:
2458                 /* long long
2459                  */
2460                 result = TOK.int64Literal;
2461                 break;
2462 
2463             case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
2464             case FLAGS.decimal  | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
2465                 result = TOK.uns64Literal;
2466                 break;
2467 
2468             default:
2469                 debug printf("%x\n",flags);
2470                 assert(0);
2471         }
2472         return result;
2473     }
2474 
2475     /**************************************
2476      * Read in characters, converting them to real.
2477      * Bugs:
2478      *      Exponent overflow not detected.
2479      *      Too much requested precision is not detected.
2480      */
2481     private TOK inreal(Token* t)
2482     {
2483         //printf("Lexer::inreal()\n");
2484         debug
2485         {
2486             assert(*p == '.' || isdigit(*p));
2487         }
2488         bool isWellformedString = true;
2489         stringbuffer.setsize(0);
2490         auto pstart = p;
2491         bool hex = false;
2492         dchar c = *p++;
2493         // Leading '0x'
2494         if (c == '0')
2495         {
2496             c = *p++;
2497             if (c == 'x' || c == 'X')
2498             {
2499                 hex = true;
2500                 c = *p++;
2501             }
2502         }
2503         // Digits to left of '.'
2504         while (1)
2505         {
2506             if (c == '.')
2507             {
2508                 c = *p++;
2509                 break;
2510             }
2511             if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2512             {
2513                 c = *p++;
2514                 continue;
2515             }
2516             break;
2517         }
2518         // Digits to right of '.'
2519         while (1)
2520         {
2521             if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2522             {
2523                 c = *p++;
2524                 continue;
2525             }
2526             break;
2527         }
2528         if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
2529         {
2530             c = *p++;
2531             if (c == '-' || c == '+')
2532             {
2533                 c = *p++;
2534             }
2535             bool anyexp = false;
2536             while (1)
2537             {
2538                 if (isdigit(c))
2539                 {
2540                     anyexp = true;
2541                     c = *p++;
2542                     continue;
2543                 }
2544                 if (c == '_')
2545                 {
2546                     if (Ccompile)
2547                         error("embedded `_` in numeric literals not allowed");
2548                     c = *p++;
2549                     continue;
2550                 }
2551                 if (!anyexp)
2552                 {
2553                     error("missing exponent");
2554                     isWellformedString = false;
2555                 }
2556                 break;
2557             }
2558         }
2559         else if (hex)
2560         {
2561             error("exponent required for hex float");
2562             isWellformedString = false;
2563         }
2564         --p;
2565         while (pstart < p)
2566         {
2567             if (*pstart != '_')
2568                 stringbuffer.writeByte(*pstart);
2569             ++pstart;
2570         }
2571         stringbuffer.writeByte(0);
2572         auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
2573         TOK result;
2574         bool isOutOfRange = false;
2575         t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, isOutOfRange) : CTFloat.zero);
2576 
2577         bool imaginary = false;
2578         if (*p == 'i' && Ccompile)
2579         {
2580             ++p;
2581             imaginary = true;
2582         }
2583 
2584         switch (*p)
2585         {
2586         case 'F':
2587         case 'f':
2588             if (isWellformedString && !isOutOfRange)
2589                 isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
2590             result = TOK.float32Literal;
2591             p++;
2592             break;
2593         default:
2594             if (isWellformedString && !isOutOfRange)
2595                 isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
2596             result = TOK.float64Literal;
2597             break;
2598         case 'l':
2599             if (!Ccompile)
2600                 error("use 'L' suffix instead of 'l'");
2601             goto case 'L';
2602         case 'L':
2603             ++p;
2604             if (Ccompile && long_doublesize == 8)
2605                 goto default;
2606             result = TOK.float80Literal;
2607             break;
2608         }
2609 
2610         if ((*p == 'i' || *p == 'I') && !Ccompile)
2611         {
2612             if (*p == 'I')
2613                 error("use 'i' suffix instead of 'I'");
2614             p++;
2615             imaginary = true;
2616         }
2617 
2618         if (imaginary)
2619         {
2620             switch (result)
2621             {
2622             case TOK.float32Literal:
2623                 result = TOK.imaginary32Literal;
2624                 break;
2625             case TOK.float64Literal:
2626                 result = TOK.imaginary64Literal;
2627                 break;
2628             case TOK.float80Literal:
2629                 result = TOK.imaginary80Literal;
2630                 break;
2631             default:
2632                 break;
2633             }
2634         }
2635         const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
2636         if (isOutOfRange && !isLong && (!Ccompile || hex))
2637         {
2638             /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex
2639              */
2640             const char* suffix = result == TOK.float32Literal ? "f" : result == TOK.float80Literal ? "L" : "";
2641             const char* type = [TOK.float32Literal: "`float`".ptr,
2642                                 TOK.float64Literal: "`double`".ptr,
2643                                 TOK.float80Literal: "`real` for the current target".ptr][result];
2644             error(scanloc, "number `%s%s` is not representable as a %s", sbufptr, suffix, type);
2645             const char* extra = result == TOK.float64Literal ? "`real` literals can be written using the `L` suffix. " : "";
2646             eSink.errorSupplemental(scanloc, "%shttps://dlang.org/spec/lex.html#floatliteral", extra);
2647         }
2648         debug
2649         {
2650             switch (result)
2651             {
2652             case TOK.float32Literal:
2653             case TOK.float64Literal:
2654             case TOK.float80Literal:
2655             case TOK.imaginary32Literal:
2656             case TOK.imaginary64Literal:
2657             case TOK.imaginary80Literal:
2658                 break;
2659             default:
2660                 assert(0);
2661             }
2662         }
2663         return result;
2664     }
2665 
2666     final Loc loc() pure @nogc
2667     {
2668         scanloc.charnum = cast(uint)(1 + p - line);
2669         version (LocOffset)
2670             scanloc.fileOffset = cast(uint)(p - base);
2671         return scanloc;
2672     }
2673 
    /**
     * Report an error at the location of the current token (`token.loc`).
     * Params:
     *  format = format string, forwarded unchanged to `eSink.error`
     *  args = arguments for `format`, forwarded unchanged
     */
    void error(T...)(const(char)* format, T args)
    {
        eSink.error(token.loc, format, args);
    }
2678 
    /**
     * Report an error at an explicitly given location.
     * Params:
     *  loc = location passed to `eSink.error`
     *  format = format string, forwarded unchanged
     *  args = arguments for `format`, forwarded unchanged
     */
    void error(T...)(const ref Loc loc, const(char)* format, T args)
    {
        eSink.error(loc, format, args);
    }
2683 
    /**
     * Report a deprecation at an explicitly given location.
     * Params:
     *  loc = location passed to `eSink.deprecation`
     *  format = format string, forwarded unchanged
     *  args = arguments for `format`, forwarded unchanged
     */
    void deprecation(T...)(const ref Loc loc, const(char)* format, T args)
    {
        eSink.deprecation(loc, format, args);
    }
2688 
    /**
     * Report a deprecation at the location of the current token (`token.loc`).
     * Params:
     *  format = format string, forwarded unchanged to `eSink.deprecation`
     *  args = arguments for `format`, forwarded unchanged
     */
    void deprecation(T...)(const(char)* format, T args)
    {
        eSink.deprecation(token.loc, format, args);
    }
2693 
    /**
     * Emit supplemental text for a deprecation, located at the current
     * token (`token.loc`).
     * Params:
     *  format = format string, forwarded unchanged to `eSink.deprecationSupplemental`
     *  args = arguments for `format`, forwarded unchanged
     */
    void deprecationSupplemental(T...)(const(char)* format, T args)
    {
        eSink.deprecationSupplemental(token.loc, format, args);
    }
2698 
2699     /***************************************
2700      * Parse special token sequence:
2701      * Returns:
2702      *  true if the special token sequence was handled
2703      * References:
2704      *  https://dlang.org/spec/lex.html#special-token-sequence
2705      */
2706     bool parseSpecialTokenSequence()
2707     {
2708         Token n;
2709         scan(&n);
2710         if (n.value == TOK.identifier)
2711         {
2712             if (n.ident == Id.line)
2713             {
2714                 poundLine(n, false);
2715                 return true;
2716             }
2717             else
2718             {
2719                 const locx = loc();
2720                 // @@@DEPRECATED_2.103@@@
2721                 // Turn into an error in 2.113
2722                 if (inTokenStringConstant)
2723                     deprecation(locx, "token string requires valid D tokens, not `#%s`", n.ident.toChars());
2724                 else
2725                     error(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars());
2726             }
2727         }
2728         else if (n.value == TOK.if_)
2729         {
2730             const locx = loc();
2731             if (inTokenStringConstant)
2732                 error(locx, "token string requires valid D tokens, not `#if`");
2733             else
2734                 error(locx, "C preprocessor directive `#if` is not supported, use `version` or `static if`");
2735         }
2736         return false;
2737     }
2738 
2739     /*********************************************
2740      * Parse line/file preprocessor directive:
2741      *    #line linnum [filespec]
2742      * Allow __LINE__ for linnum, and __FILE__ for filespec.
2743      * Accept linemarker format:
2744      *    # linnum [filespec] {flags}
2745      * There can be zero or more flags, which are one of the digits 1..4, and
2746      * must be in ascending order. The flags are ignored.
2747      * Params:
2748      *  tok = token we're on, which is linnum of linemarker
2749      *  linemarker = true if line marker format and lexer is on linnum
2750      * References:
2751      *  linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
2752      */
2753     final void poundLine(ref Token tok, bool linemarker)
2754     {
2755         auto linnum = this.scanloc.linnum;
2756         const(char)* filespec = null;
2757         bool flags;
2758 
2759         if (!linemarker)
2760             scan(&tok);
2761         if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal)
2762         {
2763             const lin = cast(int)(tok.unsvalue);
2764             if (lin != tok.unsvalue)
2765             {
2766                 error(tok.loc, "line number `%lld` out of range", cast(ulong)tok.unsvalue);
2767                 skipToNextLine();
2768                 return;
2769             }
2770             else
2771                 linnum = lin;
2772         }
2773         else if (tok.value == TOK.line)  // #line __LINE__
2774         {
2775         }
2776         else
2777         {
2778             error(tok.loc, "positive integer argument expected following `#line`");
2779             if (tok.value != TOK.endOfLine)
2780                 skipToNextLine();
2781             return;
2782         }
2783         while (1)
2784         {
2785             scan(&tok);
2786             switch (tok.value)
2787             {
2788             case TOK.endOfFile:
2789             case TOK.endOfLine:
2790                 if (!inTokenStringConstant)
2791                 {
2792                     this.scanloc.linnum = linnum;
2793                     if (filespec)
2794                         this.scanloc.filename = filespec;
2795                 }
2796                 return;
2797             case TOK.file:
2798                 if (filespec || flags)
2799                     goto Lerr;
2800                 filespec = mem.xstrdup(scanloc.filename);
2801                 continue;
2802             case TOK.string_:
2803                 if (filespec || flags)
2804                     goto Lerr;
2805                 if (tok.ptr[0] != '"' || tok.postfix != 0)
2806                     goto Lerr;
2807                 filespec = tok.ustring;
2808                 continue;
2809             case TOK.int32Literal:
2810                 if (!filespec)
2811                     goto Lerr;
2812                 if (linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4)
2813                 {
2814                     flags = true;   // linemarker flags seen
2815                     continue;
2816                 }
2817                 goto Lerr;
2818             default:
2819                 goto Lerr;
2820             }
2821         }
2822     Lerr:
2823         if (filespec is null)
2824             error(tok.loc, "invalid filename for `#line` directive");
2825         else if (linemarker)
2826             error(tok.loc, "invalid flag for line marker directive");
2827         else if (!Ccompile)
2828             error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars());
2829         if (tok.value != TOK.endOfLine)
2830             skipToNextLine();
2831     }
2832 
2833     /***************************************
2834      * Scan forward to start of next line.
2835      * Params:
2836      *    defines = send characters to `defines`
2837      */
2838     final void skipToNextLine(OutBuffer* defines = null)
2839     {
2840         while (1)
2841         {
2842             switch (*p)
2843             {
2844             case 0:
2845             case 0x1A:
2846                 return; // do not advance p
2847 
2848             case '\n':
2849                 ++p;
2850                 break;
2851 
2852             case '\r':
2853                 ++p;
2854                 if (p[0] == '\n')
2855                    ++p;
2856                 break;
2857 
2858             default:
2859                 if (defines)
2860                     defines.writeByte(*p); // don't care about Unicode line endings for C
2861                 else if (*p & 0x80)
2862                 {
2863                     const u = decodeUTF();
2864                     if (u == PS || u == LS)
2865                     {
2866                         ++p;
2867                         break;
2868                     }
2869                 }
2870                 ++p;
2871                 continue;
2872             }
2873             break;
2874         }
2875         endOfLine();
2876         tokenizeNewlines = false;
2877     }
2878 
2879     /********************************************
2880      * Decode UTF character.
2881      * Issue error messages for invalid sequences.
2882      * Return decoded character, advance p to last character in UTF sequence.
2883      */
2884     private uint decodeUTF()
2885     {
2886         string msg;
2887         auto result = decodeUTFpure(msg);
2888 
2889         if (msg)
2890             error(token.loc, "%.*s", cast(int)msg.length, msg.ptr);
2891         return result;
2892     }
2893 
2894     /********************************************
2895      * Same as above, but the potential error message is stored to the
2896      * msg parameter instead of being issued.
2897      */
2898     private pure uint decodeUTFpure(out string msg)
2899     {
2900         const s = p;
2901         assert(*s & 0x80);
2902         // Check length of remaining string up to 4 UTF-8 characters
2903         size_t len;
2904         for (len = 1; len < 4 && s[len]; len++)
2905         {
2906         }
2907         size_t idx = 0;
2908         dchar u;
2909         msg = utf_decodeChar(s[0 .. len], idx, u);
2910         p += idx - 1;
2911         if (!msg && isBidiControl(u))
2912             msg = "Bidirectional control characters are disallowed for security reasons.";
2913         return u;
2914     }
2915 
2916     /***************************************************
2917      * Parse doc comment embedded between t.ptr and p.
2918      * Remove trailing blanks and tabs from lines.
2919      * Replace all newlines with \n.
2920      * Remove leading comment character from each line.
2921      * Decide if it's a lineComment or a blockComment.
2922      * Append to previous one for this token.
2923      *
2924      * If newParagraph is true, an extra newline will be
2925      * added between adjoining doc comments.
2926      */
2927     private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
2928     {
2929         /* ct tells us which kind of comment it is: '/', '*', or '+'
2930          */
2931         const ct = t.ptr[2];
2932         /* Start of comment text skips over / * *, / + +, or / / /
2933          */
2934         const(char)* q = t.ptr + 3; // start of comment text
2935         const(char)* qend = p;
2936         if (ct == '*' || ct == '+')
2937             qend -= 2;
2938         /* Scan over initial row of ****'s or ++++'s or ////'s
2939          */
2940         for (; q < qend; q++)
2941         {
2942             if (*q != ct)
2943                 break;
2944         }
2945         /* Remove leading spaces until start of the comment
2946          */
2947         int linestart = 0;
2948         if (ct == '/')
2949         {
2950             while (q < qend && (*q == ' ' || *q == '\t'))
2951                 ++q;
2952         }
2953         else if (q < qend)
2954         {
2955             if (*q == '\r')
2956             {
2957                 ++q;
2958                 if (q < qend && *q == '\n')
2959                     ++q;
2960                 linestart = 1;
2961             }
2962             else if (*q == '\n')
2963             {
2964                 ++q;
2965                 linestart = 1;
2966             }
2967         }
2968         /* Remove trailing row of ****'s or ++++'s
2969          */
2970         if (ct != '/')
2971         {
2972             for (; q < qend; qend--)
2973             {
2974                 if (qend[-1] != ct)
2975                     break;
2976             }
2977         }
2978         /* Comment is now [q .. qend].
2979          * Canonicalize it into buf[].
2980          */
2981         OutBuffer buf;
2982 
2983         void trimTrailingWhitespace()
2984         {
2985             const s = buf[];
2986             auto len = s.length;
2987             while (len && (s[len - 1] == ' ' || s[len - 1] == '\t'))
2988                 --len;
2989             buf.setsize(len);
2990         }
2991 
2992         for (; q < qend; q++)
2993         {
2994             char c = *q;
2995             switch (c)
2996             {
2997             case '*':
2998             case '+':
2999                 if (linestart && c == ct)
3000                 {
3001                     linestart = 0;
3002                     /* Trim preceding whitespace up to preceding \n
3003                      */
3004                     trimTrailingWhitespace();
3005                     continue;
3006                 }
3007                 break;
3008             case ' ':
3009             case '\t':
3010                 break;
3011             case '\r':
3012                 if (q[1] == '\n')
3013                     continue; // skip the \r
3014                 goto Lnewline;
3015             default:
3016                 if (c == 226)
3017                 {
3018                     // If LS or PS
3019                     if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
3020                     {
3021                         q += 2;
3022                         goto Lnewline;
3023                     }
3024                 }
3025                 linestart = 0;
3026                 break;
3027             Lnewline:
3028                 c = '\n'; // replace all newlines with \n
3029                 goto case;
3030             case '\n':
3031                 linestart = 1;
3032                 /* Trim trailing whitespace
3033                  */
3034                 trimTrailingWhitespace();
3035                 break;
3036             }
3037             buf.writeByte(c);
3038         }
3039         /* Trim trailing whitespace (if the last line does not have newline)
3040          */
3041         trimTrailingWhitespace();
3042 
3043         // Always end with a newline
3044         const s = buf[];
3045         if (s.length == 0 || s[$ - 1] != '\n')
3046             buf.writeByte('\n');
3047 
3048         // It's a line comment if the start of the doc comment comes
3049         // after other non-whitespace on the same line.
3050         auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
3051         // Combine with previous doc comment, if any
3052         if (*dc)
3053         {
3054             auto p = combineComments(*dc, buf[], newParagraph);
3055             *dc = p ? p[0 .. strlen(p)] : null;
3056         }
3057         else
3058             *dc = buf.extractSlice(true);
3059     }
3060 
3061     /********************************************
3062      * Combine two document comments into one,
3063      * separated by an extra newline if newParagraph is true.
3064      */
3065     static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
3066     {
3067         //debug printf("Lexer::combineComments('%*.s', '%*.s', '%i')\n", cast(int) c1.length, c1.ptr, cast(int) c2.length, c2.ptr, newParagraph);
3068         const(int) newParagraphSize = newParagraph ? 1 : 0; // Size of the combining '\n'
3069         if (!c1)
3070             return c2.ptr;
3071         if (!c2)
3072             return c1.ptr;
3073 
3074         int insertNewLine = 0;
3075         if (c1.length && c1[$ - 1] != '\n')
3076             insertNewLine = 1;
3077         const retSize = c1.length + insertNewLine + newParagraphSize + c2.length;
3078         auto p = cast(char*)mem.xmalloc_noscan(retSize + 1);
3079         p[0 .. c1.length] = c1[];
3080         if (insertNewLine)
3081             p[c1.length] = '\n';
3082         if (newParagraph)
3083             p[c1.length + insertNewLine] = '\n';
3084         p[retSize - c2.length .. retSize] = c2[];
3085         p[retSize] = 0;
3086         return p;
3087     }
3088 
3089     /**************************
3090      * `p` should be at start of next line
3091      */
3092     private void endOfLine() pure @nogc @safe
3093     {
3094         scanloc.linnum++;
3095         line = p;
3096     }
3097 }
3098 
3099 
3100 /******************************* Private *****************************************/
3101 
3102 private:
3103 
private enum LS = 0x2028;       // Unicode line separator (U+2028)
private enum PS = 0x2029;       // Unicode paragraph separator (U+2029)
3106 
3107 /********************************************
3108  * Do our own char maps
3109  */
3110 private static immutable cmtable = ()
3111 {
3112     ubyte[256] table;
3113     foreach (const c; 0 .. table.length)
3114     {
3115         if ('0' <= c && c <= '7')
3116             table[c] |= CMoctal;
3117         if (c_isxdigit(c))
3118             table[c] |= CMhex;
3119         if (c_isalnum(c) || c == '_')
3120             table[c] |= CMidchar;
3121 
3122         switch (c)
3123         {
3124             case 'x': case 'X':
3125             case 'b': case 'B':
3126                 table[c] |= CMzerosecond;
3127                 break;
3128 
3129             case '0': .. case '9':
3130             case 'e': case 'E':
3131             case 'f': case 'F':
3132             case 'l': case 'L':
3133             case 'p': case 'P':
3134             case 'u': case 'U':
3135             case 'i':
3136             case '.':
3137             case '_':
3138                 table[c] |= CMzerosecond | CMdigitsecond;
3139                 break;
3140 
3141             default:
3142                 break;
3143         }
3144 
3145         switch (c)
3146         {
3147             case '\\':
3148             case '\n':
3149             case '\r':
3150             case 0:
3151             case 0x1A:
3152             case '\'':
3153                 break;
3154             default:
3155                 if (!(c & 0x80))
3156                     table[c] |= CMsinglechar;
3157                 break;
3158         }
3159     }
3160     return table;
3161 }();
3162 
// Bit flags combined per character in `cmtable`
private
{
    enum CMoctal  = 0x1;        // '0' .. '7'
    enum CMhex    = 0x2;        // characters accepted by `c_isxdigit`
    enum CMidchar = 0x4;        // characters accepted by `c_isalnum`, and '_'
    enum CMzerosecond = 0x8;    // 'x' 'X' 'b' 'B' plus every CMdigitsecond character
    enum CMdigitsecond = 0x10;  // '0'..'9' and e E f F l L p P u U i . _
    enum CMsinglechar = 0x20;   // 7-bit characters except '\\', '\n', '\r', 0, 0x1A and '\''
}
3172 
/// Returns: true if `c` has the `CMoctal` flag set in `cmtable`
private bool isoctal(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMoctal) != 0;
}
3177 
/// Returns: true if `c` has the `CMhex` flag set in `cmtable`
private bool ishex(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMhex) != 0;
}
3182 
/// Returns: true if `c` has the `CMidchar` flag set in `cmtable`
private bool isidchar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMidchar) != 0;
}
3187 
/// Returns: true if `c` has the `CMzerosecond` flag set in `cmtable`
// NOTE(review): presumably "may follow a leading `0` in a numeric literal" - confirm against callers
private bool isZeroSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMzerosecond) != 0;
}
3192 
/// Returns: true if `c` has the `CMdigitsecond` flag set in `cmtable`
// NOTE(review): presumably "may follow a leading non-zero digit in a numeric literal" - confirm against callers
private bool isDigitSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMdigitsecond) != 0;
}
3197 
/// Returns: true if `c` has the `CMsinglechar` flag set in `cmtable`
private bool issinglechar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMsinglechar) != 0;
}
3202 
/// Returns: true if `c` is an ASCII hexadecimal digit: 0-9, a-f or A-F
private bool c_isxdigit(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            return true;

        default:
            return false;
    }
}
3209 
/// Returns: true if `c` is an ASCII letter or decimal digit: 0-9, a-z or A-Z
private bool c_isalnum(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'z':
        case 'A': .. case 'Z':
            return true;

        default:
            return false;
    }
}
3216 
3217 /******************************* Unittest *****************************************/
3218 
// Valid escape sequences: `Lexer.escapeSequence` must return the expected
// character and consume the whole input.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    ErrorSink errorSink = new ErrorSinkStderr;

    // `sequence` is the escape text without its leading backslash.
    // Checks the decoded value and that `p` ends exactly at the end of `sequence`.
    void test(T)(string sequence, T expected, bool Ccompile = false)
    {
        auto p = cast(const(char)*)sequence.ptr;
        dchar c2;
        Lexer lexer = new Lexer(errorSink);
        assert(expected == lexer.escapeSequence(Loc.initial, p, Ccompile, c2));
        assert(p == sequence.ptr + sequence.length);
    }

    // single character escapes
    test(`'`, '\'');
    test(`"`, '"');
    test(`?`, '?');
    test(`\`, '\\');
    test(`0`, '\0');
    test(`a`, '\a');
    test(`b`, '\b');
    test(`f`, '\f');
    test(`n`, '\n');
    test(`r`, '\r');
    test(`t`, '\t');
    test(`v`, '\v');

    // \x with two hex digits
    test(`x00`, 0x00);
    test(`xff`, 0xff);
    test(`xFF`, 0xff);
    test(`xa7`, 0xa7);
    test(`x3c`, 0x3c);
    test(`xe2`, 0xe2);

    // octal, one to three digits
    test(`1`, '\1');
    test(`42`, '\42');
    test(`357`, '\357');

    // \u with four hex digits
    test(`u1234`, '\u1234');
    test(`uf0e4`, '\uf0e4');

    // \U with eight hex digits
    test(`U0001f603`, '\U0001f603');

    // named character entities
    test(`&quot;`, '"');
    test(`&lt;`, '<');
    test(`&gt;`, '>');
}
3267 
// Invalid escape sequences: `Lexer.escapeSequence` must report the expected
// error, return the expected fallback character and consume the expected
// number of input characters.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    /* Error sink that compares every reported error message with `expected`
     * and remembers that an error was reported.
     */
    static class ErrorSinkTest : ErrorSinkNull
    {
      nothrow:
      extern (C++):
      override:

        import core.stdc.stdio;
        import core.stdc.stdarg;

        string expected;
        bool gotError;

        void error(const ref Loc loc, const(char)* format, ...)
        {
            gotError = true;
            char[100] buffer = void;
            va_list ap;
            va_start(ap, format);
            const written = vsnprintf(buffer.ptr, buffer.length, format, ap);
            va_end(ap);
            // vsnprintf returns a negative value on failure and the untruncated
            // length if the message does not fit; neither is a valid slice bound
            assert(written >= 0 && cast(size_t) written < buffer.length);
            auto actual = buffer[0 .. written];
            assert(expected == actual);
        }
    }

    ErrorSinkTest errorSink = new ErrorSinkTest;

    // `sequence` is the escape text without its leading backslash
    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
    {
        errorSink.expected = expectedError;
        errorSink.gotError = false;
        auto p = cast(const(char)*)sequence.ptr;
        Lexer lexer = new Lexer(errorSink);
        dchar c2;
        auto actualReturnValue = lexer.escapeSequence(Loc.initial, p, Ccompile, c2);
        assert(errorSink.gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
    }

    // unknown escapes (named entities are not accepted when Ccompile is true)
    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);
    test("&quot;", `undefined escape sequence \&`, '&', 1, true);

    // too few hex digits
    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    // values that are not valid UTF characters
    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    // no hex digit at all after the introducer
    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    // named character entities
    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    // octal value out of range
    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);
}
3345 
// Smoke test: lexing "int" yields TOK.int32 and then TOK.endOfFile on
// every further call.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);
    /* Not much here, just trying things out.
     */
    string source = "int"; // We rely on the implicit null-terminator
    ErrorSink sink = new ErrorSinkStderr;
    scope Lexer lexer = new Lexer(null, source.ptr, 0, source.length, false, false, sink, null);

    TOK got = lexer.nextToken();
    assert(got == TOK.int32);

    // Asking again at the end of input keeps returning endOfFile
    foreach (_; 0 .. 3)
    {
        got = lexer.nextToken();
        assert(got == TOK.endOfFile);
    }
}
3365 
// Malformed input: the lexer must reach TOK.endOfFile within a bounded
// number of nextToken() calls and keep returning it afterwards.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    // We don't want to see Lexer error output during these tests.
    ErrorSink sink = new ErrorSinkNull;

    // Test malformed input: even malformed input should end in a TOK.endOfFile.
    static immutable char[][] testcases =
    [   // Testcase must end with 0 or 0x1A.
        [0], // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        [0xFF, 0xFF, 0],
        ['x', '"', 0x1A],
    ];

    foreach (input; testcases)
    {
        scope Lexer lexer = new Lexer(null, input.ptr, 0, input.length-1, false, false, sink, null);

        // At most input.length calls in total are allowed to reach endOfFile
        TOK last = lexer.nextToken();
        for (size_t calls = 1; last != TOK.endOfFile && calls < input.length; ++calls)
            last = lexer.nextToken();
        assert(last == TOK.endOfFile);

        // ...and it must stay there
        last = lexer.nextToken();
        assert(last == TOK.endOfFile);
    }
}