1 /**
2  * Check the arguments to `printf` and `scanf` against the `format` string.
3  *
4  * Copyright:   Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved
5  * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
6  * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/chkformat.d, _chkformat.d)
8  * Documentation:  https://dlang.org/phobos/dmd_chkformat.html
9  * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/chkformat.d
10  */
11 module dmd.chkformat;
12 
13 //import core.stdc.stdio : printf, scanf;
14 import core.stdc.ctype : isdigit;
15 
16 import dmd.astenums;
17 import dmd.cond;
18 import dmd.errorsink;
19 import dmd.expression;
20 import dmd.globals;
21 import dmd.identifier;
22 import dmd.location;
23 import dmd.mtype;
24 import dmd.target;
25 
26 
/******************************************
 * Check that arguments to a printf format string are compatible
 * with that string. Issue diagnostics (deprecations) for incompatibilities.
 *
 * Follows the C99 specification for printf.
 *
 * Takes a generous, rather than strict, view of compatibility.
 * For example, an unsigned value can be formatted with a signed specifier.
 *
 * Diagnosed incompatibilities are:
 *
 * 1. incompatible sizes which will cause argument misalignment
 * 2. dereferencing arguments that are not pointers
 * 3. insufficient number of arguments
 * 4. struct arguments
 * 5. array and slice arguments
 * 6. non-pointer arguments to `s` specifier
 * 7. non-standard formats
 * 8. undefined behavior per C99
 *
 * Per the C Standard, extra arguments are ignored.
 *
 * No attempt is made to fix the arguments or the format string.
 *
 * Params:
 *      loc = location for error messages
 *      format = format string
 *      args = arguments to match with format string
 *      isVa_list = if a "v" function (format check only)
 *      eSink = where the error messages go
 *
 * Returns:
 *      `true` if the arguments ran out before the format specifiers did
 *      (checking stops at that point); `false` otherwise, including when
 *      type-mismatch or invalid-specifier diagnostics were issued
 * References:
 * C99 7.19.6.1
 * https://www.cplusplus.com/reference/cstdio/printf/
 */
public
bool checkPrintfFormat(ref const Loc loc, scope const char[] format, scope Expression[] args, bool isVa_list, ErrorSink eSink)
{
    //printf("checkPrintFormat('%.*s')\n", cast(int)format.length, format.ptr);
    size_t n;    // index of the next unconsumed entry in args
    for (size_t i = 0; i < format.length;)
    {
        if (format[i] != '%')
        {
            ++i;
            continue;
        }
        bool widthStar;
        bool precisionStar;
        size_t j = i;
        const fmt = parsePrintfFormatSpecifier(format, j, widthStar, precisionStar);
        const slice = format[i .. j];   // text of this specifier, used in diagnostics
        i = j;

        if (fmt == Format.percent)
            continue;                   // "%%", no arguments
        if (fmt == Format.GNU_m)
            continue;                   // "%m", no arguments

        if (isVa_list)
        {
            // format check only
            if (fmt == Format.error)
                eSink.deprecation(loc, "format specifier `\"%.*s\"` is invalid", cast(int)slice.length, slice.ptr);
            continue;
        }

        // Fetch the next argument, or diagnose that there are none left
        // and return null.
        Expression getNextArg()
        {
            if (n == args.length)
            {
                eSink.deprecation(loc, "more format specifiers than %d arguments", cast(int)n);
                return null;
            }
            return args[n++];
        }

        void errorMsg(const char* prefix, Expression arg, const char* texpect, Type tactual)
        {
            eSink.deprecation(arg.loc, "%sargument `%s` for format specification `\"%.*s\"` must be `%s`, not `%s`",
                  prefix ? prefix : "", arg.toChars(), cast(int)slice.length, slice.ptr, texpect, tactual.toChars());
        }

        // Consume the argument supplying a `*` width or precision and check
        // that it is an `int`. Returns `false` if the arguments ran out.
        bool checkStarArg(const char* prefix)
        {
            auto starArg = getNextArg();
            if (!starArg)
                return false;
            auto starType = starArg.type.toBasetype();
            if (starType.ty != Tint32 && starType.ty != Tuns32)
                errorMsg(prefix, starArg, "int", starType);
            return true;
        }

        if (widthStar && !checkStarArg("width "))
            return true;

        if (precisionStar && !checkStarArg("precision "))
            return true;

        auto e = getNextArg();
        if (!e)
            return true;
        auto t = e.type.toBasetype();
        auto tnext = t.nextOf();
        const c_longsize = target.c.longsize;
        const ptrsize = target.ptrsize;

        // Types which are promoted to int are allowed.
        // Spec: C99 6.5.2.2.7
        final switch (fmt)
        {
            case Format.u:      // unsigned int
            case Format.d:      // int
                if (t.ty != Tint32 && t.ty != Tuns32)
                    errorMsg(null, e, fmt == Format.u ? "uint" : "int", t);
                break;

            case Format.hhu:    // unsigned char
            case Format.hhd:    // signed char
                if (t.ty != Tint32 && t.ty != Tuns32 && t.ty != Tint8 && t.ty != Tuns8)
                    errorMsg(null, e, fmt == Format.hhu ? "ubyte" : "byte", t);
                break;

            case Format.hu:     // unsigned short int
            case Format.hd:     // short int
                if (t.ty != Tint32 && t.ty != Tuns32 && t.ty != Tint16 && t.ty != Tuns16)
                    errorMsg(null, e, fmt == Format.hu ? "ushort" : "short", t);
                break;

            case Format.lu:     // unsigned long int
            case Format.ld:     // long int
                if (!(t.isintegral() && t.size() == c_longsize))
                {
                    if (fmt == Format.lu)
                        errorMsg(null, e, (c_longsize == 4 ? "uint" : "ulong"), t);
                    else
                        errorMsg(null, e, (c_longsize == 4 ? "int" : "long"), t);
                    // An integer of the wrong size: tell the user how big C `long` is here.
                    if (t.isintegral() && t.size() != c_longsize)
                        eSink.errorSupplemental(e.loc, "C `long` is %d bytes on your system", c_longsize);
                }
                break;

            case Format.llu:    // unsigned long long int
            case Format.lld:    // long long int
                if (t.ty != Tint64 && t.ty != Tuns64)
                    errorMsg(null, e, fmt == Format.llu ? "ulong" : "long", t);
                break;

            case Format.ju:     // uintmax_t
            case Format.jd:     // intmax_t
                if (t.ty != Tint64 && t.ty != Tuns64)
                {
                    if (fmt == Format.ju)
                        errorMsg(null, e, "core.stdc.stdint.uintmax_t", t);
                    else
                        errorMsg(null, e, "core.stdc.stdint.intmax_t", t);
                }
                break;

            case Format.zd:     // size_t
                if (!(t.isintegral() && t.size() == ptrsize))
                    errorMsg(null, e, "size_t", t);
                break;

            case Format.td:     // ptrdiff_t
                if (!(t.isintegral() && t.size() == ptrsize))
                    errorMsg(null, e, "ptrdiff_t", t);
                break;

            case Format.lg:
            case Format.g:      // double
                if (t.ty != Tfloat64 && t.ty != Timaginary64)
                    errorMsg(null, e, "double", t);
                break;

            case Format.Lg:     // long double
                if (t.ty != Tfloat80 && t.ty != Timaginary80)
                    errorMsg(null, e, "real", t);
                break;

            case Format.p:      // pointer
                if (t.ty != Tpointer && t.ty != Tnull && t.ty != Tclass && t.ty != Tdelegate && t.ty != Taarray)
                    errorMsg(null, e, "void*", t);
                break;

            case Format.n:      // pointer to int
                if (!(t.ty == Tpointer && tnext.ty == Tint32 && tnext.isMutable()))
                    errorMsg(null, e, "int*", t);
                break;

            case Format.ln:     // pointer to long int
                if (!(t.ty == Tpointer && tnext.isintegral() && tnext.size() == c_longsize))
                    errorMsg(null, e, (c_longsize == 4 ? "int*" : "long*"), t);
                break;

            case Format.lln:    // pointer to long long int
                if (!(t.ty == Tpointer && tnext.ty == Tint64))
                    errorMsg(null, e, "long*", t);
                break;

            case Format.hn:     // pointer to short
                if (!(t.ty == Tpointer && tnext.ty == Tint16))
                    errorMsg(null, e, "short*", t);
                break;

            case Format.hhn:    // pointer to signed char
                // `hh` selects `signed char`, i.e. D `byte` (Tint8), not `short`.
                if (!(t.ty == Tpointer && tnext.ty == Tint8))
                    errorMsg(null, e, "byte*", t);
                break;

            case Format.jn:     // pointer to intmax_t
                if (!(t.ty == Tpointer && tnext.ty == Tint64))
                    errorMsg(null, e, "core.stdc.stdint.intmax_t*", t);
                break;

            case Format.zn:     // pointer to size_t
                if (!(t.ty == Tpointer && tnext.isintegral() && tnext.isunsigned() && tnext.size() == ptrsize))
                    errorMsg(null, e, "size_t*", t);
                break;

            case Format.tn:     // pointer to ptrdiff_t
                if (!(t.ty == Tpointer && tnext.isintegral() && !tnext.isunsigned() && tnext.size() == ptrsize))
                    errorMsg(null, e, "ptrdiff_t*", t);
                break;

            case Format.c:      // char
                if (t.ty != Tint32 && t.ty != Tuns32)
                    errorMsg(null, e, "char", t);
                break;

            case Format.lc:     // wint_t
                if (t.ty != Tint32 && t.ty != Tuns32)
                    errorMsg(null, e, "wchar_t", t);
                break;

            case Format.s:      // pointer to char string
                if (!(t.ty == Tpointer && (tnext.ty == Tchar || tnext.ty == Tint8 || tnext.ty == Tuns8)))
                    errorMsg(null, e, "char*", t);
                break;

            case Format.ls:     // pointer to wchar_t string
                if (!(t.ty == Tpointer && tnext.ty.isSomeChar && tnext.size() == target.c.wchar_tsize))
                    errorMsg(null, e, "wchar_t*", t);
                break;

            case Format.error:
                eSink.deprecation(loc, "format specifier `\"%.*s\"` is invalid", cast(int)slice.length, slice.ptr);
                break;

            // `GNU_m` and `percent` were filtered out above; the POSIX_m*
            // formats are only produced by the scanf parser.
            case Format.GNU_m:
            case Format.POSIX_ms:
            case Format.POSIX_mls:
            case Format.percent:
                assert(0);
        }
    }
    return false;
}
303 
/******************************************
 * Check that arguments to a scanf format string are compatible
 * with that string. Issue diagnostics (deprecations) for incompatibilities.
 *
 * Follows the C99 specification for scanf.
 *
 * Every argument must be a pointer. For the fixed-width integer
 * conversions the pointee type must match the specifier exactly,
 * including signedness (e.g. `%d` needs `int*`, `%u` needs `uint*`).
 *
 * Diagnosed incompatibilities are:
 *
 * 1. incompatible sizes which will cause argument misalignment
 * 2. dereferencing arguments that are not pointers
 * 3. insufficient number of arguments
 * 4. struct arguments
 * 5. array and slice arguments
 * 6. non-standard formats
 * 7. undefined behavior per C99
 *
 * Per the C Standard, extra arguments are ignored.
 *
 * No attempt is made to fix the arguments or the format string.
 *
 * Params:
 *      loc = location for error messages
 *      format = format string
 *      args = arguments to match with format string
 *      isVa_list = if a "v" function (format check only)
 *      eSink = where the error messages go
 *
 * Returns:
 *      `true` if the arguments ran out before the format specifiers did
 *      (checking stops at that point); `false` otherwise, including when
 *      type-mismatch or invalid-specifier diagnostics were issued
 * References:
 * C99 7.19.6.2
 * https://www.cplusplus.com/reference/cstdio/scanf/
 */
public
bool checkScanfFormat(ref const Loc loc, scope const char[] format, scope Expression[] args, bool isVa_list, ErrorSink eSink)
{
    size_t n = 0;   // index of the next unconsumed entry in args
    for (size_t i = 0; i < format.length;)
    {
        if (format[i] != '%')
        {
            ++i;
            continue;
        }
        bool asterisk;
        size_t j = i;
        const fmt = parseScanfFormatSpecifier(format, j, asterisk);
        const slice = format[i .. j];   // text of this specifier, used in diagnostics
        i = j;

        if (fmt == Format.percent || asterisk)
            continue;   // "%%", "%*": no arguments

        if (isVa_list)
        {
            // format check only
            if (fmt == Format.error)
                eSink.deprecation(loc, "format specifier `\"%.*s\"` is invalid", cast(int)slice.length, slice.ptr);
            continue;
        }

        // Fetch the next argument, or diagnose that there are none left
        // and return null. Assignment-suppressed (`*`) specifiers never
        // reach this point, so running out is always reported.
        Expression getNextArg()
        {
            if (n == args.length)
            {
                eSink.deprecation(loc, "more format specifiers than %d arguments", cast(int)n);
                return null;
            }
            return args[n++];
        }

        void errorMsg(const char* prefix, Expression arg, const char* texpect, Type tactual)
        {
            eSink.deprecation(arg.loc, "%sargument `%s` for format specification `\"%.*s\"` must be `%s`, not `%s`",
                  prefix ? prefix : "", arg.toChars(), cast(int)slice.length, slice.ptr, texpect, tactual.toChars());
        }

        auto e = getNextArg();
        if (!e)
            return true;

        auto t = e.type.toBasetype();
        auto tnext = t.nextOf();
        const c_longsize = target.c.longsize;
        const ptrsize = target.ptrsize;

        final switch (fmt)
        {
            case Format.n:
            case Format.d:      // pointer to int
                if (!(t.ty == Tpointer && tnext.ty == Tint32))
                    errorMsg(null, e, "int*", t);
                break;

            case Format.hhn:
            case Format.hhd:    // pointer to signed char
                // `hh` selects `signed char`, i.e. D `byte` (Tint8), not `short`.
                if (!(t.ty == Tpointer && tnext.ty == Tint8))
                    errorMsg(null, e, "byte*", t);
                break;

            case Format.hn:
            case Format.hd:     // pointer to short
                if (!(t.ty == Tpointer && tnext.ty == Tint16))
                    errorMsg(null, e, "short*", t);
                break;

            case Format.ln:
            case Format.ld:     // pointer to long int
                if (!(t.ty == Tpointer && tnext.isintegral() && !tnext.isunsigned() && tnext.size() == c_longsize))
                    errorMsg(null, e, (c_longsize == 4 ? "int*" : "long*"), t);
                break;

            case Format.lln:
            case Format.lld:    // pointer to long long int
                if (!(t.ty == Tpointer && tnext.ty == Tint64))
                    errorMsg(null, e, "long*", t);
                break;

            case Format.jn:
            case Format.jd:     // pointer to intmax_t
                if (!(t.ty == Tpointer && tnext.ty == Tint64))
                    errorMsg(null, e, "core.stdc.stdint.intmax_t*", t);
                break;

            case Format.zn:
            case Format.zd:     // pointer to size_t
                if (!(t.ty == Tpointer && tnext.isintegral() && tnext.isunsigned() && tnext.size() == ptrsize))
                    errorMsg(null, e, "size_t*", t);
                break;

            case Format.tn:
            case Format.td:     // pointer to ptrdiff_t
                if (!(t.ty == Tpointer && tnext.isintegral() && !tnext.isunsigned() && tnext.size() == ptrsize))
                    errorMsg(null, e, "ptrdiff_t*", t);
                break;

            case Format.u:      // pointer to unsigned int
                if (!(t.ty == Tpointer && tnext.ty == Tuns32))
                    errorMsg(null, e, "uint*", t);
                break;

            case Format.hhu:    // pointer to unsigned char
                if (!(t.ty == Tpointer && tnext.ty == Tuns8))
                    errorMsg(null, e, "ubyte*", t);
                break;

            case Format.hu:     // pointer to unsigned short int
                if (!(t.ty == Tpointer && tnext.ty == Tuns16))
                    errorMsg(null, e, "ushort*", t);
                break;

            case Format.lu:     // pointer to unsigned long int
                if (!(t.ty == Tpointer && tnext.isintegral() && tnext.isunsigned() && tnext.size() == c_longsize))
                    errorMsg(null, e, (c_longsize == 4 ? "uint*" : "ulong*"), t);
                break;

            case Format.llu:    // pointer to unsigned long long int
                if (!(t.ty == Tpointer && tnext.ty == Tuns64))
                    errorMsg(null, e, "ulong*", t);
                break;

            case Format.ju:     // pointer to uintmax_t
                if (!(t.ty == Tpointer && tnext.ty == Tuns64))
                    errorMsg(null, e, "core.stdc.stdint.uintmax_t*", t);
                break;

            case Format.g:      // pointer to float
                if (!(t.ty == Tpointer && tnext.ty == Tfloat32))
                    errorMsg(null, e, "float*", t);
                break;

            case Format.lg:     // pointer to double
                if (!(t.ty == Tpointer && tnext.ty == Tfloat64))
                    errorMsg(null, e, "double*", t);
                break;

            case Format.Lg:     // pointer to long double
                if (!(t.ty == Tpointer && tnext.ty == Tfloat80))
                    errorMsg(null, e, "real*", t);
                break;

            case Format.c:
            case Format.s:      // pointer to char string
                if (!(t.ty == Tpointer && (tnext.ty == Tchar || tnext.ty == Tint8 || tnext.ty == Tuns8)))
                    errorMsg(null, e, "char*", t);
                break;

            case Format.lc:
            case Format.ls:     // pointer to wchar_t string
                if (!(t.ty == Tpointer && tnext.ty.isSomeChar && tnext.size() == target.c.wchar_tsize))
                    errorMsg(null, e, "wchar_t*", t);
                break;

            case Format.p:      // double pointer
                if (!(t.ty == Tpointer && tnext.ty == Tpointer))
                    errorMsg(null, e, "void**", t);
                break;

            case Format.POSIX_ms: // pointer to pointer to char string
                // tnext2 is only dereferenced after both pointer checks pass
                // (short-circuit evaluation), so it cannot be null there.
                Type tnext2 = tnext ? tnext.nextOf() : null;
                if (!(t.ty == Tpointer && tnext.ty == Tpointer && (tnext2.ty == Tchar || tnext2.ty == Tint8 || tnext2.ty == Tuns8)))
                    errorMsg(null, e, "char**", t);
                break;

            case Format.POSIX_mls: // pointer to pointer to wchar_t string
                // Same short-circuit guarantee as for POSIX_ms above.
                Type tnext2 = tnext ? tnext.nextOf() : null;
                if (!(t.ty == Tpointer && tnext.ty == Tpointer && tnext2.ty.isSomeChar && tnext2.size() == target.c.wchar_tsize))
                    errorMsg(null, e, "wchar_t**", t);
                break;

            case Format.error:
                eSink.deprecation(loc, "format specifier `\"%.*s\"` is invalid", cast(int)slice.length, slice.ptr);
                break;

            // `percent` was filtered out above; `GNU_m` is only produced by
            // the printf parser.
            case Format.GNU_m:
            case Format.percent:
                assert(0);
        }
    }
    return false;
}
529 
530 /*****************************************************************************************************/
531 
532 private:
533 
/**************************************
 * Parse a single scanf *format specifier*, which has the form:
 *
 * `%[*][width][length]specifier`
 *
 * Besides the generic specifiers, this recognizes scansets (`[...]`,
 * `l[...]`) and the POSIX `m` assignment-allocation modifier.
 *
 * Params:
 *      format = format string
 *      idx = index of the `%` that starts the format specifier;
 *          on return it is the index at which parsing stopped, which is
 *          past the end of the specifier on success and is also updated
 *          when `Format.error` is returned
 *      asterisk = set if there is a `*` sub-specifier
 * Returns:
 *      Format
 */
Format parseScanfFormatSpecifier(scope const char[] format, ref size_t idx,
        out bool asterisk) nothrow pure @safe
{
    assert(format[idx] == '%');
    const len = format.length;
    auto pos = idx + 1;     // step over the `%`

    // Record how far parsing got and report an invalid specifier.
    Format fail()
    {
        idx = pos;
        return Format.error;
    }

    // Parse a scanset whose opening `[` is at `pos`, and map the modifier
    // seen before it to the resulting format.
    Format scanset(Modifier mod)
    {
        ++pos;
        if (pos == len)
            return fail();
        // If the conversion specifier begins with `[]` or `[^]`, the right
        // bracket character is not the terminator, but in the scanlist.
        if (format[pos] == '^')
        {
            ++pos;
            if (pos == len)
                return fail();
        }
        if (format[pos] == ']')
        {
            ++pos;
            if (pos == len)
                return fail();
        }
        // A scanset can be anything, so we just check that it is paired
        while (pos < len && format[pos] != ']')
            ++pos;
        // no `]` found
        if (pos == len)
            return fail();

        ++pos;              // step over the closing `]`
        idx = pos;
        switch (mod)
        {
            case Modifier.none: return Format.s;
            case Modifier.l:    return Format.ls;
            case Modifier.m:    return Format.POSIX_ms;
            case Modifier.ml:   return Format.POSIX_mls;
            default:            return Format.error;
        }
    }

    if (pos == len)
        return fail();

    if (format[pos] == '%')
    {
        idx = pos + 1;
        return Format.percent;
    }

    // * sub-specifier; only reported once something is known to follow it
    if (format[pos] == '*')
    {
        ++pos;
        if (pos == len)
            return fail();
        asterisk = true;
    }

    // fieldWidth
    while (isdigit(format[pos]))
    {
        ++pos;
        if (pos == len)
            return fail();
    }

    /* Read the specifier
     */
    const lead = format[pos];
    Format result;
    if (lead == 'm')
    {
        // https://pubs.opengroup.org/onlinepubs/9699919799/functions/scanf.html
        // POSIX.1-2017 C Extension (CX)
        auto mod = Modifier.m;
        ++pos;
        if (pos == len)
            return fail();
        if (format[pos] == 'l')
        {
            ++pos;
            if (pos == len)
                return fail();
            mod = Modifier.ml;
        }

        // Check valid conversion types for %m.
        switch (format[pos])
        {
            case 'c', 's':
                result = mod == Modifier.ml ? Format.POSIX_mls : Format.POSIX_ms;
                break;

            case 'C', 'S':
                result = mod == Modifier.m ? Format.POSIX_mls : Format.error;
                break;

            case '[':
                return scanset(mod);

            default:
                result = Format.error;
                break;
        }
        ++pos;
    }
    else if (lead == '[')
    {
        return scanset(Modifier.none);
    }
    else if (lead == 'l' && pos + 1 < len && format[pos + 1] == '[')
    {
        // wchar_t scanset %l[..]
        ++pos;
        return scanset(Modifier.l);
    }
    else
    {
        char generic;
        result = parseGenericFormatSpecifier(format, pos, generic);
        if (result == Format.error)
            return fail();
    }

    idx = pos;
    return result;  // success
}
683 
/**************************************
 * Parse a single printf *format specifier*, which has the form:
 *
 * `%[flags][field width][.precision][length modifier]specifier`
 *
 * After the specifier is read, combinations of flags, width and precision
 * that are not accepted for that specifier are rejected as `Format.error`.
 *
 * Params:
 *      format = format string
 *      idx = index of the `%` that starts the format specifier;
 *          on return it is the index at which parsing stopped, which is
 *          past the end of the specifier on success and is also updated
 *          when `Format.error` is returned
 *      widthStar = set if * for width
 *      precisionStar = set if * for precision
 *      useGNUExts = true if parsing GNU format extensions (`%m`)
 * Returns:
 *      Format
 */
Format parsePrintfFormatSpecifier(scope const char[] format, ref size_t idx,
        out bool widthStar, out bool precisionStar, bool useGNUExts =
        findCondition(global.versionids, Identifier.idPool("CRuntime_Glibc"))) nothrow pure @safe
{
    assert(format[idx] == '%');
    const len = format.length;
    auto pos = idx + 1;     // step over the `%`

    // Which optional parts were seen; needed to validate the combination below.
    bool sawHash;
    bool sawZero;
    bool sawOtherFlag;      // any of `-`, `+`, ` `
    bool sawWidth;
    bool sawPrecision;

    // Record how far parsing got and report an invalid specifier.
    Format fail()
    {
        idx = pos;
        return Format.error;
    }

    // Advance `pos` over decimal digits; `false` if that used up the string.
    bool skipDigits()
    {
        while ('0' <= format[pos] && format[pos] <= '9')
        {
            if (++pos == len)
                return false;
        }
        return true;
    }

    if (pos == len)
        return fail();

    if (format[pos] == '%')
    {
        idx = pos + 1;
        return Format.percent;
    }

    /* Read the `flags`
     */
    flagScan:
    for (;;)
    {
        switch (format[pos])
        {
            case '-', '+', ' ':
                sawOtherFlag = true;
                break;

            case '#':
                sawHash = true;
                break;

            case '0':
                sawZero = true;
                break;

            default:
                break flagScan;
        }
        if (++pos == len)
            return fail();
    }

    /* Read the `field width`
     */
    if (format[pos] == '*')
    {
        sawWidth = true;
        widthStar = true;
        if (++pos == len)
            return fail();
    }
    else if ('1' <= format[pos] && format[pos] <= '9')
    {
        sawWidth = true;
        if (!skipDigits())
            return fail();
    }

    /* Read the `precision`
     */
    if (format[pos] == '.')
    {
        sawPrecision = true;
        if (++pos == len)
            return fail();
        if (format[pos] == '*')
        {
            precisionStar = true;
            if (++pos == len)
                return fail();
        }
        else if (!skipDigits())     // the digits are optional after the `.`
            return fail();
    }

    /* Read the specifier
     */
    char genSpec;
    Format specifier;
    if (format[pos] == 'm' && useGNUExts)
    {
        // https://www.gnu.org/software/libc/manual/html_node/Other-Output-Conversions.html
        specifier = Format.GNU_m;
        genSpec = 'm';
        ++pos;
    }
    else
    {
        specifier = parseGenericFormatSpecifier(format, pos, genSpec);
        if (specifier == Format.error)
            return fail();
    }

    /* Reject flag/width/precision combinations the specifier does not accept
     */
    bool rejected;
    switch (genSpec)
    {
        case 'c', 's', 'C', 'S':
            rejected = sawHash || sawZero;
            break;

        case 'd', 'i':
            rejected = sawHash;
            break;

        case 'm':
            rejected = sawHash || sawZero || sawOtherFlag;
            break;

        case 'n':
            rejected = sawHash || sawZero || sawPrecision || sawWidth || sawOtherFlag;
            break;

        default:
            rejected = false;
            break;
    }
    if (rejected)
        return fail();

    idx = pos;
    return specifier;  // success
}
869 
/* Different kinds of conversion modifiers.

   The comment on each member names the C type the modifier selects.
   Within the scanf parser only `none`, `l`, `m` and `ml` are used directly;
   the remaining members are presumably consumed by the generic specifier
   parser (not verified here).
 */
enum Modifier
{
    none,       // no modifier present
    h,          // short
    hh,         // char
    j,          // intmax_t
    l,          // wint_t/wchar_t
    ll,         // long long int
    L,          // long double
    m,          // char** (POSIX scanf assignment-allocation)
    ml,         // wchar_t** (POSIX scanf assignment-allocation, wide)
    t,          // ptrdiff_t
    z           // size_t
}
885 
/* Different kinds of formatting specifications, variations we don't
   care about are merged. (Like we don't care about the difference between
   f, e, g, a, etc.)

   The comments name the argument type expected by `printf`.
   For `scanf`, every format is a pointer (to the named type, unless noted).

   The argument checkers use `final switch` over this enum, so every member
   added here must also be handled there.
 */
enum Format
{
    d,          // int
    hhd,        // signed char
    hd,         // short int
    ld,         // long int
    lld,        // long long int
    jd,         // intmax_t
    zd,         // size_t
    td,         // ptrdiff_t
    u,          // unsigned int
    hhu,        // unsigned char
    hu,         // unsigned short int
    lu,         // unsigned long int
    llu,        // unsigned long long int
    ju,         // uintmax_t
    g,          // float (scanf) / double (printf)
    lg,         // double (scanf); checked like `g` for printf
    Lg,         // long double (both)
    s,          // char string (both)
    ls,         // wchar_t string (both)
    c,          // char (printf); char buffer, checked like `s` (scanf)
    lc,         // wint_t (printf); wchar_t buffer, checked like `ls` (scanf)
    p,          // pointer (printf); pointer to pointer (scanf)
    n,          // pointer to int
    hhn,        // pointer to signed char
    hn,         // pointer to short
    ln,         // pointer to long int
    lln,        // pointer to long long int
    jn,         // pointer to intmax_t
    zn,         // pointer to size_t
    tn,         // pointer to ptrdiff_t
    GNU_m,      // GNU ext. : string corresponding to the error code in errno (printf)
    POSIX_ms,   // POSIX ext. : dynamically allocated char string  (scanf)
    POSIX_mls,  // POSIX ext. : dynamically allocated wchar_t string (scanf)
    percent,    // %% (i.e. no argument)
    error,      // invalid format specification
}
930 
/**************************************
 * Parse the *length modifier* and the *conversion specifier* of the following form:
 * `[length]specifier`
 *
 * Params:
 *      format = format string
 *      idx = index of the start of the `[length]specifier` part,
 *          which gets updated to the index past the end of it,
 *          even if `Format.error` is returned. It is left unchanged
 *          when it is already at or past the end of `format`.
 *      genSpecifier = Generic specifier. For instance, it will be set to `d` if the
 *           format is `hhd`. It keeps its `out` default value when `format`
 *           ends before a specifier character is reached.
 * Returns:
 *      the `Format` for the modifier/specifier pair, or `Format.error` when the
 *      pair is not supported or `format` ends too early
 */
Format parseGenericFormatSpecifier(scope const char[] format,
    ref size_t idx, out char genSpecifier) nothrow pure @safe
{
    const length = format.length;

    /* Nothing left to parse: report an error rather than index past the
     * end of `format`.
     */
    if (idx >= length)
        return Format.error;

    /* Read the `length modifier`
     */
    const lm = format[idx];
    Modifier flags;
    switch (lm)
    {
        // Single-character modifiers
        case 'j':
        case 'z':
        case 't':
        case 'L':
            flags = lm == 'j' ? Modifier.j :
                    lm == 'z' ? Modifier.z :
                    lm == 't' ? Modifier.t :
                                Modifier.L;
            ++idx;
            if (idx == length)
                return Format.error;    // modifier without a specifier
            break;

        // `h` and `l` may be doubled (`hh`, `ll`)
        case 'h':
        case 'l':
            ++idx;
            if (idx == length)
                return Format.error;    // modifier without a specifier
            if (lm == format[idx])
            {
                flags = lm == 'h' ? Modifier.hh : Modifier.ll;
                ++idx;
                if (idx == length)
                    return Format.error;    // doubled modifier without a specifier
            }
            else
                flags = lm == 'h' ? Modifier.h : Modifier.l;
            break;

        default:
            flags = Modifier.none;
            break;
    }

    /* Read the `specifier`
     */
    Format specifier;
    const sc = format[idx];
    genSpecifier = sc;
    switch (sc)
    {
        case 'd':
        case 'i':
            specifier = flags == Modifier.none ? Format.d   :
                        flags == Modifier.hh   ? Format.hhd :
                        flags == Modifier.h    ? Format.hd  :
                        flags == Modifier.ll   ? Format.lld :
                        flags == Modifier.l    ? Format.ld  :
                        flags == Modifier.j    ? Format.jd  :
                        flags == Modifier.z    ? Format.zd  :
                        flags == Modifier.t    ? Format.td  :
                                                 Format.error;
            break;

        case 'u':
        case 'o':
        case 'x':
        case 'X':
            // `z` and `t` reuse the signed entries: `Format` has no
            // separate unsigned variants for them.
            specifier = flags == Modifier.none ? Format.u   :
                        flags == Modifier.hh   ? Format.hhu :
                        flags == Modifier.h    ? Format.hu  :
                        flags == Modifier.ll   ? Format.llu :
                        flags == Modifier.l    ? Format.lu  :
                        flags == Modifier.j    ? Format.ju  :
                        flags == Modifier.z    ? Format.zd  :
                        flags == Modifier.t    ? Format.td  :
                                                 Format.error;
            break;

        case 'f':
        case 'F':
        case 'e':
        case 'E':
        case 'g':
        case 'G':
        case 'a':
        case 'A':
            specifier = flags == Modifier.none ? Format.g  :
                        flags == Modifier.L    ? Format.Lg :
                        flags == Modifier.l    ? Format.lg :
                                                 Format.error;
            break;

        case 'c':
            specifier = flags == Modifier.none ? Format.c       :
                        flags == Modifier.l    ? Format.lc      :
                                                 Format.error;
            break;

        case 's':
            specifier = flags == Modifier.none ? Format.s       :
                        flags == Modifier.l    ? Format.ls      :
                                                 Format.error;
            break;

        case 'p':
            specifier = flags == Modifier.none ? Format.p :
                                                 Format.error;
            break;

        case 'n':
            specifier = flags == Modifier.none ? Format.n   :
                        flags == Modifier.ll   ? Format.lln :
                        flags == Modifier.l    ? Format.ln  :
                        flags == Modifier.hh   ? Format.hhn :
                        flags == Modifier.h    ? Format.hn  :
                        flags == Modifier.j    ? Format.jn  :
                        flags == Modifier.z    ? Format.zn  :
                        flags == Modifier.t    ? Format.tn  :
                                                 Format.error;
            break;

        case 'C':
            // POSIX.1-2017 X/Open System Interfaces (XSI)
            // %C format is equivalent to %lc
            specifier = flags == Modifier.none ? Format.lc :
                                                 Format.error;
            break;

        case 'S':
            // POSIX.1-2017 X/Open System Interfaces (XSI)
            // %S format is equivalent to %ls
            specifier = flags == Modifier.none ? Format.ls :
                                                 Format.error;
            break;

        default:
            specifier = Format.error;
            break;
    }

    // The specifier character is consumed even when it was rejected.
    ++idx;
    return specifier; // may be Format.error for an unsupported combination
}
1090 
@("parseGenericFormatSpecifier") unittest
{
    char genSpecifier;
    size_t idx;

    // Parse `fmtStr` from index 0, then check the returned Format, the
    // reported generic specifier, and the index the parser stopped at.
    void testG(string fmtStr, Format expectedFormat, char expectedGenSpecifier, size_t expectedIdx)
    {
        idx = 0;
        assert(parseGenericFormatSpecifier(fmtStr, idx, genSpecifier) == expectedFormat);
        assert(genSpecifier == expectedGenSpecifier);
        assert(idx == expectedIdx);
    }

    testG("hhd", Format.hhd, 'd', 3);
    testG("hn", Format.hn, 'n', 2);
    testG("ji", Format.jd, 'i', 2);
    testG("lu", Format.lu, 'u', 2);

    // `z` and `t` with an unsigned conversion map onto the signed entries
    testG("zu", Format.zd, 'u', 2);
    testG("tx", Format.td, 'x', 2);

    // XSI synonyms for `lc` and `ls`
    testG("C", Format.lc, 'C', 1);
    testG("S", Format.ls, 'S', 1);

    // Unknown specifier: rejected, but still consumed and reported
    testG("k", Format.error, 'k', 1);

    // Known specifier with an unsupported length modifier
    testG("Lp", Format.error, 'p', 2);
    testG("hg", Format.error, 'g', 2);

    // Length modifier with no specifier after it: the index stops at the end
    foreach (s; ["h", "hh", "l", "ll", "j", "z", "t", "L"])
    {
        idx = 0;
        assert(parseGenericFormatSpecifier(s, idx, genSpecifier) == Format.error);
        assert(idx == s.length);
    }
}
1111 
@("parsePrintfFormatSpecifier") unittest
{
    // State shared with the helpers below; every parse overwrites it.
    bool gnuMode = false;
    size_t cursor = 0;
    bool starWidth;
    bool starPrecision;

    // One table row: the text to parse, the expected result, and the
    // expected index after parsing.
    static struct Case
    {
        string text;
        Format want;
        size_t stop;
    }

    // Parse `text` from index 0 and verify the result and the final index.
    void expect(string text, Format want, size_t stop)
    {
        cursor = 0;
        const parsed = parsePrintfFormatSpecifier(text, cursor, starWidth, starPrecision, gnuMode);
        assert(parsed == want);
        assert(cursor == stop);
    }

    // Run every row of a table through `expect`.
    void expectTable(Case[] table)
    {
        foreach (row; table)
            expect(row.text, row.want, row.stop);
    }

    // Every text must be rejected with the index at the end of the text.
    void expectRejected(string[] texts)
    {
        foreach (text; texts)
            expect(text, Format.error, text.length);
    }

    // one for each Format
    expect("%d", Format.d, 2);
    assert(!(starWidth || starPrecision));

    expectTable([
        Case("%ld", Format.ld, 3),
        Case("%lld", Format.lld, 4),
        Case("%jd", Format.jd, 3),
        Case("%zd", Format.zd, 3),
        Case("%td", Format.td, 3),
        Case("%g", Format.g, 2),
        Case("%Lg", Format.Lg, 3),
        Case("%p", Format.p, 2),
        Case("%n", Format.n, 2),
        Case("%ln", Format.ln, 3),
        Case("%lln", Format.lln, 4),
        Case("%hn", Format.hn, 3),
        Case("%hhn", Format.hhn, 4),
        Case("%jn", Format.jn, 3),
        Case("%zn", Format.zn, 3),
        Case("%tn", Format.tn, 3),
        Case("%c", Format.c, 2),
        Case("%lc", Format.lc, 3),
        Case("%s", Format.s, 2),
        Case("%ls", Format.ls, 3),
        Case("%%", Format.percent, 2),
    ]);

    // Synonyms
    expectTable([
        Case("%i", Format.d, 2),
        Case("%u", Format.u, 2),
        Case("%o", Format.u, 2),
        Case("%x", Format.u, 2),
        Case("%X", Format.u, 2),
        Case("%f", Format.g, 2),
        Case("%F", Format.g, 2),
        Case("%G", Format.g, 2),
        Case("%a", Format.g, 2),
        Case("%La", Format.Lg, 3),
        Case("%A", Format.g, 2),
        Case("%lg", Format.lg, 3),
    ]);

    // width, precision
    expect("%*d", Format.d, 3);
    assert(starWidth);
    assert(!starPrecision);

    expect("%.*d", Format.d, 4);
    assert(!starWidth);
    assert(starPrecision);

    expect("%*.*d", Format.d, 5);
    assert(starWidth);
    assert(starPrecision);

    // Too short formats
    expectRejected(["%", "%-", "%+", "% ", "%#", "%0", "%*", "%1", "%19", "%.", "%.*", "%.1", "%.12",
                    "%j", "%z", "%t", "%l", "%h", "%ll", "%hh"]);

    // Undefined format combinations
    expectRejected(["%#d", "%llg", "%jg", "%zg", "%tg", "%hg", "%hhg",
                    "%#c", "%0c", "%jc", "%zc", "%tc", "%Lc", "%hc", "%hhc", "%llc",
                    "%#s", "%0s", "%js", "%zs", "%ts", "%Ls", "%hs", "%hhs", "%lls",
                    "%jp", "%zp", "%tp", "%Lp", "%hp", "%lp", "%hhp", "%llp",
                    "%-n", "%+n", "% n", "%#n", "%0n", "%*n", "%1n", "%19n", "%.n", "%.*n", "%.1n", "%.12n", "%Ln", "%K"]);

    expect("%C", Format.lc, 2);
    expect("%S", Format.ls, 2);

    // GNU extensions: these are rejected in ISO mode and in GNU mode alike.
    foreach (text; ["%jm", "%zm", "%tm", "%Lm", "%hm", "%hhm", "%lm", "%llm",
                    "%#m", "%+m", "%-m", "% m", "%0m"])
    {
        foreach (mode; [false, true])
        {
            gnuMode = mode;
            expect(text, Format.error, text.length);
        }
    }

    // Anything starting with a plain `%m` is parsed as `%m` only.
    foreach (text; ["%m", "%md", "%mz", "%mc", "%mm", "%msyz", "%ml", "%mlz", "%mlc", "%mlm"])
    {
        // GNU printf(): accepted
        gnuMode = true;
        expect(text, Format.GNU_m, 2);

        // ISO printf(): rejected
        gnuMode = false;
        expect(text, Format.error, 2);
    }
}
1219 
@("parseScanfFormatSpecifier") unittest
{
    // State shared with the helpers below; every parse overwrites it.
    size_t cursor;
    bool star;

    // One table row: the text to parse, the expected result, and the
    // expected index after parsing.
    static struct Case
    {
        string text;
        Format want;
        size_t stop;
    }

    // Parse `text` from index 0 and verify the result and the final index.
    void expect(string text, Format want, size_t stop)
    {
        cursor = 0;
        const parsed = parseScanfFormatSpecifier(text, cursor, star);
        assert(parsed == want);
        assert(cursor == stop);
    }

    // Same as `expect`, and also verifies the asterisk flag afterwards.
    void expectStar(string text, Format want, size_t stop, bool wantStar)
    {
        expect(text, want, stop);
        assert(star == wantStar);
    }

    // Run every row of a table through `expect`.
    void expectTable(Case[] table)
    {
        foreach (row; table)
            expect(row.text, row.want, row.stop);
    }

    // Every text must be rejected with the index at the end of the text.
    void expectRejected(string[] texts)
    {
        foreach (text; texts)
            expect(text, Format.error, text.length);
    }

    // one for each Format
    expectTable([
        Case("%d", Format.d, 2),
        Case("%hhd", Format.hhd, 4),
        Case("%hd", Format.hd, 3),
        Case("%ld", Format.ld, 3),
        Case("%lld", Format.lld, 4),
        Case("%jd", Format.jd, 3),
        Case("%zd", Format.zd, 3),
        Case("%td", Format.td, 3),
        Case("%u", Format.u, 2),
        Case("%hhu", Format.hhu, 4),
        Case("%hu", Format.hu, 3),
        Case("%lu", Format.lu, 3),
        Case("%llu", Format.llu, 4),
        Case("%ju", Format.ju, 3),
        Case("%g", Format.g, 2),
        Case("%lg", Format.lg, 3),
        Case("%Lg", Format.Lg, 3),
        Case("%p", Format.p, 2),
        Case("%s", Format.s, 2),
        Case("%ls", Format.ls, 3),
        Case("%%", Format.percent, 2),
    ]);

    // Synonyms
    expectTable([
        Case("%i", Format.d, 2),
        Case("%n", Format.n, 2),
        Case("%o", Format.u, 2),
        Case("%x", Format.u, 2),
        Case("%f", Format.g, 2),
        Case("%e", Format.g, 2),
        Case("%a", Format.g, 2),
        Case("%c", Format.c, 2),
    ]);

    // asterisk
    expectStar("%*d", Format.d, 3, true);
    expectStar("%9ld", Format.ld, 4, false);
    expectStar("%*25984hhd", Format.hhd, 10, true);

    // scansets
    expectStar("%[a-zA-Z]", Format.s, 9, false);
    expectStar("%*25l[a-z]", Format.ls, 10, true);
    expectStar("%[]]", Format.s, 4, false);
    expectStar("%[^]]", Format.s, 5, false);

    // Too short formats
    expectRejected(["%", "% ", "%#", "%0", "%*", "%1", "%19",
                    "%j", "%z", "%t", "%l", "%h", "%ll", "%hh", "%K"]);

    // Undefined format combinations
    expectRejected(["%Ld", "%llg", "%jg", "%zg", "%tg", "%hg", "%hhg",
                    "%jc", "%zc", "%tc", "%Lc", "%hc", "%hhc", "%llc",
                    "%jp", "%zp", "%tp", "%Lp", "%hp", "%lp", "%hhp", "%llp",
                    "%-", "%+", "%#", "%0", "%.", "%Ln"]);

    // Invalid scansets
    expectRejected(["%[]", "%[^", "%[^]", "%[s", "%[0-9lld", "%[", "%l[^]"]);

    // Posix extensions: rejected forms
    expectRejected(["%jm", "%zm", "%tm", "%Lm", "%hm", "%hhm", "%lm", "%llm",
                    "%m", "%ma", "%md", "%ml", "%mm", "%mlb", "%mlj", "%mlr", "%mlz",
                    "%LC", "%lC", "%llC", "%jC", "%tC", "%hC", "%hhC", "%zC",
                    "%LS", "%lS", "%llS", "%jS", "%tS", "%hS", "%hhS", "%zS"]);

    // Posix extensions: accepted forms
    expectTable([
        Case("%mc", Format.POSIX_ms, 3),
        Case("%ms", Format.POSIX_ms, 3),
        Case("%m[0-9]", Format.POSIX_ms, 7),
        Case("%mlc", Format.POSIX_mls, 4),
        Case("%mls", Format.POSIX_mls, 4),
        Case("%ml[^0-9]", Format.POSIX_mls, 9),
        Case("%mC", Format.POSIX_mls, 3),
        Case("%mS", Format.POSIX_mls, 3),
    ]);

    expectTable([
        Case("%C", Format.lc, 2),
        Case("%S", Format.ls, 2),
    ]);
}