1 /**
2  * Spell checker
3  *
4  * Does not have any dependencies on the rest of DMD.
5  *
6  * Copyright: Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved
7  * Authors:   Walter Bright, https://www.digitalmars.com
8  * License:   $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9  * Source:    $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/root/speller.d, root/_speller.d)
10  * Documentation:  https://dlang.org/phobos/dmd_root_speller.html
11  * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/root/speller.d
12  */
13 
14 module dmd.root.speller;
15 
16 /**************************************************
17  * Looks for correct spelling.
18  * Looks a distance of up to two.
19  * This does an exhaustive search, so can potentially be very slow.
20  * Params:
21  *      seed = wrongly spelled word
22  *      dg = search delegate of the form `T delegate(const(char)[] p, out int cost)`
23  * Returns:
24  *      T.init = no correct spellings found,
25  *      otherwise the value returned by dg() for first possible correct spelling
26  */
27 auto speller(alias dg)(const(char)[] seed)
28 if (isSearchFunction!dg)
29 {
30     const size_t maxdist = seed.length < 4 ? seed.length / 2 : 2;
31     foreach (distance; 0 .. maxdist)
32     {
33         if (auto p = spellerX!dg(seed, distance != 0))
34             return p;
35         //      if (seedlen > 10)
36         //          break;
37     }
38     return null; // didn't find it
39 }
40 
41 private:
42 
43 import core.stdc.stdlib;
44 import core.stdc.string;
45 import dmd.common.string : SmallBuffer;
46 
47 enum isSearchFunction(alias fun) = is(searchFunctionType!fun);
48 alias searchFunctionType(alias fun) = typeof(() {int x; return fun("", x);}());
49 
50 /*************************************
51  * Spell check level 1.
52  * Params:
53  *      dg = delegate that looks up string in dictionary AA and returns value found
54  *      seed = starting string
55  *      flag = if true, do 2 level lookup otherwise 1 level
56  * Returns:
57  *      whatever dg returns, null if no match
58  */
59 auto spellerX(alias dg)(const(char)[] seed, bool flag)
60 {
61     if (!seed.length)
62         return null;
63 
64     /* Need buffer to store trial strings in
65      */
66     char[30] tmp = void;
67     auto sb = SmallBuffer!char(seed.length + 1, tmp[]);
68     char[] buf = sb[];
69 
70     int cost = int.max;
71     searchFunctionType!dg p = null;
72 
73     /* Deletions */
74     buf[0 .. seed.length - 1] = seed[1 .. $];
75     foreach (i; 0 .. seed.length)
76     {
77         //printf("del buf = '%s'\n", buf);
78         int ncost;
79         auto np = flag ? spellerY!dg(buf[0 .. seed.length - 1], i, ncost)
80                        : dg(buf[0 .. seed.length - 1], ncost);
81         if (combineSpellerResult(p, cost, np, ncost))
82             return p;
83         buf[i] = seed[i];
84     }
85 
86     /* Transpositions */
87     if (!flag)
88     {
89         buf[0 .. seed.length] = seed;
90         for (size_t i = 0; i + 1 < seed.length; i++)
91         {
92             // swap [i] and [i + 1]
93             buf[i] = seed[i + 1];
94             buf[i + 1] = seed[i];
95             //printf("tra buf = '%s'\n", buf);
96             int ncost;
97             auto np = dg(buf[0 .. seed.length], ncost);
98             if (combineSpellerResult(p, cost, np, ncost))
99                 return p;
100             buf[i] = seed[i];
101         }
102     }
103 
104     /* Substitutions */
105     buf[0 .. seed.length] = seed;
106     foreach (i; 0 .. seed.length)
107     {
108         foreach (s; idchars)
109         {
110             buf[i] = s;
111             //printf("sub buf = '%s'\n", buf);
112             int ncost;
113             auto np = flag ? spellerY!dg(buf[0 .. seed.length], i + 1, ncost)
114                            : dg(buf[0 .. seed.length], ncost);
115             if (combineSpellerResult(p, cost, np, ncost))
116                 return p;
117         }
118         buf[i] = seed[i];
119     }
120 
121     /* Insertions */
122     buf[1 .. seed.length + 1] = seed;
123     foreach (i; 0 .. seed.length + 1) // yes, do seed.length+1 iterations
124     {
125         foreach (s; idchars)
126         {
127             buf[i] = s;
128             //printf("ins buf = '%s'\n", buf);
129             int ncost;
130             auto np = flag ? spellerY!dg(buf[0 .. seed.length + 1], i + 1, ncost)
131                            : dg(buf[0 .. seed.length + 1], ncost);
132             if (combineSpellerResult(p, cost, np, ncost))
133                 return p;
134         }
135         if (i < seed.length)
136             buf[i] = seed[i];
137     }
138 
139     return p; // return "best" result
140 }
141 
142 /**********************************************
143  * Do second level of spell matching.
144  * Params:
145  *      dg = delegate that looks up string in dictionary AA and returns value found
146  *      seed = starting string
147  *      index = index into seed[] that is where we will mutate it
148  *      cost = set to cost of match
149  * Returns:
150  *      whatever dg returns, null if no match
151  */
152 auto spellerY(alias dg)(const(char)[] seed, size_t index, out int cost)
153 {
154     if (!seed.length)
155         return null;
156 
157     /* Allocate a buf to store the new string to play with, needs
158      * space for an extra char for insertions
159      */
160     char[30] tmp = void;        // stack allocations are fastest
161     auto sb = SmallBuffer!char(seed.length + 1, tmp[]);
162     char[] buf = sb[];
163     buf[0 .. index] = seed[0 .. index];
164 
165     cost = int.max;             // start with worst possible match
166     searchFunctionType!dg p = null;
167 
168     /* Delete character at seed[index] */
169     if (index < seed.length)
170     {
171         buf[index .. seed.length - 1] = seed[index + 1 .. $]; // seed[] with deleted character
172         int ncost;
173         auto np = dg(buf[0 .. seed.length - 1], ncost); // look it up
174         if (combineSpellerResult(p, cost, np, ncost))   // compare with prev match
175             return p;                                   // cannot get any better
176     }
177 
178     /* Substitute character at seed[index] */
179     if (index < seed.length)
180     {
181         buf[0 .. seed.length] = seed;
182         foreach (s; idchars)
183         {
184             buf[index] = s;     // seed[] with substituted character
185             //printf("sub buf = '%s'\n", buf);
186             int ncost;
187             auto np = dg(buf[0 .. seed.length], ncost);
188             if (combineSpellerResult(p, cost, np, ncost))
189                 return p;
190         }
191     }
192 
193     /* Insert character at seed[index] */
194     buf[index + 1 .. seed.length + 1] = seed[index .. $];
195     foreach (s; idchars)
196     {
197         buf[index] = s;
198         //printf("ins buf = '%s'\n", buf);
199         int ncost;
200         auto np = dg(buf[0 .. seed.length + 1], ncost);
201         if (combineSpellerResult(p, cost, np, ncost))
202             return p;
203     }
204     return p; // return "best" result
205 }
206 
207 
208 /* Characters used to substitute ones in the string we're checking
209  * the spelling on.
210  */
211 immutable string idchars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";
212 
213 /**************************************************
214  * Combine a new result from the spell checker to
215  * find the one with the closest symbol with
216  * respect to the cost defined by the search function
217  * Params:
218  *      p = best found spelling so far, T.init if none found yet.
219  *          If np is better, p is replaced with np
220  *      cost = cost of p (int.max if none found yet).
221  *          If np is better, cost is replaced with ncost
222  *      np = current spelling to check against p, T.init if none
223  *      ncost = cost of np if np is not T.init
224  * Returns:
225  *      true    if the cost is less or equal 0, meaning we can stop looking
226  *      false   otherwise
227  */
228 bool combineSpellerResult(T)(ref T p, ref int cost, T np, int ncost)
229 {
230     if (np && ncost < cost) // if np is better
231     {
232         p = np;             // np is new best match
233         cost = ncost;
234         if (cost <= 0)
235             return true;    // meaning we can stop looking
236     }
237     return false;
238 }
239 
240 /************************************* Tests ***********************/
241 
242 unittest
243 {
244     static immutable string[][] cases =
245     [
246         ["hello", "hell", "y"],
247         ["hello", "hel", "y"],
248         ["hello", "ello", "y"],
249         ["hello", "llo", "y"],
250         ["hello", "hellox", "y"],
251         ["hello", "helloxy", "y"],
252         ["hello", "xhello", "y"],
253         ["hello", "xyhello", "y"],
254         ["hello", "ehllo", "y"],
255         ["hello", "helol", "y"],
256         ["hello", "abcd", "n"],
257         ["hello", "helxxlo", "y"],
258         ["hello", "ehlxxlo", "n"],
259         ["hello", "heaao", "y"],
260         ["_123456789_123456789_123456789_123456789", "_123456789_123456789_123456789_12345678", "y"],
261     ];
262     //printf("unittest_speller()\n");
263 
264     string dgarg;
265 
266     string speller_test(const(char)[] s, ref int cost)
267     {
268         assert(s[$-1] != '\0');
269         //printf("speller_test(%s, %s)\n", dgarg, s);
270         cost = 0;
271         if (dgarg == s)
272             return dgarg;
273         return null;
274     }
275 
276     dgarg = "hell";
277     auto p = speller!speller_test("hello");
278     assert(p !is null);
279     foreach (testCase; cases)
280     {
281         //printf("case [%d]\n", i);
282         dgarg = testCase[1];
283         auto p2 = speller!speller_test(testCase[0]);
284         if (p2)
285             assert(testCase[2][0] == 'y');
286         else
287             assert(testCase[2][0] == 'n');
288     }
289     //printf("unittest_speller() success\n");
290 }