1 /**
2  * xmm specific code generation
3  *
4  * Compiler implementation of the
5  * $(LINK2 https://www.dlang.org, D programming language).
6  *
7  * Copyright:   Copyright (C) 2011-2023 by The D Language Foundation, All Rights Reserved
8  * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
9  * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
10  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgxmm.d, backend/cgxmm.d)
11  */
12 
13 module dmd.backend.cgxmm;
14 
15 import core.stdc.stdio;
16 import core.stdc.stdlib;
17 import core.stdc.string;
18 
19 import dmd.backend.cc;
20 import dmd.backend.cdef;
21 import dmd.backend.code;
22 import dmd.backend.code_x86;
23 import dmd.backend.codebuilder;
24 import dmd.backend.mem;
25 import dmd.backend.el;
26 import dmd.backend.global;
27 import dmd.backend.oper;
28 import dmd.backend.ty;
29 import dmd.backend.xmm;
30 
31 
32 nothrow:
33 @safe:
34 
/*******************************************
 * Classify an opcode as an XMM store opcode.
 * Params:
 *      op = opcode to test
 * Returns:
 *      true if op is one of the STOxxx opcodes enumerated in the body,
 *      false for every other opcode
 */

bool isXMMstore(opcode_t op)
{
    bool isStore;
    switch (op)
    {
        // single precision
        case STOSS:
        case STOAPS:
        case STOUPS:
        // double precision
        case STOSD:
        case STOAPD:
        case STOUPD:
        // integer
        case STOD:
        case STOQ:
        case STODQA:
        case STODQU:
        // half-register stores
        case STOHPD:
        case STOHPS:
        case STOLPD:
        case STOLPS:
            isStore = true;
            break;

        default:
            isStore = false;
            break;
    }
    return isStore;
}
50 
/*******************************************
 * Move constant value into xmm register xreg.
 *
 * 16 and 32 byte constants are only supported when every bit is 0 or
 * every bit is 1; anything else trips assert(0).
 * 4 and 8 byte constants are materialized in a general purpose register
 * and moved across, except in 32 bit mode with an 8 byte constant, which
 * is staged through the floatreg memory slot.
 * Params:
 *      cdb = generated code appends to this
 *      xreg = XMM register to load
 *      ty = type of the constant; tysize(ty) must be 4, 8, 16 or 32
 *      pev = pointer to const value
 *      flags = originally documented as "if set flags based on value";
 *              this implementation never reads it
 */

@trusted
void movxmmconst(ref CodeBuilder cdb, reg_t xreg, tym_t ty, eve* pev, regm_t flags)
{
    //printf("movxmmconst() %s ty: %s value: %lld\n", regm_str(mask(xreg)), tym_str(ty), pev.Vllong);

    const sz = tysize(ty);
    assert(mask(xreg) & XMMREGS);
    const xidx = xreg - XMM0;           // register number relative to XMM0

    if (sz == 16 || sz == 32)
    {
        const bool is256 = (sz == 32);

        // Classify the constant using only the lanes that belong to its size
        bool allZeros;
        bool allOnes;
        if (is256)
        {
            allZeros = pev.Vllong4[0] == 0 && pev.Vllong4[1] == 0 &&
                       pev.Vllong4[2] == 0 && pev.Vllong4[3] == 0;
            allOnes  = pev.Vllong4[0] == ~0 && pev.Vllong4[1] == ~0 &&
                       pev.Vllong4[2] == ~0 && pev.Vllong4[3] == ~0;
        }
        else
        {
            allZeros = pev.Vllong2[0] == 0 && pev.Vllong2[1] == 0;
            allOnes  = pev.Vllong2[0] == ~0 && pev.Vllong2[1] == ~0;
        }

        opcode_t vop;
        if (allZeros)
            vop = PXOR;                             // PXOR xreg,xreg
        else if (allOnes)
            vop = is256 ? PCMPEQQ : PCMPEQD;        // PCMPEQQ/PCMPEQD xreg,xreg
        else
            assert(0);                              // no other vector constants supported

        cdb.gen2(vop,modregxrmx(3,xidx,xidx));
        const tym_t vecTy = is256 ? TYllong4 : TYllong2;
        checkSetVex(cdb.last(), vecTy);
        return;
    }

    /* Generate:
     *    MOV reg,value
     *    MOV xreg,reg
     */
    assert(sz == 4 || sz == 8);
    targ_size_t value;
    if (sz == 8)
        value = cast(targ_size_t)pev.Vullong;
    else
        value = pev.Vint;

    if (value == 0)
    {
        // Floating point zero: clear the register with a self-XOR instead
        const bool isSingle = (ty == TYfloat  || ty == TYifloat);
        const bool isDouble = (ty == TYdouble || ty == TYidouble);
        if (isSingle || isDouble)
        {
            const opcode_t zop = isSingle ? XORPS : XORPD;
            cdb.gen2(zop,modregxrmx(3,xidx,xidx));      // XORPS/XORPD xreg,xreg
            return;
        }
    }

    const bool viaMemory = I32 && sz == 8;
    if (!viaMemory)
    {
        // The whole value fits in one general purpose register
        reg_t gpr;
        regwithvalue(cdb,ALLREGS,value,gpr,(sz == 8) ? 64 : 0);
        cdb.gen2(LODD,modregxrmx(3,xidx,gpr));          // MOVD xreg,reg
        if (sz == 8)
            code_orrex(cdb.last(), REX_W);
        checkSetVex(cdb.last(), TYulong);
        return;
    }

    // 32 bit mode, 8 byte value: write the two halves to floatreg, then load it
    reg_t scratch;
    regm_t scratchMask = ALLREGS;
    allocreg(cdb,&scratchMask,&scratch,TYint);          // allocate scratch register

    static union U { targ_size_t s; targ_long[2] l; }
    U halves = void;
    halves.l[1] = 0;
    halves.s = value;

    movregconst(cdb,scratch,halves.l[0],0);
    cdb.genfltreg(STO,scratch,0);                       // MOV floatreg,r
    movregconst(cdb,scratch,halves.l[1],0);
    cdb.genfltreg(STO,scratch,4);                       // MOV floatreg+4,r

    cdb.genxmmreg(xmmload(TYdouble, true),xreg,0,TYdouble);  // MOVSD XMMreg,floatreg
}
143 
/***********************************************
 * Do simple orthogonal operators for XMM registers.
 *
 * Handles three shapes of binary expression:
 *  1. OPadd/OPmin where one operand is real and the other imaginary: both
 *     operands are evaluated into separate XMM registers and the pair of
 *     registers is the result (for OPmin the sign bit of the right operand
 *     is flipped).
 *  2. Relational operators on non-vector types: a single compare instruction
 *     is emitted and the function returns without calling fixresult().
 *  3. Everything else: `op reg,rreg` with the result left in reg.
 * Params:
 *      cdb = generated code appends to this
 *      e = binary operator elem; operands are e.EV.E1 and e.EV.E2
 *      pretregs = mask of registers the caller wants the result in
 */

@trusted
void orthxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("orthxmm(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;

    // float + ifloat is not actually addition
    if ((e.Eoper == OPadd || e.Eoper == OPmin) &&
        ((tyreal(e1.Ety) && tyimaginary(e2.Ety)) ||
         (tyreal(e2.Ety) && tyimaginary(e1.Ety))))
    {
        regm_t retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;

        // Choose two distinct registers out of retregs: one for e1 (retregs)
        // and one for e2 (rretregs). Which operand gets the register that
        // findreg() returns first depends on which operand is the real one.
        regm_t rretregs;
        reg_t rreg;
        if (tyreal(e1.Ety))
        {
            const reg = findreg(retregs);
            rreg = findreg(retregs & ~mask(reg));
            retregs = mask(reg);
            rretregs = mask(rreg);
        }
        else
        {
            // Pick the second register, not the first
            rreg = findreg(retregs);
            rretregs = mask(rreg);
            const reg = findreg(retregs & ~rretregs);
            retregs = mask(reg);
        }
        assert(retregs && rretregs);

        codelem(cdb,e1,&retregs,false); // eval left leaf
        scodelem(cdb, e2, &rretregs, retregs, true);  // eval right leaf

        // The result occupies both registers
        retregs |= rretregs;
        if (e.Eoper == OPmin)
        {
            // Subtraction: negate the right operand by XORing its register
            // (rreg) with a sign bit constant loaded into a third register.
            regm_t nretregs = XMMREGS & ~retregs;
            reg_t sreg; // hold sign bit
            const uint sz = tysize(e1.Ety);
            allocreg(cdb,&nretregs,&sreg,e2.Ety);
            eve signbit;
            signbit.Vint = 0x80000000;
            if (sz == 8)
                signbit.Vllong = 0x8000_0000_0000_0000;
            movxmmconst(cdb,sreg, e1.Ety, &signbit, 0);
            getregs(cdb,nretregs);
            const opcode_t xop = (sz == 8) ? XORPD : XORPS;       // XORPD/S rreg,sreg
            cdb.gen2(xop,modregxrmx(3,rreg-XMM0,sreg-XMM0));
        }
        if (retregs != *pretregs)
            fixresult(cdb,e,retregs,pretregs);
        return;
    }

    // General case: left operand into reg, right operand into a different
    // XMM register rreg.
    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;
    const constflag = OTrel(e.Eoper);
    codelem(cdb,e1,&retregs,constflag); // eval left leaf
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    scodelem(cdb, e2, &rretregs, retregs, true);  // eval right leaf

    const rreg = findreg(rretregs);
    const op = xmmoperator(e1.Ety, e.Eoper);

    /* We should take advantage of mem addressing modes for OP XMM,MEM
     * but we do not at the moment.
     */
    if (OTrel(e.Eoper) && !tyvector(tybasic(e.Ety)))
    {
        // Scalar comparison: note the operands are encoded as (rreg, reg),
        // the reverse of the arithmetic case below, and no register result
        // is produced (fixresult() is not called).
        // NOTE(review): presumably the consumer of the flags accounts for
        // the swapped operand order - confirm against the jump selection code.
        cdb.gen2(op,modregxrmx(3,rreg-XMM0,reg-XMM0));
        checkSetVex(cdb.last(), e1.Ety);
        return;
    }

    getregs(cdb,retregs);               // reg is about to be overwritten
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    if (op == CMPPS || op == CMPPD)
    {
        // Packed compares take the comparison predicate as an 8 bit immediate
        // https://www.felixcloutier.com/x86/cmpps
        ubyte imm8;
        switch (e.Eoper)
        {
            case OPeqeq: imm8 = 0; break;
            case OPlt:   imm8 = 1; break;
            case OPle:   imm8 = 2; break;
            case OPne:   imm8 = 4; break;
            default:
                elem_print(e);
                assert(0);  // not doing the unordered compares
        }
        // Attach the predicate to the instruction just generated
        code* c = cdb.last();
        c.IFL2 = FLconst;
        c.IEV2.Vsize_t = imm8;
    }
    checkSetVex(cdb.last(), e1.Ety);
    if (retregs != *pretregs)
        fixresult(cdb,e,retregs,pretregs);
}
253 
254 
/************************
 * Generate code for an assignment using XMM registers.
 *
 * Evaluates e2 into an XMM register, computes the address of e1 and emits
 * the store. The special form `*p++ = ...` / `*p-- = ...` with p a register
 * variable is recognized and the pointer adjustment is emitted after the
 * store.
 * Params:
 *      cdb = generated code appends to this
 *      e = the assignment elem, passed to fixresult() for the final result
 *      op = store opcode to use, CMP means generate one from the type of e1
 *      e1 = lvalue elem; freed before returning
 *      e2 = rvalue elem
 *      pretregs = mask of registers (and possibly mPSW) the caller wants
 */
@trusted
void xmmeq(ref CodeBuilder cdb, elem *e, opcode_t op, elem *e1, elem *e2,regm_t *pretregs)
{
    code cs;
    elem *e11;
    targ_int postinc;

    //printf("xmmeq(e1 = %p, e2 = %p, *pretregs = %s)\n", e1, e2, regm_str(*pretregs));
    tym_t tyml = tybasic(e1.Ety);              /* type of lvalue               */
    regm_t retregs = *pretregs;

    if (!(retregs & XMMREGS))
        retregs = XMMREGS;              // pick any XMM reg

    bool aligned = xmmIsAligned(e1);
    // If default, select store opcode
    cs.Iop = (op == CMP) ? xmmstore(tyml, aligned) : op;

    bool regvar = false;                /* true means evaluate into register variable */
    regm_t varregm = 0;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        if (isregvar(e1, varregm, varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2) &&           // and we can compute directly into it
            varregm & XMMREGS
           )
        {   regvar = true;
            retregs = varregm;    // evaluate directly in target register
        }
        else
            varregm = 0;
    }
    if (*pretregs & mPSW && OTleaf(e1.Eoper))     // if evaluating e1 couldn't change flags
    {   // Be careful that this lines up with jmpopcode()
        retregs |= mPSW;
        *pretregs &= ~mPSW;
    }
    scodelem(cdb,e2,&retregs,0,true);    // get rvalue

    // Look for special case of (*p++ = ...), where p is a register variable
    if (e1.Eoper == OPind &&
        ((e11 = e1.EV.E1).Eoper == OPpostinc || e11.Eoper == OPpostdec) &&
        e11.EV.E1.Eoper == OPvar &&
        e11.EV.E1.EV.Vsym.Sfl == FLreg
       )
    {
        postinc = e11.EV.E2.EV.Vint;
        if (e11.Eoper == OPpostdec)
            postinc = -postinc;
        getlvalue(cdb,&cs,e11,RMstore | retregs);
        freenode(e11.EV.E2);
    }
    else
    {   postinc = 0;
        getlvalue(cdb,&cs,e1,RMstore | retregs);       // get lvalue (cl == CNIL if regvar)
    }

    getregs_imm(cdb,regvar ? varregm : 0);

    // Encode the source XMM register into the reg field (plus REX.R for XMM8+)
    const reg = findreg(retregs & XMMREGS);
    cs.Irm |= modregrm(0,(reg - XMM0) & 7,0);
    if ((reg - XMM0) & 8)
        cs.Irex |= REX_R;

    // Do not generate mov from register onto itself
    if (!(regvar && reg == XMM0 + ((cs.Irm & 7) | (cs.Irex & REX_B ? 8 : 0))))
    {
        cdb.gen(&cs);         // MOV EA+offset,reg
        checkSetVex(cdb.last(), tyml);
    }

    if (e1.Ecount ||                     // if lvalue is a CSE or
        regvar)                           // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);        // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,retregs,pretregs);
    if (postinc)
    {
        // Apply the deferred pointer increment/decrement
        const increg = findreg(idxregm(&cs));  // the register to increment
        if (*pretregs & mPSW)
        {   // Use LEA to avoid touching the flags
            uint rm = cs.Irm & 7;
            if (cs.Irex & REX_B)
                rm |= 8;
            cdb.genc1(LEA,buildModregrm(2,increg,rm),FLconst,postinc);
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else if (I64)
        {
            cdb.genc2(0x81,modregrmx(3,0,increg),postinc);  // ADD increg,postinc
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else
        {
            if (postinc == 1)
                cdb.gen1(0x40 + increg);       // INC increg
            else if (postinc == -cast(targ_int)1)
                cdb.gen1(0x48 + increg);       // DEC increg
            else
            {
                cdb.genc2(0x81,modregrm(3,0,increg),postinc);  // ADD increg,postinc
            }
        }
    }
    freenode(e1);
}
378 
/********************************
 * Generate code for conversion using SSE2 instructions.
 *
 * Operators handled by the switch below:
 *
 *      OPs32_d
 *      OPs64_d (64-bit only)
 *      OPu32_d (64-bit only)
 *      OPd_f
 *      OPf_d
 *      OPd_s16
 *      OPd_s32
 *      OPd_u32
 *      OPd_s64 (64-bit only)
 *
 * Any other operator trips assert(0).
 * Params:
 *      cdb = generated code appends to this
 *      e = conversion elem; the operand is e.EV.E1
 *      pretregs = mask of registers the caller wants the result in
 */

@trusted
void xmmcnvt(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmconvt: %p, %s\n", e, regm_str(*pretregs));
    opcode_t op = NoOpcode;     // conversion instruction to emit
    regm_t regs;                // registers the operand is evaluated into
    tym_t ty;                   // type of the result
    ubyte rex = 0;              // REX bits to OR into the instruction
    bool zx = false; // zero extend uint

    /* There are no ops for integer <-> float/real conversions
     * but there are instructions for them. In order to use these
     * try to fuse chained conversions. Be careful not to lose
     * precision for real to long.
     */
    elem *e1 = e.EV.E1;
    switch (e.Eoper)
    {
    case OPd_f:
        // (int -> double) -> float can be fused into a single int -> float,
        // provided the inner conversion is not a common subexpression
        if (e1.Eoper == OPs32_d)
        { }
        else if (I64 && e1.Eoper == OPs64_d)
            rex = REX_W;
        else if (I64 && e1.Eoper == OPu32_d)
        {   rex = REX_W;
            zx = true;
        }
        else
        {   regs = XMMREGS;
            op = CVTSD2SS;
            ty = TYfloat;
            break;
        }
        if (e1.Ecount)
        {
            // inner conversion is shared, so do not fuse
            regs = XMMREGS;
            op = CVTSD2SS;
            ty = TYfloat;
            break;
        }
        // directly use si2ss
        regs = ALLREGS;
        e1 = e1.EV.E1;  // fused operation
        op = CVTSI2SS;
        ty = TYfloat;
        break;

    case OPs32_d:              goto Litod;
    case OPs64_d: rex = REX_W; goto Litod;
    case OPu32_d: rex = REX_W; zx = true; goto Litod;
    Litod:
        // integer -> double: operand in a general register
        regs = ALLREGS;
        op = CVTSI2SD;
        ty = TYdouble;
        break;

    case OPd_s16:
    case OPd_s32: ty = TYint;  goto Ldtoi;
    case OPd_u32: ty = TYlong; if (I64) rex = REX_W; goto Ldtoi;
    case OPd_s64: ty = TYlong; rex = REX_W; goto Ldtoi;
    Ldtoi:
        // double -> integer: operand in an XMM register
        regs = XMMREGS;
        switch (e1.Eoper)
        {
        case OPf_d:
            if (e1.Ecount)
            {
                // inner float -> double is shared, so do not fuse
                op = CVTTSD2SI;
                break;
            }
            e1 = e1.EV.E1;      // fused operation
            op = CVTTSS2SI;
            break;
        case OPld_d:
            if (e.Eoper == OPd_s64)
            {
                // real -> double -> long is handed to cnvt87() instead
                cnvt87(cdb,e,pretregs); // precision
                return;
            }
            goto default;

        default:
            op = CVTTSD2SI;
            break;
        }
        break;

    case OPf_d:
        regs = XMMREGS;
        op = CVTSS2SD;
        ty = TYdouble;
        break;

    default:
        assert(0);
    }
    assert(op != NoOpcode);

    // Evaluate the (possibly fused) operand
    codelem(cdb,e1, &regs, false);
    reg_t reg = findreg(regs);
    if (isXMMreg(reg))
        reg -= XMM0;            // convert to register number for encoding
    else if (zx)
    {   assert(I64);
        getregs(cdb,regs);
        genregs(cdb,0x8B,reg,reg); // MOV reg,reg to zero upper 32-bit
                                   // Don't use x89 because that will get optimized away
        code_orflag(cdb.last(),CFvolatile);
    }

    // Choose the destination register class from the result type
    regm_t retregs = *pretregs;
    if (tyxmmreg(ty)) // target is XMM
    {   if (!(*pretregs & XMMREGS))
            retregs = XMMREGS;
    }
    else              // source is XMM
    {   assert(regs & XMMREGS);
        if (!(retregs & ALLREGS))
            retregs = ALLREGS;
    }

    reg_t rreg;
    allocreg(cdb,&retregs,&rreg,ty);
    if (isXMMreg(rreg))
        rreg -= XMM0;           // convert to register number for encoding

    cdb.gen2(op, modregxrmx(3,rreg,reg));
    assert(I64 || !rex);        // REX bits are only ever set for 64 bit code
    if (rex)
        code_orrex(cdb.last(), rex);

    if (*pretregs != retregs)
        fixresult(cdb,e,retregs,pretregs);
}
526 
/********************************
 * Generate code for op=
 *
 * The right operand is evaluated first. If the lvalue is a register
 * variable that can be computed into directly, the operation is done in
 * place; otherwise the lvalue is loaded, operated on and stored back.
 * Params:
 *      cdb = generated code appends to this
 *      e = op= elem; e.EV.E1 is the lvalue (freed before returning),
 *          e.EV.E2 the right operand
 *      pretregs = mask of registers the caller wants the result in
 */

@trusted
void xmmopass(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;
    tym_t ty1 = tybasic(e1.Ety);

    // Prefer evaluating the right operand outside the requested result registers
    regm_t rretregs = XMMREGS & ~*pretregs;
    if (!rretregs)
        rretregs = XMMREGS;

    codelem(cdb,e2,&rretregs,false); // eval right leaf
    reg_t rreg = findreg(rretregs);

    code cs;
    regm_t retregs;
    reg_t reg;
    bool regvar = false;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        // NOTE(review): unlike xmmeq(), there is no check here that varregm
        // lies in XMMREGS - confirm callers guarantee it.
        reg_t varreg;
        regm_t varregm;
        if (isregvar(e1,varregm,varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2)          // and we can compute directly into it
           )
        {   regvar = true;
            retregs = varregm;
            reg = varreg;                       // evaluate directly in target register
            getregs(cdb,retregs);       // destroy these regs
        }
    }

    if (!regvar)
    {
        // Load the current value of the lvalue into a fresh register
        // NOTE(review): the load here and the store below always request the
        // aligned opcode, whereas xmmeq() consults xmmIsAligned(e1) - confirm
        // this is intended for vector lvalues.
        getlvalue(cdb,&cs,e1,rretregs);         // get EA
        retregs = *pretregs & XMMREGS & ~rretregs;
        if (!retregs)
            retregs = XMMREGS & ~rretregs;
        allocreg(cdb,&retregs,&reg,ty1);
        cs.Iop = xmmload(ty1, true);            // MOVSD xmm,xmm_m64
        code_newreg(&cs,reg - XMM0);
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    const op = xmmoperator(e1.Ety, e.Eoper);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));     // OP reg,rreg
    checkSetVex(cdb.last(), e1.Ety);

    if (!regvar)
    {
        // Write the result back through the same effective address
        cs.Iop = xmmstore(ty1,true);      // reverse operand order of MOVS[SD]
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    if (e1.Ecount ||                     // if lvalue is a CSE or
        regvar)                           // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);        // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,retregs,pretregs);
    freenode(e1);
}
600 
/********************************
 * Generate code for post increment and post decrement.
 *
 * The value of the lvalue before the operation is copied to a separate
 * result register, then the lvalue is updated (in place if it is a
 * register variable, otherwise via load / operate / store).
 * Params:
 *      cdb = generated code appends to this
 *      e = OPpostinc / OPpostdec elem; e.EV.E1 is the lvalue (freed before
 *          returning), e.EV.E2 the amount
 *      pretregs = mask of registers the caller wants the old value in
 */

@trusted
void xmmpost(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;
    tym_t ty1 = tybasic(e1.Ety);

    regm_t retregs;             // register holding the lvalue being updated
    reg_t reg;
    bool regvar = false;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        regm_t varregm;
        if (isregvar(e1,varregm,varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2)          // and we can compute directly into it
           )
        {
            regvar = true;
            retregs = varregm;
            reg = varreg;                       // evaluate directly in target register
            getregs(cdb,retregs);       // destroy these regs
        }
    }

    code cs;
    if (!regvar)
    {
        // Load the current value, preferring a register outside *pretregs
        getlvalue(cdb,&cs,e1,0);                // get EA
        retregs = XMMREGS & ~*pretregs;
        if (!retregs)
            retregs = XMMREGS;
        allocreg(cdb,&retregs,&reg,ty1);
        cs.Iop = xmmload(ty1, true);            // MOVSD xmm,xmm_m64
        code_newreg(&cs,reg - XMM0);
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    // Result register
    regm_t resultregs = XMMREGS & *pretregs & ~retregs;
    if (!resultregs)
        resultregs = XMMREGS & ~retregs;
    reg_t resultreg;
    allocreg(cdb,&resultregs, &resultreg, ty1);

    // Save the pre-operation value as the expression result
    cdb.gen2(xmmload(ty1,true),modregxrmx(3,resultreg-XMM0,reg-XMM0));   // MOVSS/D resultreg,reg
    checkSetVex(cdb.last(), ty1);

    // Evaluate the amount into a register distinct from reg and resultreg
    regm_t rretregs = XMMREGS & ~(*pretregs | retregs | resultregs);
    if (!rretregs)
        rretregs = XMMREGS & ~(retregs | resultregs);
    codelem(cdb,e2,&rretregs,false); // eval right leaf
    const rreg = findreg(rretregs);

    // The opcode comes from xmmoperator(), so the "ADD" below is a SUB for OPpostdec
    const op = xmmoperator(e1.Ety, e.Eoper);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));  // ADD reg,rreg
    checkSetVex(cdb.last(), e1.Ety);

    if (!regvar)
    {
        // Write the updated value back through the same effective address
        cs.Iop = xmmstore(ty1,true);      // reverse operand order of MOVS[SD]
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    if (e1.Ecount ||                     // if lvalue is a CSE or
        regvar)                           // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs); // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    // The value of the expression is the saved copy, not the updated lvalue
    fixresult(cdb,e,resultregs,pretregs);
    freenode(e1);
}
685 
/******************
 * Shared implementation of xmmneg() and xmmabs().
 *
 * Generate:
 *    MOV reg,e1
 *    MOV rreg,constant
 *    OP  reg,rreg
 * where the constant and OP depend on whether the operand is 8 bytes
 * (bits64 / opDouble) or not (bits32 / opSingle).
 * Params:
 *      cdb = generated code appends to this
 *      e = unary operator elem; the operand is e.EV.E1
 *      pretregs = mask of registers the caller wants the result in,
 *                 must not be 0
 *      bits32 = bit pattern to combine with when the operand is not 8 bytes
 *      bits64 = bit pattern to combine with when the operand is 8 bytes
 *      opSingle = instruction to use when the operand is not 8 bytes
 *      opDouble = instruction to use when the operand is 8 bytes
 */

@trusted
private void xmmSignBitOp(ref CodeBuilder cdb, elem *e, regm_t *pretregs,
                          uint bits32, ulong bits64,
                          opcode_t opSingle, opcode_t opDouble)
{
    assert(*pretregs);
    tym_t tyml = tybasic(e.EV.E1.Ety);
    int sz = _tysize[tyml];

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    // Operand into reg
    codelem(cdb,e.EV.E1,&retregs,false);
    getregs(cdb,retregs);
    const reg = findreg(retregs);

    // Constant into a different register, rreg
    regm_t rretregs = XMMREGS & ~retregs;
    reg_t rreg;
    allocreg(cdb,&rretregs,&rreg,tyml);

    eve bits;
    bits.Vint = bits32;
    if (sz == 8)
        bits.Vllong = bits64;
    movxmmconst(cdb,rreg, tyml, &bits, 0);

    getregs(cdb,retregs);
    const op = (sz == 8) ? opDouble : opSingle;
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));     // OP reg,rreg
    fixresult(cdb,e,retregs,pretregs);
}

/******************
 * Negate operator
 *
 * Flips the sign bit of the operand with XORPS / XORPD.
 * Params:
 *      cdb = generated code appends to this
 *      e = negate elem; the operand is e.EV.E1
 *      pretregs = mask of registers the caller wants the result in
 */

@trusted
void xmmneg(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmneg()\n");
    //elem_print(e);
    xmmSignBitOp(cdb, e, pretregs,
                 0x80000000, 0x8000_0000_0000_0000,
                 XORPS, XORPD);                 // XORPD/S reg,signbit
}

/******************
 * Absolute value operator OPabs
 *
 * Clears the sign bit of the operand with ANDPS / ANDPD.
 * Params:
 *      cdb = generated code appends to this
 *      e = OPabs elem; the operand is e.EV.E1
 *      pretregs = mask of registers the caller wants the result in
 */

@trusted
void xmmabs(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmabs()\n");
    //elem_print(e);
    xmmSignBitOp(cdb, e, pretregs,
                 0x7FFF_FFFF, 0x7FFF_FFFF_FFFF_FFFFL,
                 ANDPS, ANDPD);                 // ANDPD/S reg,mask
}
768 
/*****************************
 * Get correct load operator based on type.
 * It is important to use the right one even if the number of bits moved is the same,
 * as there are performance consequences for using the wrong one.
 * Params:
 *      tym = type of data to load
 *      aligned = for vectors, true if aligned to 16 bytes; ignored (treated
 *                as false) when tysize(tym) is 32
 * Returns:
 *      the load opcode; an unsupported type prints it and trips assert(0)
 */

@trusted
opcode_t xmmload(tym_t tym, bool aligned = true)
{
    // 32 byte types always get the unaligned form
    const bool is32bytes = tysize(tym) == 32;
    const bool useAligned = aligned && !is32bytes;

    switch (tybasic(tym))
    {
        // 32 bit integers
        case TYuint:
        case TYint:
        case TYlong:
        case TYulong:
            return LODD;                                // MOVD

        // single precision scalars
        case TYfloat:
        case TYcfloat:
        case TYifloat:
            return LODSS;                               // MOVSS

        // 64 bit integers
        case TYllong:
        case TYullong:
            return LODQ;                                // MOVQ

        // double precision scalars
        case TYdouble:
        case TYcdouble:
        case TYidouble:
            return LODSD;                               // MOVSD

        // float vectors
        case TYfloat8:
        case TYfloat4:
            return useAligned ? LODAPS : LODUPS;        // MOVAPS / MOVUPS

        // double vectors
        case TYdouble4:
        case TYdouble2:
            return useAligned ? LODAPD : LODUPD;        // MOVAPD / MOVUPD

        // integer vectors
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:
        case TYschar32:
        case TYuchar32:
        case TYshort16:
        case TYushort16:
        case TYlong8:
        case TYulong8:
        case TYllong4:
        case TYullong4:
            return useAligned ? LODDQA : LODDQU;        // MOVDQA / MOVDQU

        default:
            printf("tym = x%x\n", tym);
            assert(0);
    }
}
826 
/*****************************
 * Get correct store operator based on type.
 * Params:
 *      tym = type of data to store
 *      aligned = for vectors, selects the aligned (true) or unaligned
 *                (false) form of the store
 * Returns:
 *      the store opcode; an unsupported type prints it and trips assert(0)
 */

@trusted
opcode_t xmmstore(tym_t tym, bool aligned = true)
{
    switch (tybasic(tym))
    {
        // 32 bit integers
        case TYuint:
        case TYint:
        case TYlong:
        case TYulong:
            return STOD;                                // MOVD

        // single precision scalars
        case TYfloat:
        case TYifloat:
            return STOSS;                               // MOVSS

        // 64 bit integers
        case TYllong:
        case TYullong:
            return STOQ;                                // MOVQ

        // double precision scalars, and both complex types
        case TYdouble:
        case TYidouble:
        case TYcdouble:
        case TYcfloat:
            return STOSD;                               // MOVSD

        // float vectors
        case TYfloat8:
        case TYfloat4:
            return aligned ? STOAPS : STOUPS;           // MOVAPS / MOVUPS

        // double vectors
        case TYdouble4:
        case TYdouble2:
            return aligned ? STOAPD : STOUPD;           // MOVAPD / MOVUPD

        // integer vectors
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:
        case TYschar32:
        case TYuchar32:
        case TYshort16:
        case TYushort16:
        case TYlong8:
        case TYulong8:
        case TYllong4:
        case TYullong4:
            return aligned ? STODQA : STODQU;           // MOVDQA / MOVDQU

        default:
            printf("tym = 0x%x\n", tym);
            assert(0);
    }
}
877 
878 
879 /************************************
880  * Get correct XMM operator based on type and operator.
881  */
882 
883 @trusted
884 private opcode_t xmmoperator(tym_t tym, OPER oper)
885 {
886     tym = tybasic(tym);
887     opcode_t op;
888     switch (oper)
889     {
890         case OPadd:
891         case OPaddass:
892         case OPpostinc:
893             switch (tym)
894             {
895                 case TYfloat:
896                 case TYifloat:  op = ADDSS;  break;
897                 case TYdouble:
898                 case TYidouble: op = ADDSD;  break;
899 
900                 // SIMD vector types
901                 case TYfloat8:
902                 case TYfloat4:  op = ADDPS;  break;
903                 case TYdouble4:
904                 case TYdouble2: op = ADDPD;  break;
905                 case TYschar32:
906                 case TYuchar32:
907                 case TYschar16:
908                 case TYuchar16: op = PADDB;  break;
909                 case TYshort16:
910                 case TYushort16:
911                 case TYshort8:
912                 case TYushort8: op = PADDW;  break;
913                 case TYlong8:
914                 case TYulong8:
915                 case TYlong4:
916                 case TYulong4:  op = PADDD;  break;
917                 case TYllong4:
918                 case TYullong4:
919                 case TYllong2:
920                 case TYullong2: op = PADDQ;  break;
921 
922                 default:
923                     printf("tym = x%x\n", tym);
924                     assert(0);
925             }
926             break;
927 
928         case OPmin:
929         case OPminass:
930         case OPpostdec:
931             switch (tym)
932             {
933                 case TYfloat:
934                 case TYifloat:  op = SUBSS;  break;
935                 case TYdouble:
936                 case TYidouble: op = SUBSD;  break;
937 
938                 // SIMD vector types
939                 case TYfloat8:
940                 case TYfloat4:  op = SUBPS;  break;
941                 case TYdouble4:
942                 case TYdouble2: op = SUBPD;  break;
943                 case TYschar32:
944                 case TYuchar32:
945                 case TYschar16:
946                 case TYuchar16: op = PSUBB;  break;
947                 case TYshort16:
948                 case TYushort16:
949                 case TYshort8:
950                 case TYushort8: op = PSUBW;  break;
951                 case TYlong8:
952                 case TYulong8:
953                 case TYlong4:
954                 case TYulong4:  op = PSUBD;  break;
955                 case TYllong4:
956                 case TYullong4:
957                 case TYllong2:
958                 case TYullong2: op = PSUBQ;  break;
959 
960                 default:        assert(0);
961             }
962             break;
963 
964         case OPmul:
965         case OPmulass:
966             switch (tym)
967             {
968                 case TYfloat:
969                 case TYifloat:  op = MULSS;  break;
970                 case TYdouble:
971                 case TYidouble: op = MULSD;  break;
972 
973                 // SIMD vector types
974                 case TYfloat8:
975                 case TYfloat4:  op = MULPS;  break;
976                 case TYdouble4:
977                 case TYdouble2: op = MULPD;  break;
978                 case TYshort16:
979                 case TYushort16:
980                 case TYshort8:
981                 case TYushort8: op = PMULLW; break;
982                 case TYlong8:
983                 case TYulong8:
984                 case TYlong4:
985                 case TYulong4:  op = PMULLD; break;
986 
987                 default:        assert(0);
988             }
989             break;
990 
991         case OPdiv:
992         case OPdivass:
993             switch (tym)
994             {
995                 case TYfloat:
996                 case TYifloat:  op = DIVSS;  break;
997                 case TYdouble:
998                 case TYidouble: op = DIVSD;  break;
999 
1000                 // SIMD vector types
1001                 case TYfloat8:
1002                 case TYfloat4:  op = DIVPS;  break;
1003                 case TYdouble4:
1004                 case TYdouble2: op = DIVPD;  break;
1005 
1006                 default:        assert(0);
1007             }
1008             break;
1009 
1010         case OPor:
1011         case OPorass:
1012             switch (tym)
1013             {
1014                 // SIMD vector types
1015                 case TYschar16:
1016                 case TYuchar16:
1017                 case TYshort8:
1018                 case TYushort8:
1019                 case TYlong4:
1020                 case TYulong4:
1021                 case TYllong2:
1022                 case TYullong2:
1023                 case TYschar32:
1024                 case TYuchar32:
1025                 case TYshort16:
1026                 case TYushort16:
1027                 case TYlong8:
1028                 case TYulong8:
1029                 case TYllong4:
1030                 case TYullong4: op = POR; break;
1031 
1032                 default:        assert(0);
1033             }
1034             break;
1035 
1036         case OPand:
1037         case OPandass:
1038             switch (tym)
1039             {
1040                 // SIMD vector types
1041                 case TYschar16:
1042                 case TYuchar16:
1043                 case TYshort8:
1044                 case TYushort8:
1045                 case TYlong4:
1046                 case TYulong4:
1047                 case TYllong2:
1048                 case TYullong2:
1049                 case TYschar32:
1050                 case TYuchar32:
1051                 case TYshort16:
1052                 case TYushort16:
1053                 case TYlong8:
1054                 case TYulong8:
1055                 case TYllong4:
1056                 case TYullong4: op = PAND; break;
1057 
1058                 default:        assert(0);
1059             }
1060             break;
1061 
1062         case OPxor:
1063         case OPxorass:
1064             switch (tym)
1065             {
1066                 // SIMD vector types
1067                 case TYschar16:
1068                 case TYuchar16:
1069                 case TYshort8:
1070                 case TYushort8:
1071                 case TYlong4:
1072                 case TYulong4:
1073                 case TYllong2:
1074                 case TYullong2:
1075                 case TYschar32:
1076                 case TYuchar32:
1077                 case TYshort16:
1078                 case TYushort16:
1079                 case TYlong8:
1080                 case TYulong8:
1081                 case TYllong4:
1082                 case TYullong4: op = PXOR; break;
1083 
1084                 default:        assert(0);
1085             }
1086             break;
1087 
1088         case OPgt:
1089             switch (tym)
1090             {
1091                 case TYschar32:
1092                 case TYuchar32:
1093                 case TYschar16:
1094                 case TYuchar16: op = PCMPGTB;  break;
1095                 case TYshort16:
1096                 case TYushort16:
1097                 case TYshort8:
1098                 case TYushort8: op = PCMPGTW;  break;
1099                 case TYlong8:
1100                 case TYulong8:
1101                 case TYlong4:
1102                 case TYulong4:  op = PCMPGTD;  break;
1103                 case TYllong4:
1104                 case TYullong4:
1105                 case TYllong2:
1106                 case TYullong2: op = PCMPGTQ;  break;
1107                 default:
1108                     goto Lfloatcmp;
1109             }
1110             break;
1111 
1112         case OPeqeq:
1113             switch (tym)
1114             {
1115                 case TYschar32:
1116                 case TYuchar32:
1117                 case TYschar16:
1118                 case TYuchar16: op = PCMPEQB;  break;
1119                 case TYshort16:
1120                 case TYushort16:
1121                 case TYshort8:
1122                 case TYushort8: op = PCMPEQW;  break;
1123                 case TYlong8:
1124                 case TYulong8:
1125                 case TYlong4:
1126                 case TYulong4:  op = PCMPEQD;  break;
1127                 case TYllong4:
1128                 case TYullong4:
1129                 case TYllong2:
1130                 case TYullong2: op = PCMPEQQ;  break;
1131                 default:
1132                     goto Lfloatcmp;
1133             }
1134             break;
1135 
1136         case OPlt:
1137         case OPle:
1138         case OPge:
1139         case OPne:
1140         case OPunord:        /* !<>=         */
1141         case OPlg:           /* <>           */
1142         case OPleg:          /* <>=          */
1143         case OPule:          /* !>           */
1144         case OPul:           /* !>=          */
1145         case OPuge:          /* !<           */
1146         case OPug:           /* !<=          */
1147         case OPue:           /* !<>          */
1148         case OPngt:
1149         case OPnge:
1150         case OPnlt:
1151         case OPnle:
1152         case OPord:
1153         case OPnlg:
1154         case OPnleg:
1155         case OPnule:
1156         case OPnul:
1157         case OPnuge:
1158         case OPnug:
1159         case OPnue:
1160         Lfloatcmp:
1161             switch (tym)
1162             {
1163                 case TYfloat:
1164                 case TYifloat:  op = UCOMISS;  break;
1165                 case TYdouble:
1166                 case TYidouble: op = UCOMISD;  break;
1167 
1168                 case TYfloat4:
1169                 case TYfloat8:
1170                 case TYfloat16: op = CMPPS;    break;
1171 
1172                 case TYdouble2:
1173                 case TYdouble4:
1174                 case TYdouble8: op = CMPPD;    break;
1175                 default:        assert(0);
1176             }
1177             break;
1178 
1179         default:
1180             assert(0);
1181     }
1182     return op;
1183 }
1184 
/***************
 * Generate code for a `vector` elem, i.e. an `__simd()` style intrinsic
 * whose opcode and operands are passed as a parameter list.
 *
 * The parameter tree under e.EV.E1 is flattened into an array holding,
 * in order: the opcode constant, the first operand, an optional second
 * operand and an optional trailing imm8.
 * Params:
 *      cdb = generated code appends to this
 *      e = the vector elem
 *      pretregs = on entry, mask of acceptable result registers; if it is 0
 *                 the parameters are evaluated for their side effects only
 */
@trusted
void cdvector(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    /* e should look like one of:
     *    vector
     *      |
     *    param
     *    /   \
     *  param op2
     *  /   \
     * op   op1
     */

    if (!config.fpxmmregs)
    {   printf("SIMD operations not supported on this platform\n");
        exit(1);
    }

    // Flatten the parameter tree into a temporary C-heap array
    const n = el_nparams(e.EV.E1);
    assert(n < size_t.max / (2 * (elem *).sizeof));   // conservative overflow check
    elem **params = cast(elem **)malloc(n * (elem *).sizeof);
    assert(params);
    elem **tmp = params;        // el_paramArray() is handed a cursor, keep params intact
    el_paramArray(&tmp, e.EV.E1);

static if (0)
{
    printf("cdvector()\n");
    for (int i = 0; i < n; i++)
    {
        printf("[%d]: ", i);
        elem_print(params[i]);
    }
}

    if (*pretregs == 0)
    {   /* Evaluate for side effects only
         */
        foreach (i; 0 .. n)
        {
            codelem(cdb,params[i], pretregs, false);
            *pretregs = 0;      // in case they got set
        }
        free(params);           // do not leak the parameter array on this early exit
        return;
    }

    assert(n >= 2 && n <= 4);

    elem *eop = params[0];      // the opcode, as a constant
    elem *op1 = params[1];
    elem *op2 = null;
    tym_t ty2 = 0;
    if (n >= 3)
    {   op2 = params[2];
        ty2 = tybasic(op2.Ety);
    }

    auto op = cast(opcode_t)el_tolong(eop);
    debug assert(!isXMMstore(op));      // stores are handled by cdvecsto()
    tym_t ty1 = tybasic(op1.Ety);

    regm_t retregs;
    if (n == 3 && ty2 == TYuchar && op2.Eoper == OPconst)
    {   // Handle: op xmm,imm8

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        codelem(cdb,op1,&retregs,false); // eval left leaf
        const reg = findreg(retregs);

        /* Translate the shift mnemonic into its immediate form:
         * op becomes the opcode to emit and r the value that goes
         * into the reg field of the mod r/m byte.
         */
        int r;
        switch (op)
        {
            case PSLLD:  r = 6; op = 0x660F72;  break;
            case PSLLQ:  r = 6; op = 0x660F73;  break;
            case PSLLW:  r = 6; op = 0x660F71;  break;
            case PSRAD:  r = 4; op = 0x660F72;  break;
            case PSRAW:  r = 4; op = 0x660F71;  break;
            case PSRLD:  r = 2; op = 0x660F72;  break;
            case PSRLQ:  r = 2; op = 0x660F73;  break;
            case PSRLW:  r = 2; op = 0x660F71;  break;
            case PSRLDQ: r = 3; op = 0x660F73;  break;
            case PSLLDQ: r = 7; op = 0x660F73;  break;

            default:
                printf("op = x%x\n", op);
                assert(0);
        }
        getregs(cdb,retregs);
        cdb.genc2(op,modregrmx(3,r,reg-XMM0), cast(uint)el_tolong(op2));
    }
    else if (n == 2)
    {   /* Handle: op xmm,mem
         * where xmm is written only, not read
         */
        code cs;

        if ((op1.Eoper == OPind && !op1.Ecount) || op1.Eoper == OPvar)
        {
            getlvalue(cdb,&cs, op1, RMload);     // get addressing mode
        }
        else
        {
            // Source is not directly addressable: evaluate it into an XMM
            // register and build a register-direct mod r/m for it
            regm_t rretregs = XMMREGS;
            codelem(cdb,op1, &rretregs, false);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        reg_t reg;
        allocreg(cdb,&retregs, &reg, e.Ety);
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else if (n == 3 || n == 4)
    {   /* Handle:
         *      op xmm,mem        // n = 3
         *      op xmm,mem,imm8   // n = 4
         * Both xmm and mem are operands, evaluate xmm first.
         */

        code cs;

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        codelem(cdb,op1,&retregs,false); // eval left leaf
        const reg = findreg(retregs);

        /* MOVHLPS and LODLPS have the same opcode. They are distinguished
         * by MOVHLPS has a second operand of size 128, LODLPS has 64
         *  https://www.felixcloutier.com/x86/movlps
         *  https://www.felixcloutier.com/x86/movhlps
         * MOVHLPS must be an XMM operand, LODLPS must be a memory operand
         */
        const isMOVHLPS = op == MOVHLPS && tysize(ty2) == 16;

        if (((op2.Eoper == OPind && !op2.Ecount) || op2.Eoper == OPvar) && !isMOVHLPS)
        {
            getlvalue(cdb,&cs, op2, RMload | retregs);     // get addressing mode
        }
        else
        {
            // load op2 into XMM register
            regm_t rretregs = XMMREGS & ~retregs;
            scodelem(cdb, op2, &rretregs, retregs, true);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        getregs(cdb,retregs);

        // These opcodes need an imm8; with only 3 parameters, report the
        // omission (final pass only) and fall back to an immediate of 0
        switch (op)
        {
            case CMPPD:   case CMPSS:   case CMPSD:   case CMPPS:
            case PSHUFD:  case PSHUFHW: case PSHUFLW:
            case BLENDPD: case BLENDPS: case DPPD:    case DPPS:
            case MPSADBW: case PBLENDW:
            case ROUNDPD: case ROUNDPS: case ROUNDSD: case ROUNDSS:
            case SHUFPD:  case SHUFPS:
                if (n == 3)
                {
                    if (pass == BackendPass.final_)
                        error(e.Esrcpos.Sfilename, e.Esrcpos.Slinnum, e.Esrcpos.Scharnum, "missing 4th parameter to `__simd()`");
                    cs.IFL2 = FLconst;
                    cs.IEV2.Vsize_t = 0;
                }
                break;
            default:
                break;
        }

        if (n == 4)
        {
            elem *imm8 = params[3];
            cs.IFL2 = FLconst;
            if (imm8.Eoper != OPconst)
            {
                error(imm8.Esrcpos.Sfilename, imm8.Esrcpos.Slinnum, imm8.Esrcpos.Scharnum, "last parameter to `__simd()` must be a constant");
                cs.IEV2.Vsize_t = 0;
            }
            else
                cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8);
        }
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else
        assert(0);
    fixresult(cdb,e,retregs,pretregs);
    free(params);
    freenode(e);
}
1391 
/***************
 * Generate code for vector "store" operations.
 *
 * Expected shape of the tree e:
 *  (op1 OPvecsto (op OPparam op2))
 * with op being the constant holding the STOxxxx store instruction.
 * The actual code generation is delegated to xmmeq().
 * Params:
 *      cdb = generated code appends to this
 *      e = the OPvecsto elem
 *      pretregs = passed through to xmmeq()
 */
@trusted
void cdvecsto(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("cdvecsto()\n");
    //elem_print(e);
    elem *target = e.EV.E1;             // op1
    elem *param  = e.EV.E2;             // the OPparam node: (op, op2)
    elem *value  = param.EV.E2;         // op2

    const storeOp = cast(opcode_t)el_tolong(param.EV.E1);
    debug assert(isXMMstore(storeOp));

    xmmeq(cdb, e, storeOp, target, value, pretregs);
}
1410 
/***************
 * Generate code for OPvecfill (broadcast).
 * OPvecfill takes the single value in e1 and
 * fills the vector type with it.
 *
 * For each element type there are up to two strategies:
 * a broadcast straight from memory when e1 is an unshared OPind and the
 * required AVX level is available, otherwise e1 is evaluated into a
 * register and replicated with broadcast/shuffle/unpack instructions.
 * Params:
 *      cdb = generated code appends to this
 *      e = the OPvecfill elem
 *      pretregs = mask of acceptable result registers; narrowed to
 *                 XMMREGS, and all of XMMREGS is used if none overlap
 */
@trusted
void cdvecfill(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("cdvecfill(e = %p, *pretregs = %s)\n",e,regm_str(*pretregs));

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    code cs;

    elem *e1 = e.EV.E1;
static if (0)
{
    if ((e1.Eoper == OPind && !e1.Ecount) || e1.Eoper == OPvar)
    {
        cr = getlvalue(&cs, e1, RMload | retregs);     // get addressing mode
    }
    else
    {
        regm_t rretregs = XMMREGS & ~retregs;
        cr = scodelem(op2, &rretregs, retregs, true);
        const rreg = findreg(rretregs) - XMM0;
        cs.Irm = modregrm(3,0,rreg & 7);
        cs.Iflags = 0;
        cs.Irex = 0;
        if (rreg & 8)
            cs.Irex |= REX_B;
    }
}

    /* e.Ety only gives us the size of the result vector, not its type.
     * We must combine it with the vector element type, e1.Ety, to
     * form the resulting vector type, ty.
     * The reason is someone may have painted the result of the OPvecfill to
     * a different vector type.
     */
    const sz = tysize(e.Ety);
    const ty1 = tybasic(e1.Ety);
    assert(sz == 16 || sz == 32);
    const bool x16 = (sz == 16);

    tym_t ty;
    switch (ty1)
    {
        case TYfloat:   ty = x16 ? TYfloat4  : TYfloat8;   break;
        case TYdouble:  ty = x16 ? TYdouble2 : TYdouble4;  break;
        case TYschar:   ty = x16 ? TYschar16 : TYschar32;  break;
        case TYuchar:   ty = x16 ? TYuchar16 : TYuchar32;  break;
        case TYshort:   ty = x16 ? TYshort8  : TYshort16;  break;
        case TYushort:  ty = x16 ? TYushort8 : TYushort16; break;
        case TYint:
        case TYlong:    ty = x16 ? TYlong4   : TYlong8;    break;
        case TYuint:
        case TYulong:   ty = x16 ? TYulong4  : TYulong8;   break;
        case TYllong:   ty = x16 ? TYllong2  : TYllong4;   break;
        case TYullong:  ty = x16 ? TYullong2 : TYullong4;  break;

        default:
            assert(0);
    }

    switch (ty)
    {
        // 32 bit float elements
        case TYfloat4:
        case TYfloat8:
            if (config.avx && e1.Eoper == OPind && !e1.Ecount)
            {
                // VBROADCASTSS X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VBROADCASTSS;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,false); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VBROADCASTSS X/YMM,XMM
                    cdb.gen2(VBROADCASTSS, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)SHUFPS XMM,XMM,0
                    cdb.genc2(SHUFPS, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), ty);
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        // 64 bit float elements
        case TYdouble2:
        case TYdouble4:
            if (config.avx && tysize(ty) == 32 && e1.Eoper == OPind && !e1.Ecount)
            {
                // VBROADCASTSD YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VBROADCASTSD;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,false); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2 && tysize(ty) == 32)
                {
                    // VBROADCASTSD YMM,XMM
                    cdb.gen2(VBROADCASTSD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)UNPCKLPD XMM,XMM
                    cdb.gen2(UNPCKLPD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), TYdouble2); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        // 8 bit integer elements
        case TYschar16:
        case TYuchar16:
        case TYschar32:
        case TYuchar32:
            if (config.avx >= 2 && e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTB X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VPBROADCASTB;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                // Evaluate the scalar into a general purpose register first,
                // then move it into the XMM result register
                regm_t regm = ALLREGS;
                codelem(cdb,e1,&regm,true); // eval left leaf
                const r = findreg(regm);

                reg_t reg;
                allocreg(cdb,&retregs,&reg, e.Ety);
                reg -= XMM0;
                // (V)MOVD reg,r
                cdb.gen2(LODD,modregxrmx(3,reg,r));
                checkSetVex(cdb.last(), TYushort8);
                if (config.avx >= 2)
                {
                    // VPBROADCASTB X/YMM,XMM
                    cdb.gen2(VPBROADCASTB, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    if (config.avx)
                    {
                        // Shuffle with an all-zero control register, allocated
                        // from the XMM registers not holding the result
                        reg_t zeroreg;
                        regm = XMMREGS & ~retregs;
                        // VPXOR XMM1,XMM1,XMM1
                        allocreg(cdb,&regm,&zeroreg, ty);
                        zeroreg -= XMM0;
                        cdb.gen2(PXOR, modregxrmx(3,zeroreg,zeroreg));
                        checkSetVex(cdb.last(), TYuchar16); // AVX-128
                        // VPSHUFB XMM,XMM,XMM1
                        cdb.gen2(PSHUFB, modregxrmx(3,reg,zeroreg));
                        checkSetVex(cdb.last(), TYuchar16); // AVX-128
                    }
                    else
                    {
                        // PUNPCKLBW XMM,XMM
                        cdb.gen2(PUNPCKLBW, modregxrmx(3,reg,reg));
                        // PUNPCKLWD XMM,XMM
                        cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg));
                        // PSHUFD XMM,XMM,0
                        cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    }
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        // 16 bit integer elements
        case TYshort8:
        case TYushort8:
        case TYshort16:
        case TYushort16:
            if (config.avx >= 2 && e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTW X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VPBROADCASTW;
                cs.Irex &= ~REX_W;
                cs.Iflags &= ~CFopsize;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                // Evaluate the scalar into a general purpose register first,
                // then move it into the XMM result register
                regm_t regm = ALLREGS;
                codelem(cdb,e1,&regm,true); // eval left leaf
                reg_t r = findreg(regm);

                reg_t reg;
                allocreg(cdb,&retregs,&reg, e.Ety);
                reg -= XMM0;
                // (V)MOVD reg,r
                cdb.gen2(LODD,modregxrmx(3,reg,r));
                checkSetVex(cdb.last(), TYushort8);
                if (config.avx >= 2)
                {
                    // VPBROADCASTW X/YMM,XMM
                    cdb.gen2(VPBROADCASTW, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PUNPCKLWD XMM,XMM
                    cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), TYushort8); // AVX-128
                    // (V)PSHUFD XMM,XMM,0
                    cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYushort8); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        // 32 bit integer elements
        case TYlong8:
        case TYulong8:
        case TYlong4:
        case TYulong4:
            if (config.avx && e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTD/VBROADCASTSS X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = config.avx >= 2 ? VPBROADCASTD : VBROADCASTSS;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,true); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VPBROADCASTD X/YMM,XMM
                    cdb.gen2(VPBROADCASTD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PSHUFD XMM,XMM,0
                    cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYulong4); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        // 64 bit integer elements
        case TYllong2:
        case TYullong2:
        case TYllong4:
        case TYullong4:
            if (e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTQ/VBROADCASTSD/(V)PUNPCKLQDQ X/YMM,MEM
                // NOTE(review): unlike the other cases this path is also taken
                // without AVX; confirm that PUNPCKLQDQ from memory into a
                // freshly allocated register yields the intended broadcast
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = config.avx >= 2 ? VPBROADCASTQ : tysize(ty) == 32 ? VBROADCASTSD : PUNPCKLQDQ;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,true); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VPBROADCASTQ X/YMM,XMM
                    cdb.gen2(VPBROADCASTQ, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PUNPCKLQDQ XMM,XMM
                    cdb.genc2(PUNPCKLQDQ, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYullong2); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        default:
            assert(0);
    }

    fixresult(cdb,e,retregs,pretregs);
}
1778 
/*******************************************
 * Check whether lvalue e could be a vector sitting on its natural
 * (16/32 byte) alignment boundary.
 * The answer is optimistic: anything that cannot be shown to be
 * misaligned is reported as aligned.
 * Params:
 *      e = lvalue
 * Returns:
 *      false if definitely not aligned, true otherwise
 */

@trusted
bool xmmIsAligned(elem *e)
{
    // Only vector-typed variables can be disproved; assume aligned for the rest
    if (!tyvector(e.Ety) || e.Eoper != OPvar)
        return true;

    Symbol *sym = e.EV.Vsym;
    const required = tyalignsize(e.Ety);

    // The symbol itself is aligned less strictly than the vector type needs
    if (Symbol_Salignsize(*sym) < required)
        return false;

    // The offset into the symbol is not a multiple of the alignment
    if (e.EV.Voffset & (required - 1))
        return false;

    // The requirement exceeds STACKALIGN
    if (required > STACKALIGN)
        return false;

    return true;        // assume aligned
}
1803 
/**************************************
 * A VEX prefix is encoded in either 2 or 3 bytes.
 * Inspect the VEX fields of c and, when the 3 byte form
 * is required, mark the instruction with CFvex3.
 * Params:
 *      c = instruction whose Ivex fields are already filled in
 */

void checkSetVex3(code *c)
{
    // See Intel Vol. 2A 2.3.5.6

    // Conditions that apply regardless of target mode
    bool need3 = c.Ivex.w || !c.Ivex.x || !c.Ivex.b || c.Ivex.mmmm > 0x1;

    // Additional conditions that only apply when not generating 64 bit code
    if (!need3 && !I64)
        need3 = c.Ivex.r || !(c.Ivex.vvvv & 8);

    if (need3)
        c.Iflags |= CFvex3;
}
1819 
/*************************************
 * Determine if operation should be rewritten as a VEX
 * operation; and do so.
 *
 * The rewrite happens when config.avx is non-zero or the operand
 * is 32 bytes in size. Otherwise c is left untouched.
 * When rewriting, c.Iop is replaced by a VEX form of the opcode, the
 * c.Ivex fields are filled in from c.Irex/c.Irm, CFvex is set in
 * c.Iflags, and checkSetVex3() is called to decide on the 3 byte form.
 * Params:
 *      c = code
 *      ty = type of operand
 */

@trusted
void checkSetVex(code *c, tym_t ty)
{
    //printf("checkSetVex() %d %x\n", tysize(ty), c.Iop);
    if (config.avx || tysize(ty) == 32)
    {
        // Register number taken from bits 3..5 of the modregrm byte,
        // extended to 4 bits by REX_R. It becomes vex.vvvv unless the
        // switch below decides the instruction has no such operand.
        uint vreg = (c.Irm >> 3) & 7;
        if (c.Irex & REX_R)
            vreg |= 8;

        // TODO: This is too simplistic, depending on the instruction, vex.vvvv
        // encodes NDS, NDD, DDS, or no operand (NOO). The code below assumes
        // NDS (non-destructive source), except for the incomplete list of 2
        // operand instructions (NOO) handled by the switch.
        switch (c.Iop)
        {
            case LODSS:
            case LODSD:
            case STOSS:
            case STOSD:
                // mod field == 3: keep vreg as computed above
                if ((c.Irm & 0xC0) == 0xC0)
                    break;
                // otherwise treat like the 2 operand instructions below
                goto case LODAPS;

            case LODAPS:
            case LODUPS:
            case LODAPD:
            case LODUPD:
            case LODDQA:
            case LODDQU:
            case LODD:
            case LODQ:
            case STOAPS:
            case STOUPS:
            case STOAPD:
            case STOUPD:
            case STODQA:
            case STODQU:
            case STOD:
            case STOQ:
            case COMISS:
            case COMISD:
            case UCOMISS:
            case UCOMISD:
            case MOVDDUP:
            case MOVSHDUP:
            case MOVSLDUP:
            case VBROADCASTSS:
            case PSHUFD:
            case PSHUFHW:
            case PSHUFLW:
            case VPBROADCASTB:
            case VPBROADCASTW:
            case VPBROADCASTD:
            case VPBROADCASTQ:
                vreg = 0;       // for 2 operand vex instructions
                break;

            case VBROADCASTSD:
            case VBROADCASTF128:
            case VBROADCASTI128:
                assert(tysize(ty) == 32); // AVX-256 only instructions
                vreg = 0;       // for 2 operand vex instructions
                break;

            case NOP:
                return;         // ignore, c is left unmodified

            default:
                break;
        }

        // Build the new opcode: 0xC4 in the top byte, the original low
        // opcode byte at the bottom, and mm/pp codes in between.
        opcode_t op = 0xC4000000 | (c.Iop & 0xFF);
        // The upper three bytes of the original opcode (prefix and escape
        // bytes) select the mm and pp values.
        switch (c.Iop & 0xFFFFFF00)
        {
            // Place mm in bits 16 and up, pp in bits 8 and up
            static uint MM_PP(uint mm, uint pp) { return (mm << 16) | (pp << 8); }
            case 0x00000F00: op |= MM_PP(1,0); break;
            case 0x00660F00: op |= MM_PP(1,1); break;
            case 0x00F30F00: op |= MM_PP(1,2); break;
            case 0x00F20F00: op |= MM_PP(1,3); break;
            case 0x660F3800: op |= MM_PP(2,1); break;
            case 0x660F3A00: op |= MM_PP(3,1); break;
            default:
                // Opcode form not handled by this routine
                printf("Iop = %x\n", c.Iop);
                assert(0);
        }
        c.Iop = op;
        c.Ivex.pfx = 0xC4;
        // r, x and b are stored as the inverse of the corresponding REX bits
        c.Ivex.r = !(c.Irex & REX_R);
        c.Ivex.x = !(c.Irex & REX_X);
        c.Ivex.b = !(c.Irex & REX_B);
        c.Ivex.w = (c.Irex & REX_W) != 0;
        c.Ivex.l = tysize(ty) == 32;    // l is set only for 32 byte operands

        // vvvv holds the complement of the register number
        c.Ivex.vvvv = cast(ushort)~vreg;

        c.Iflags |= CFvex;
        checkSetVex3(c);        // sets CFvex3 if the 3 byte prefix is needed
    }
}
1928 
/**************************************
 * Load complex operand into XMM registers or flags or both.
 *
 * When the requested result is exactly the XMM0/XMM1 pair and e is not
 * a constant, two loads are generated: one from offset 0 into XMM0 and
 * one from offset tysize(ty) into XMM1. Every other case is forwarded
 * to cload87().
 * Params:
 *      cdb = generated code appends to this
 *      e = operand to load
 *      pretregs = requested result registers; must include an XMM
 *                 register or mPSW
 */

@trusted
void cloadxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    //elem_print(e);
    assert(*pretregs & (XMMREGS | mPSW));

    const bool wantsPair = *pretregs == (mXMM0 | mXMM1);
    if (!wantsPair || e.Eoper == OPconst)
    {
        // See test/complex.d for cases winding up here
        cload87(cdb, e, pretregs);
        return;
    }

    tym_t tym = tybasic(e.Ety);
    // Type of one component: double for TYcdouble, float for everything else
    tym_t ty = (tym == TYcdouble) ? TYdouble : TYfloat;
    opcode_t loadOp = xmmload(tym, xmmIsAligned(e));
    code cs = void;

    // Component at offset 0 goes into XMM0
    regm_t lowMask = mXMM0;
    reg_t lowReg;
    allocreg(cdb, &lowMask, &lowReg, ty);
    loadea(cdb, e, &cs, loadOp, lowReg, 0, RMload, 0);  // MOVSS/MOVSD XMM0,data
    checkSetVex(cdb.last(), ty);

    // Component at offset tysize(ty) goes into XMM1.
    // NOTE(review): mXMM0 is passed as the last mask argument here,
    // presumably to account for XMM0 already being in use - confirm
    // against loadea().
    regm_t highMask = mXMM1;
    reg_t highReg;
    allocreg(cdb, &highMask, &highReg, ty);
    loadea(cdb, e, &cs, loadOp, highReg, tysize(ty), RMload, mXMM0); // MOVSS/MOVSD XMM1,data+offset
    checkSetVex(cdb.last(), ty);
}
1965 
/***********************************
 * Determine if we can load a constant into an XMM register
 * with instructions.
 *
 * This is the case only when every byte of the constant is 0 or
 * every byte is 0xFF. Constants of 32 bytes or more are additionally
 * rejected when config.avx is less than 2.
 * Params:
 *      e = constant
 * Returns:
 *      true if it can be done
 */
@trusted
bool loadxmmconst(elem *e)
{
    //printf("loadxmmconst() "); elem_print_const(e); printf("\n");
    const sz = tysize(e.Ety);
    assert(sz >= 1);

    if (sz >= 32 && config.avx < 2)
        return false;

    // View the constant's value as raw bytes
    ubyte* bytes = cast(ubyte*)&e.EV;

    // The first byte decides which pattern, if any, is possible
    const first = bytes[0];
    const bool zeroOrOnes = first == 0 || first == 0xFF;
    if (!zeroOrOnes)
        return false;

    // All remaining bytes must repeat the first one
    foreach (i; 1 .. sz)
        if (bytes[i] != first)
            return false;

    return true;
}