/**
 * xmm specific code generation
 *
 * Compiler implementation of the
 * $(LINK2 https://www.dlang.org, D programming language).
 *
 * Copyright:   Copyright (C) 2011-2023 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgxmm.d, backend/cgxmm.d)
 */

module dmd.backend.cgxmm;

import core.stdc.stdio;
import core.stdc.stdlib;
import core.stdc.string;

import dmd.backend.cc;
import dmd.backend.cdef;
import dmd.backend.code;
import dmd.backend.code_x86;
import dmd.backend.codebuilder;
import dmd.backend.mem;
import dmd.backend.el;
import dmd.backend.global;
import dmd.backend.oper;
import dmd.backend.ty;
import dmd.backend.xmm;


nothrow:
@safe:

/*******************************************
 * Is operator a store operator?
 * Params:
 *      op = opcode to test
 * Returns: true if op is one of the XMM store opcodes (STOxxxx)
 */

bool isXMMstore(opcode_t op)
{
    switch (op)
    {
        case STOSS: case STOAPS: case STOUPS:
        case STOSD: case STOAPD: case STOUPD:
        case STOD: case STOQ: case STODQA: case STODQU:
        case STOHPD: case STOHPS: case STOLPD: case STOLPS: return true;
        default: return false;
    }
}

/*******************************************
 * Move constant value into xmm register xreg.
 * Params:
 *      cdb = generated code appends to this
 *      xreg = XMM register to load
 *      ty = type of the constant; its size (4/8/16/32 bytes) selects the load strategy
 *      pev = pointer to const value
 *      flags = if set flags based on value
 */

@trusted
void movxmmconst(ref CodeBuilder cdb, reg_t xreg, tym_t ty, eve* pev, regm_t flags)
{
    //printf("movxmmconst() %s ty: %s value: %lld\n", regm_str(mask(xreg)), tym_str(ty), pev.Vllong);

    const sz = tysize(ty);
    assert(mask(xreg) & XMMREGS);
    if (sz == 16 || sz == 32)
    {
        // Vector constants: only all-zeros and all-ones are supported here,
        // since both can be materialized without a memory load.
        if (sz == 16 &&
            pev.Vllong2[0] == 0 && pev.Vllong2[1] == 0)
            cdb.gen2(PXOR,modregxrmx(3,xreg-XMM0,xreg-XMM0));       // PXOR xreg,xreg
        else if (sz == 32 &&
            pev.Vllong4[0] == 0 && pev.Vllong4[1] == 0 &&
            pev.Vllong4[2] == 0 && pev.Vllong4[3] == 0)
            cdb.gen2(PXOR,modregxrmx(3,xreg-XMM0,xreg-XMM0));       // PXOR xreg,xreg
        else if (sz == 16 &&
            pev.Vllong2[0] == ~0 && pev.Vllong2[1] == ~0)
            cdb.gen2(PCMPEQD,modregxrmx(3,xreg-XMM0,xreg-XMM0));    // PCMPEQD xreg,xreg
        else if (sz == 32 &&
            pev.Vllong4[0] == ~0 && pev.Vllong4[1] == ~0 &&
            pev.Vllong4[2] == ~0 && pev.Vllong4[3] == ~0)
            cdb.gen2(PCMPEQQ,modregxrmx(3,xreg-XMM0,xreg-XMM0));    // PCMPEQQ xreg,xreg
        else
            assert(0);
        tym_t tyx = sz == 16 ? TYllong2 : TYllong4;
        checkSetVex(cdb.last(), tyx);
        return;
    }

    /* Generate:
     *    MOV reg,value
     *    MOV xreg,reg
     */
    assert(sz == 4 || sz == 8);
    targ_size_t value = pev.Vint;
    if (sz == 8)
        value = cast(targ_size_t)pev.Vullong;

    if (value == 0)
    {
        // Zero floats/doubles are cheapest as a self-XOR
        if (ty == TYfloat || ty == TYifloat)
        {
            cdb.gen2(XORPS,modregxrmx(3,xreg-XMM0,xreg-XMM0));      // XORPS xreg,xreg
            return;
        }
        else if (ty == TYdouble || ty == TYidouble)
        {
            cdb.gen2(XORPD,modregxrmx(3,xreg-XMM0,xreg-XMM0));      // XORPD xreg,xreg
            return;
        }
    }


    if (I32 && sz == 8)
    {
        // 32-bit target: no 64-bit GP register, so build the value in the
        // scratch float memory slot 4 bytes at a time, then load it.
        reg_t r;
        regm_t rm = ALLREGS;
        allocreg(cdb,&rm,&r,TYint);         // allocate scratch register
        static union U { targ_size_t s; targ_long[2] l; }
        U u = void;
        u.l[1] = 0;
        u.s = value;
        targ_long *p = &u.l[0];
        movregconst(cdb,r,p[0],0);
        cdb.genfltreg(STO,r,0);              // MOV floatreg,r
        movregconst(cdb,r,p[1],0);
        cdb.genfltreg(STO,r,4);              // MOV floatreg+4,r

        const op = xmmload(TYdouble, true);
        cdb.genxmmreg(op,xreg,0,TYdouble);   // MOVSD XMMreg,floatreg
    }
    else
    {
        reg_t reg;
        regwithvalue(cdb,ALLREGS,value,reg,(sz == 8) ? 64 : 0);
        cdb.gen2(LODD,modregxrmx(3,xreg-XMM0,reg));     // MOVD xreg,reg
        if (sz == 8)
            code_orrex(cdb.last(), REX_W);
        checkSetVex(cdb.last(), TYulong);
    }
}

/***********************************************
 * Do simple orthogonal operators for XMM registers.
 */

@trusted
void orthxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("orthxmm(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;

    // float + ifloat is not actually addition: the operands live in
    // different components of a complex number, so no add/sub is emitted.
    if ((e.Eoper == OPadd || e.Eoper == OPmin) &&
        ((tyreal(e1.Ety) && tyimaginary(e2.Ety)) ||
         (tyreal(e2.Ety) && tyimaginary(e1.Ety))))
    {
        regm_t retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;

        regm_t rretregs;
        reg_t rreg;
        if (tyreal(e1.Ety))
        {
            const reg = findreg(retregs);
            rreg = findreg(retregs & ~mask(reg));
            retregs = mask(reg);
            rretregs = mask(rreg);
        }
        else
        {
            // Pick the second register, not the first
            rreg = findreg(retregs);
            rretregs = mask(rreg);
            const reg = findreg(retregs & ~rretregs);
            retregs = mask(reg);
        }
        assert(retregs && rretregs);

        codelem(cdb,e1,&retregs,false); // eval left leaf
        scodelem(cdb, e2, &rretregs, retregs, true);  // eval right leaf

        retregs |= rretregs;
        if (e.Eoper == OPmin)
        {
            // Subtracting the imaginary part means flipping its sign bit.
            regm_t nretregs = XMMREGS & ~retregs;
            reg_t sreg; // hold sign bit
            const uint sz = tysize(e1.Ety);
            allocreg(cdb,&nretregs,&sreg,e2.Ety);
            eve signbit;
            signbit.Vint = 0x80000000;
            if (sz == 8)
                signbit.Vllong = 0x8000_0000_0000_0000;
            movxmmconst(cdb,sreg, e1.Ety, &signbit, 0);
            getregs(cdb,nretregs);
            const opcode_t xop = (sz == 8) ? XORPD : XORPS;       // XORPD/S rreg,sreg
            cdb.gen2(xop,modregxrmx(3,rreg-XMM0,sreg-XMM0));
        }
        if (retregs != *pretregs)
            fixresult(cdb,e,retregs,pretregs);
        return;
    }

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;
    const constflag = OTrel(e.Eoper);
    codelem(cdb,e1,&retregs,constflag); // eval left leaf
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    scodelem(cdb, e2, &rretregs, retregs, true);  // eval right leaf

    const rreg = findreg(rretregs);
    const op = xmmoperator(e1.Ety, e.Eoper);

    /* We should take advantage of mem addressing modes for OP XMM,MEM
     * but we do not at the moment.
     */
    if (OTrel(e.Eoper) && !tyvector(tybasic(e.Ety)))
    {
        // Scalar compare (UCOMISS/UCOMISD): result is in the flags only.
        cdb.gen2(op,modregxrmx(3,rreg-XMM0,reg-XMM0));
        checkSetVex(cdb.last(), e1.Ety);
        return;
    }

    getregs(cdb,retregs);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    if (op == CMPPS || op == CMPPD)
    {
        // Vector compares need a predicate imm8 operand
        // https://www.felixcloutier.com/x86/cmpps
        ubyte imm8;
        switch (e.Eoper)
        {
            case OPeqeq: imm8 = 0; break;
            case OPlt: imm8 = 1; break;
            case OPle: imm8 = 2; break;
            case OPne: imm8 = 4; break;
            default:
                elem_print(e);
                assert(0);  // not doing the unordered compares
        }
        code* c = cdb.last();
        c.IFL2 = FLconst;
        c.IEV2.Vsize_t = imm8;
    }
    checkSetVex(cdb.last(), e1.Ety);
    if (retregs != *pretregs)
        fixresult(cdb,e,retregs,pretregs);
}


/************************
 * Generate code for an assignment using XMM registers.
 * Params:
 *      op = store opcode to use, CMP means generate one
 */
@trusted
void xmmeq(ref CodeBuilder cdb, elem *e, opcode_t op, elem *e1, elem *e2,regm_t *pretregs)
{
    tym_t tymll;        // NOTE(review): unused in this function body
    int i;              // NOTE(review): unused in this function body
    code cs;
    elem *e11;
    bool regvar;                  /* true means evaluate into register variable */
    targ_int postinc;

    //printf("xmmeq(e1 = %p, e2 = %p, *pretregs = %s)\n", e1, e2, regm_str(*pretregs));
    tym_t tyml = tybasic(e1.Ety);              /* type of lvalue             */
    regm_t retregs = *pretregs;

    if (!(retregs & XMMREGS))
        retregs = XMMREGS;              // pick any XMM reg

    bool aligned = xmmIsAligned(e1);
    // If default, select store opcode
    cs.Iop = (op == CMP) ? xmmstore(tyml, aligned) : op;
    regvar = false;
    regm_t varregm = 0;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        if (isregvar(e1, varregm, varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2) &&           // and we can compute directly into it
            varregm & XMMREGS
           )
        {   regvar = true;
            retregs = varregm;    // evaluate directly in target register
        }
        else
            varregm = 0;
    }
    if (*pretregs & mPSW && OTleaf(e1.Eoper))     // if evaluating e1 couldn't change flags
    {   // Be careful that this lines up with jmpopcode()
        retregs |= mPSW;
        *pretregs &= ~mPSW;
    }
    scodelem(cdb,e2,&retregs,0,true);    // get rvalue

    // Look for special case of (*p++ = ...), where p is a register variable
    if (e1.Eoper == OPind &&
        ((e11 = e1.EV.E1).Eoper == OPpostinc || e11.Eoper == OPpostdec) &&
        e11.EV.E1.Eoper == OPvar &&
        e11.EV.E1.EV.Vsym.Sfl == FLreg
       )
    {
        postinc = e11.EV.E2.EV.Vint;
        if (e11.Eoper == OPpostdec)
            postinc = -postinc;
        getlvalue(cdb,&cs,e11,RMstore | retregs);
        freenode(e11.EV.E2);
    }
    else
    {   postinc = 0;
        getlvalue(cdb,&cs,e1,RMstore | retregs); // get lvalue (cl == CNIL if regvar)
    }

    getregs_imm(cdb,regvar ? varregm : 0);

    const reg = findreg(retregs & XMMREGS);
    cs.Irm |= modregrm(0,(reg - XMM0) & 7,0);
    if ((reg - XMM0) & 8)
        cs.Irex |= REX_R;

    // Do not generate mov from register onto itself
    if (!(regvar && reg == XMM0 + ((cs.Irm & 7) | (cs.Irex & REX_B ? 8 : 0))))
    {
        cdb.gen(&cs);         // MOV EA+offset,reg
        checkSetVex(cdb.last(), tyml);
    }

    if (e1.Ecount ||                     // if lvalue is a CSE or
        regvar)                          // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);        // necessary if both lvalue and
                                         //  rvalue are CSEs (since a reg
                                         //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,retregs,pretregs);
    if (postinc)
    {
        const increg = findreg(idxregm(&cs));  // the register to increment
        if (*pretregs & mPSW)
        {   // Use LEA to avoid touching the flags
            uint rm = cs.Irm & 7;
            if (cs.Irex & REX_B)
                rm |= 8;
            cdb.genc1(LEA,buildModregrm(2,increg,rm),FLconst,postinc);
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else if (I64)
        {
            cdb.genc2(0x81,modregrmx(3,0,increg),postinc);  // ADD increg,postinc
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else
        {
            if (postinc == 1)
                cdb.gen1(0x40 + increg);        // INC increg
            else if (postinc == -cast(targ_int)1)
                cdb.gen1(0x48 + increg);        // DEC increg
            else
            {
                cdb.genc2(0x81,modregrm(3,0,increg),postinc);
            }
        }
    }
    freenode(e1);
}

/********************************
 * Generate code for conversion using SSE2 instructions.
 *
 *      OPs32_d
 *      OPs64_d (64-bit only)
 *      OPu32_d (64-bit only)
 *      OPd_f
 *      OPf_d
 *      OPd_s32
 *      OPd_s64 (64-bit only)
 *
 */

@trusted
void xmmcnvt(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmconvt: %p, %s\n", e, regm_str(*pretregs));
    opcode_t op = NoOpcode;
    regm_t regs;
    tym_t ty;
    ubyte rex = 0;
    bool zx = false; // zero extend uint

    /* There are no ops for integer <-> float/real conversions
     * but there are instructions for them. In order to use these
     * try to fuse chained conversions. Be careful not to loose
     * precision for real to long.
     */
    elem *e1 = e.EV.E1;
    switch (e.Eoper)
    {
    case OPd_f:
        if (e1.Eoper == OPs32_d)
        { }
        else if (I64 && e1.Eoper == OPs64_d)
            rex = REX_W;
        else if (I64 && e1.Eoper == OPu32_d)
        {   rex = REX_W;
            zx = true;
        }
        else
        {   regs = XMMREGS;
            op = CVTSD2SS;
            ty = TYfloat;
            break;
        }
        // CSE'd subexpressions can't be fused away
        if (e1.Ecount)
        {
            regs = XMMREGS;
            op = CVTSD2SS;
            ty = TYfloat;
            break;
        }
        // directly use si2ss
        regs = ALLREGS;
        e1 = e1.EV.E1;      // fused operation
        op = CVTSI2SS;
        ty = TYfloat;
        break;

    case OPs32_d:              goto Litod;
    case OPs64_d: rex = REX_W; goto Litod;
    case OPu32_d: rex = REX_W; zx = true; goto Litod;
    Litod:
        regs = ALLREGS;
        op = CVTSI2SD;
        ty = TYdouble;
        break;

    case OPd_s16:
    case OPd_s32: ty = TYint;  goto Ldtoi;
    case OPd_u32: ty = TYlong; if (I64) rex = REX_W; goto Ldtoi;
    case OPd_s64: ty = TYlong; rex = REX_W; goto Ldtoi;
    Ldtoi:
        regs = XMMREGS;
        switch (e1.Eoper)
        {
        case OPf_d:
            if (e1.Ecount)
            {
                op = CVTTSD2SI;
                break;
            }
            e1 = e1.EV.E1;      // fused operation
            op = CVTTSS2SI;
            break;
        case OPld_d:
            if (e.Eoper == OPd_s64)
            {
                cnvt87(cdb,e,pretregs); // precision
                return;
            }
            goto default;

        default:
            op = CVTTSD2SI;
            break;
        }
        break;

    case OPf_d:
        regs = XMMREGS;
        op = CVTSS2SD;
        ty = TYdouble;
        break;

    default:
        assert(0);
    }
    assert(op != NoOpcode);

    codelem(cdb,e1, &regs, false);
    reg_t reg = findreg(regs);
    if (isXMMreg(reg))
        reg -= XMM0;
    else if (zx)
    {   assert(I64);
        getregs(cdb,regs);
        genregs(cdb,0x8B,reg,reg);                 // MOV reg,reg to zero upper 32-bit
                                                   // Don't use x89 because that will get optimized away
        code_orflag(cdb.last(),CFvolatile);
    }

    regm_t retregs = *pretregs;
    if (tyxmmreg(ty)) // target is XMM
    {   if (!(*pretregs & XMMREGS))
            retregs = XMMREGS;
    }
    else // source is XMM
    {   assert(regs & XMMREGS);
        if (!(retregs & ALLREGS))
            retregs = ALLREGS;
    }

    reg_t rreg;
    allocreg(cdb,&retregs,&rreg,ty);
    if (isXMMreg(rreg))
        rreg -= XMM0;

    cdb.gen2(op, modregxrmx(3,rreg,reg));
    assert(I64 || !rex);
    if (rex)
        code_orrex(cdb.last(), rex);

    if (*pretregs != retregs)
        fixresult(cdb,e,retregs,pretregs);
}

/********************************
 * Generate code for op=
 */

@trusted
void xmmopass(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{   elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;
    tym_t ty1 = tybasic(e1.Ety);
    const sz1 = _tysize[ty1];
    regm_t rretregs = XMMREGS & ~*pretregs;
    if (!rretregs)
        rretregs = XMMREGS;

    codelem(cdb,e2,&rretregs,false); // eval right leaf
    reg_t rreg = findreg(rretregs);

    code cs;
    regm_t retregs;
    reg_t reg;
    bool regvar = false;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        regm_t varregm;
        if (isregvar(e1,varregm,varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2)            // and we can compute directly into it
           )
        {   regvar = true;
            retregs = varregm;
            reg = varreg;                     // evaluate directly in target register
            getregs(cdb,retregs);             // destroy these regs
        }
    }

    if (!regvar)
    {
        // Not a register variable: load lvalue into an XMM register first
        getlvalue(cdb,&cs,e1,rretregs);       // get EA
        retregs = *pretregs & XMMREGS & ~rretregs;
        if (!retregs)
            retregs = XMMREGS & ~rretregs;
        allocreg(cdb,&retregs,&reg,ty1);
        cs.Iop = xmmload(ty1, true);          // MOVSD xmm,xmm_m64
        code_newreg(&cs,reg - XMM0);
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    const op = xmmoperator(e1.Ety, e.Eoper);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    checkSetVex(cdb.last(), e1.Ety);

    if (!regvar)
    {
        // ... then store the result back to the lvalue
        cs.Iop = xmmstore(ty1,true);          // reverse operand order of MOVS[SD]
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    if (e1.Ecount ||                          // if lvalue is a CSE or
        regvar)                               // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);             // necessary if both lvalue and
                                              //  rvalue are CSEs (since a reg
                                              //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper)); // if lvalue is a CSE
    }

    fixresult(cdb,e,retregs,pretregs);
    freenode(e1);
}

/********************************
 * Generate code for post increment and post decrement.
 */

@trusted
void xmmpost(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;
    tym_t ty1 = tybasic(e1.Ety);

    regm_t retregs;
    reg_t reg;
    bool regvar = false;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        regm_t varregm;
        if (isregvar(e1,varregm,varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2)            // and we can compute directly into it
           )
        {
            regvar = true;
            retregs = varregm;
            reg = varreg;                     // evaluate directly in target register
            getregs(cdb,retregs);             // destroy these regs
        }
    }

    code cs;
    if (!regvar)
    {
        getlvalue(cdb,&cs,e1,0);              // get EA
        retregs = XMMREGS & ~*pretregs;
        if (!retregs)
            retregs = XMMREGS;
        allocreg(cdb,&retregs,&reg,ty1);
        cs.Iop = xmmload(ty1, true);          // MOVSD xmm,xmm_m64
        code_newreg(&cs,reg - XMM0);
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    // Result register: snapshot of the pre-increment value, which is
    // what a postfix expression yields.
    regm_t resultregs = XMMREGS & *pretregs & ~retregs;
    if (!resultregs)
        resultregs = XMMREGS & ~retregs;
    reg_t resultreg;
    allocreg(cdb,&resultregs, &resultreg, ty1);

    cdb.gen2(xmmload(ty1,true),modregxrmx(3,resultreg-XMM0,reg-XMM0));   // MOVSS/D resultreg,reg
    checkSetVex(cdb.last(), ty1);

    regm_t rretregs = XMMREGS & ~(*pretregs | retregs | resultregs);
    if (!rretregs)
        rretregs = XMMREGS & ~(retregs | resultregs);
    codelem(cdb,e2,&rretregs,false); // eval right leaf
    const rreg = findreg(rretregs);

    const op = xmmoperator(e1.Ety, e.Eoper);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));  // ADD reg,rreg
    checkSetVex(cdb.last(), e1.Ety);

    if (!regvar)
    {
        cs.Iop = xmmstore(ty1,true);          // reverse operand order of MOVS[SD]
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    if (e1.Ecount ||                          // if lvalue is a CSE or
        regvar)                               // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);             // necessary if both lvalue and
                                              //  rvalue are CSEs (since a reg
                                              //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper)); // if lvalue is a CSE
    }

    fixresult(cdb,e,resultregs,pretregs);
    freenode(e1);
}

/******************
 * Negate operator
 */
@trusted
void xmmneg(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmneg()\n");
    //elem_print(e);
    assert(*pretregs);
    tym_t tyml = tybasic(e.EV.E1.Ety);
    int sz = _tysize[tyml];

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    /* Floating negation == flipping the sign bit. Generate:
     *    MOV reg,e1
     *    MOV rreg,signbit
     *    XOR reg,rreg
     */
    codelem(cdb,e.EV.E1,&retregs,false);
    getregs(cdb,retregs);
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    reg_t rreg;
    allocreg(cdb,&rretregs,&rreg,tyml);

    eve signbit;
    signbit.Vint = 0x80000000;
    if (sz == 8)
        signbit.Vllong = 0x8000_0000_0000_0000;

    movxmmconst(cdb,rreg, tyml, &signbit, 0);

    getregs(cdb,retregs);
    const op = (sz == 8) ? XORPD : XORPS;       // XORPD/S reg,rreg
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    fixresult(cdb,e,retregs,pretregs);
}

/******************
 * Absolute value operator OPabs
 */

@trusted
void xmmabs(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmabs()\n");
    //elem_print(e);
    assert(*pretregs);
    tym_t tyml = tybasic(e.EV.E1.Ety);
    int sz = _tysize[tyml];

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    /* Floating abs == clearing the sign bit. Generate:
     *    MOV reg,e1
     *    MOV rreg,mask
     *    AND reg,rreg
     */
    codelem(cdb,e.EV.E1,&retregs,false);
    getregs(cdb,retregs);
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    reg_t rreg;
    allocreg(cdb,&rretregs,&rreg,tyml);

    eve mask;
    mask.Vint = 0x7FFF_FFFF;
    if (sz == 8)
        mask.Vllong = 0x7FFF_FFFF_FFFF_FFFFL;
    movxmmconst(cdb, rreg, tyml, &mask, 0);

    getregs(cdb,retregs);
    const op = (sz == 8) ? ANDPD : ANDPS;       // ANDPD/S reg,rreg
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    fixresult(cdb,e,retregs,pretregs);
}

/*****************************
 * Get correct load operator based on type.
 * It is important to use the right one even if the number of bits moved is the same,
 * as there are performance consequences for using the wrong one.
 * Params:
 *      tym = type of data to load
 *      aligned = for vectors, true if aligned to 16 bytes
 */

@trusted
opcode_t xmmload(tym_t tym, bool aligned = true)
{
    opcode_t op;
    if (tysize(tym) == 32)      // 32-byte (AVX) loads use the unaligned forms
        aligned = false;
    switch (tybasic(tym))
    {
        case TYuint:
        case TYint:
        case TYlong:
        case TYulong:   op = LODD;  break;       // MOVD
        case TYfloat:
        case TYcfloat:
        case TYifloat:  op = LODSS; break;       // MOVSS
        case TYllong:
        case TYullong:  op = LODQ;  break;       // MOVQ
        case TYdouble:
        case TYcdouble:
        case TYidouble: op = LODSD; break;       // MOVSD

        case TYfloat8:
        case TYfloat4:  op = aligned ? LODAPS : LODUPS; break;   // MOVAPS / MOVUPS
        case TYdouble4:
        case TYdouble2: op = aligned ? LODAPD : LODUPD; break;   // MOVAPD / MOVUPD
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:
        case TYschar32:
        case TYuchar32:
        case TYshort16:
        case TYushort16:
        case TYlong8:
        case TYulong8:
        case TYllong4:
        case TYullong4: op = aligned ? LODDQA : LODDQU; break;   // MOVDQA / MOVDQU

        default:
            printf("tym = x%x\n", tym);
            assert(0);
    }
    return op;
}

/*****************************
 * Get correct store operator based on type.
 */

@trusted
opcode_t xmmstore(tym_t tym, bool aligned = true)
{
    opcode_t op;
    switch (tybasic(tym))
    {
        case TYuint:
        case TYint:
        case TYlong:
        case TYulong:   op = STOD;  break;       // MOVD
        case TYfloat:
        case TYifloat:  op = STOSS; break;       // MOVSS
        case TYllong:
        case TYullong:  op = STOQ;  break;       // MOVQ
        case TYdouble:
        case TYidouble:
        case TYcdouble:
        case TYcfloat:  op = STOSD; break;       // MOVSD

        case TYfloat8:
        case TYfloat4:  op = aligned ? STOAPS : STOUPS; break;   // MOVAPS / MOVUPS
        case TYdouble4:
        case TYdouble2: op = aligned ? STOAPD : STOUPD; break;   // MOVAPD / MOVUPD
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:
        case TYschar32:
        case TYuchar32:
        case TYshort16:
        case TYushort16:
        case TYlong8:
        case TYulong8:
        case TYllong4:
        case TYullong4: op = aligned ? STODQA : STODQU; break;   // MOVDQA / MOVDQU

        default:
            printf("tym = 0x%x\n", tym);
            assert(0);
    }
    return op;
}


/************************************
 * Get correct XMM operator based on type and operator.
 */

@trusted
private opcode_t xmmoperator(tym_t tym, OPER oper)
{
    tym = tybasic(tym);
    opcode_t op;
    switch (oper)
    {
        case OPadd:
        case OPaddass:
        case OPpostinc:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = ADDSS;  break;
                case TYdouble:
                case TYidouble: op = ADDSD;  break;

                // SIMD vector types
                case TYfloat8:
                case TYfloat4:  op = ADDPS;  break;
                case TYdouble4:
                case TYdouble2: op = ADDPD;  break;
                case TYschar32:
                case TYuchar32:
                case TYschar16:
                case TYuchar16: op = PADDB;  break;
                case TYshort16:
                case TYushort16:
                case TYshort8:
                case TYushort8: op = PADDW;  break;
                case TYlong8:
                case TYulong8:
                case TYlong4:
                case TYulong4:  op = PADDD;  break;
                case TYllong4:
                case TYullong4:
                case TYllong2:
                case TYullong2: op = PADDQ;  break;

                default:
                    printf("tym = x%x\n", tym);
                    assert(0);
            }
            break;

        case OPmin:
        case OPminass:
        case OPpostdec:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = SUBSS;  break;
                case TYdouble:
                case TYidouble: op = SUBSD;  break;

                // SIMD vector types
                case TYfloat8:
                case TYfloat4:  op = SUBPS;  break;
                case TYdouble4:
                case TYdouble2: op = SUBPD;  break;
                case TYschar32:
                case TYuchar32:
                case TYschar16:
                case TYuchar16: op = PSUBB;  break;
                case TYshort16:
                case TYushort16:
                case TYshort8:
                case TYushort8: op = PSUBW;  break;
                case TYlong8:
                case TYulong8:
                case TYlong4:
                case TYulong4:  op = PSUBD;  break;
                case TYllong4:
                case TYullong4:
                case TYllong2:
                case TYullong2: op = PSUBQ;  break;

                default: assert(0);
            }
            break;

        case OPmul:
        case OPmulass:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = MULSS;  break;
                case TYdouble:
                case TYidouble: op = MULSD;  break;

                // SIMD vector types
                case TYfloat8:
                case TYfloat4:  op = MULPS;  break;
                case TYdouble4:
                case TYdouble2: op = MULPD;  break;
                case TYshort16:
                case TYushort16:
                case TYshort8:
                case TYushort8: op = PMULLW; break;
                case TYlong8:
                case TYulong8:
                case TYlong4:
                case TYulong4:  op = PMULLD; break;

                default: assert(0);
            }
            break;

        case OPdiv:
        case OPdivass:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = DIVSS;  break;
                case TYdouble:
                case TYidouble: op = DIVSD;  break;

                // SIMD vector types
                case TYfloat8:
                case TYfloat4:  op = DIVPS;  break;
                case TYdouble4:
                case TYdouble2: op = DIVPD;  break;

                default: assert(0);
            }
            break;

        case OPor:
        case OPorass:
            switch (tym)
            {
                // SIMD vector types
                case TYschar16:
                case TYuchar16:
                case TYshort8:
                case TYushort8:
                case TYlong4:
                case TYulong4:
                case TYllong2:
                case TYullong2:
                case TYschar32:
                case TYuchar32:
                case TYshort16:
                case TYushort16:
                case TYlong8:
                case TYulong8:
                case TYllong4:
                case TYullong4: op = POR; break;

                default: assert(0);
            }
            break;

        case OPand:
        case OPandass:
            switch (tym)
            {
                // SIMD vector types
                case TYschar16:
                case TYuchar16:
                case TYshort8:
                case TYushort8:
                case TYlong4:
                case TYulong4:
                case TYllong2:
                case TYullong2:
                case TYschar32:
                case TYuchar32:
                case TYshort16:
                case TYushort16:
                case TYlong8:
                case TYulong8:
                case TYllong4:
                case TYullong4: op = PAND; break;

                default: assert(0);
            }
            break;

        case OPxor:
        case OPxorass:
            switch (tym)
            {
                // SIMD vector types
                case TYschar16:
                case TYuchar16:
                case TYshort8:
                case TYushort8:
                case TYlong4:
                case TYulong4:
                case TYllong2:
                case TYullong2:
                case TYschar32:
                case TYuchar32:
                case TYshort16:
                case TYushort16:
                case TYlong8:
                case TYulong8:
                case TYllong4:
                case TYullong4: op = PXOR; break;

                default: assert(0);
            }
            break;

        case OPgt:
            switch (tym)
            {
                case TYschar32:
                case TYuchar32:
                case TYschar16:
                case TYuchar16: op = PCMPGTB; break;
                case TYshort16:
                case TYushort16:
                case TYshort8:
                case TYushort8: op = PCMPGTW; break;
                case TYlong8:
                case TYulong8:
                case TYlong4:
                case TYulong4:  op = PCMPGTD; break;
                case TYllong4:
                case TYullong4:
                case TYllong2:
                case TYullong2: op = PCMPGTQ; break;
                default:
                    goto Lfloatcmp;
            }
            break;

        case OPeqeq:
            switch (tym)
            {
                case TYschar32:
                case TYuchar32:
                case TYschar16:
                case TYuchar16: op = PCMPEQB; break;
                case TYshort16:
                case TYushort16:
                case TYshort8:
                case TYushort8: op = PCMPEQW; break;
                case TYlong8:
                case TYulong8:
                case TYlong4:
                case TYulong4:  op = PCMPEQD; break;
                case TYllong4:
                case TYullong4:
                case TYllong2:
                case TYullong2: op = PCMPEQQ; break;
                default:
                    goto Lfloatcmp;
            }
            break;

        case OPlt:
        case OPle:
        case OPge:
        case OPne:
        case OPunord:        /* !<>=         */
        case OPlg:           /* <>           */
        case OPleg:          /* <>=          */
        case OPule:          /* !>           */
        case OPul:           /* !>=          */
        case OPuge:          /* !<           */
        case OPug:           /* !<=          */
        case OPue:           /* !<>          */
        case OPngt:
        case OPnge:
        case OPnlt:
        case OPnle:
        case OPord:
        case OPnlg:
        case OPnleg:
        case OPnule:
        case OPnul:
        case OPnuge:
        case OPnug:
        case OPnue:
        Lfloatcmp:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = UCOMISS; break;
                case TYdouble:
                case TYidouble: op = UCOMISD; break;

                case TYfloat4:
                case TYfloat8:
                case TYfloat16: op = CMPPS;   break;

                case TYdouble2:
                case TYdouble4:
                case TYdouble8: op = CMPPD;   break;
                default: assert(0);
            }
            break;

        default:
            assert(0);
    }
    return op;
}

/***********************************************
 * Generate code for the SIMD __simd() intrinsic: evaluate the
 * parameter list and emit the requested XMM instruction.
 */
@trusted
void cdvector(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    /* e should look like one of:
     *    vector
     *      |
     *    param
     *    /   \
     *  param op2
     *  /   \
     * op   op1
     */

    if (!config.fpxmmregs)
    {   printf("SIMD operations not supported on this platform\n");
        exit(1);
    }

    const n = el_nparams(e.EV.E1);
    assert(n < size_t.max / (2 * (elem *).sizeof));   // conservative overflow check
    elem **params = cast(elem **)malloc(n * (elem *).sizeof);
    assert(params);
    elem **tmp = params;
    el_paramArray(&tmp, e.EV.E1);

    static if (0)
    {
        printf("cdvector()\n");
        for (int i = 0; i < n; i++)
        {
            printf("[%d]: ", i);
            elem_print(params[i]);
        }
    }

    if (*pretregs == 0)
    {   /* Evaluate for side effects only
         */
        foreach (i; 0 .. n)
        {
            codelem(cdb,params[i], pretregs, false);
            *pretregs = 0;      // in case they got set
        }
        return;
    }

    assert(n >= 2 && n <= 4);

    elem *eop = params[0];
    elem *op1 = params[1];
    elem *op2 = null;
    tym_t ty2 = 0;
    if (n >= 3)
    {   op2 = params[2];
        ty2 = tybasic(op2.Ety);
    }

    auto op = cast(opcode_t)el_tolong(eop);
    debug assert(!isXMMstore(op));
    tym_t ty1 = tybasic(op1.Ety);

    regm_t retregs;
    if (n == 3 && ty2 == TYuchar && op2.Eoper == OPconst)
    {   // Handle: op xmm,imm8
        // Immediate shifts use the shift-by-imm8 encodings, which put the
        // "sub-opcode" in the reg field (r) of the ModRM byte.

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        codelem(cdb,op1,&retregs,false); // eval left leaf
        const reg = findreg(retregs);
        int r;
        switch (op)
        {
            case PSLLD:  r = 6; op = 0x660F72;  break;
            case PSLLQ:  r = 6; op = 0x660F73;  break;
            case PSLLW:  r = 6; op = 0x660F71;  break;
            case PSRAD:  r = 4; op = 0x660F72;  break;
            case PSRAW:  r = 4; op = 0x660F71;  break;
            case PSRLD:  r = 2; op = 0x660F72;  break;
            case PSRLQ:  r = 2; op = 0x660F73;  break;
            case PSRLW:  r = 2; op = 0x660F71;  break;
            case PSRLDQ: r = 3; op = 0x660F73;  break;
            case PSLLDQ: r = 7; op = 0x660F73;  break;

            default:
                printf("op = x%x\n", op);
                assert(0);
        }
        getregs(cdb,retregs);
        cdb.genc2(op,modregrmx(3,r,reg-XMM0), cast(uint)el_tolong(op2));
    }
    else if (n == 2)
    {   /* Handle: op xmm,mem
         * where xmm is written only, not read
         */
        code cs;

        if ((op1.Eoper == OPind && !op1.Ecount) || op1.Eoper == OPvar)
        {
            getlvalue(cdb,&cs, op1, RMload);     // get addressing mode
        }
        else
        {
            regm_t rretregs = XMMREGS;
            codelem(cdb,op1, &rretregs, false);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        reg_t reg;
        allocreg(cdb,&retregs, &reg, e.Ety);
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else if (n == 3 || n == 4)
    {   /* Handle:
         *      op xmm,mem        // n = 3
         *      op xmm,mem,imm8   // n = 4
         * Both xmm and mem are operands, evaluate xmm first.
         */

        code cs;

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        codelem(cdb,op1,&retregs,false); // eval left leaf
        const reg = findreg(retregs);

        /* MOVHLPS and LODLPS have the same opcode. They are distinguished
         * by MOVHLPS has a second operand of size 128, LODLPS has 64
         * https://www.felixcloutier.com/x86/movlps
         * https://www.felixcloutier.com/x86/movhlps
         * MOVHLPS must be an XMM operand, LODLPS must be a memory operand
         */
        const isMOVHLPS = op == MOVHLPS && tysize(ty2) == 16;

        if (((op2.Eoper == OPind && !op2.Ecount) || op2.Eoper == OPvar) && !isMOVHLPS)
        {
            getlvalue(cdb,&cs, op2, RMload | retregs);     // get addressing mode
        }
        else
        {
            // load op2 into XMM register
            regm_t rretregs = XMMREGS & ~retregs;
            scodelem(cdb, op2, &rretregs, retregs, true);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        getregs(cdb,retregs);

        // These instructions require an imm8; if the user omitted it,
        // diagnose on the final pass and default the immediate to 0.
        switch (op)
        {
            case CMPPD:   case CMPSS:   case CMPSD:   case CMPPS:
            case PSHUFD:  case PSHUFHW: case PSHUFLW:
            case BLENDPD: case BLENDPS: case DPPD:    case DPPS:
            case MPSADBW: case PBLENDW:
            case ROUNDPD: case ROUNDPS: case ROUNDSD: case ROUNDSS:
            case SHUFPD:  case SHUFPS:
                if (n == 3)
                {
                    if (pass == BackendPass.final_)
                        error(e.Esrcpos.Sfilename, e.Esrcpos.Slinnum, e.Esrcpos.Scharnum, "missing 4th parameter to `__simd()`");
                    cs.IFL2 = FLconst;
                    cs.IEV2.Vsize_t = 0;
                }
                break;
            default:
                break;
        }

        if (n == 4)
        {
            elem *imm8 = params[3];
            cs.IFL2 = FLconst;
            if (imm8.Eoper != OPconst)
            {
                error(imm8.Esrcpos.Sfilename, imm8.Esrcpos.Slinnum, imm8.Esrcpos.Scharnum, "last parameter to `__simd()` must be a constant");
                cs.IEV2.Vsize_t = 0;
            }
            else
                cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8);
        }
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else
        assert(0);
    fixresult(cdb,e,retregs,pretregs);
    free(params);
    freenode(e);
}

/*************** 1393 * Generate code for vector "store" operations. 1394 * The tree e must look like: 1395 * (op1 OPvecsto (op OPparam op2)) 1396 * where op is the store instruction STOxxxx. 1397 */ 1398 @trusted 1399 void cdvecsto(ref CodeBuilder cdb, elem *e, regm_t *pretregs) 1400 { 1401 //printf("cdvecsto()\n"); 1402 //elem_print(e); 1403 elem *op1 = e.EV.E1; 1404 elem *op2 = e.EV.E2.EV.E2; 1405 elem *eop = e.EV.E2.EV.E1; 1406 const op = cast(opcode_t)el_tolong(eop); 1407 debug assert(isXMMstore(op)); 1408 xmmeq(cdb, e, op, op1, op2, pretregs); 1409 } 1410 1411 /*************** 1412 * Generate code for OPvecfill (broadcast). 1413 * OPvecfill takes the single value in e1 and 1414 * fills the vector type with it. 1415 */ 1416 @trusted 1417 void cdvecfill(ref CodeBuilder cdb, elem *e, regm_t *pretregs) 1418 { 1419 //printf("cdvecfill(e = %p, *pretregs = %s)\n",e,regm_str(*pretregs)); 1420 1421 regm_t retregs = *pretregs & XMMREGS; 1422 if (!retregs) 1423 retregs = XMMREGS; 1424 1425 code *c; 1426 code cs; 1427 1428 elem *e1 = e.EV.E1; 1429 static if (0) 1430 { 1431 if ((e1.Eoper == OPind && !e1.Ecount) || e1.Eoper == OPvar) 1432 { 1433 cr = getlvalue(&cs, e1, RMload | retregs); // get addressing mode 1434 } 1435 else 1436 { 1437 regm_t rretregs = XMMREGS & ~retregs; 1438 cr = scodelem(op2, &rretregs, retregs, true); 1439 const rreg = findreg(rretregs) - XMM0; 1440 cs.Irm = modregrm(3,0,rreg & 7); 1441 cs.Iflags = 0; 1442 cs.Irex = 0; 1443 if (rreg & 8) 1444 cs.Irex |= REX_B; 1445 } 1446 } 1447 1448 /* e.Ety only gives us the size of the result vector, not its type. 1449 * We must combine it with the vector element type, e1.Ety, to 1450 * form the resulting vector type, ty. 1451 * The reason is someone may have painted the result of the OPvecfill to 1452 * a different vector type. 
1453 */ 1454 const sz = tysize(e.Ety); 1455 const ty1 = tybasic(e1.Ety); 1456 assert(sz == 16 || sz == 32); 1457 const bool x16 = (sz == 16); 1458 1459 tym_t ty; 1460 switch (ty1) 1461 { 1462 case TYfloat: ty = x16 ? TYfloat4 : TYfloat8; break; 1463 case TYdouble: ty = x16 ? TYdouble2 : TYdouble4; break; 1464 case TYschar: ty = x16 ? TYschar16 : TYschar32; break; 1465 case TYuchar: ty = x16 ? TYuchar16 : TYuchar32; break; 1466 case TYshort: ty = x16 ? TYshort8 : TYshort16; break; 1467 case TYushort: ty = x16 ? TYushort8 : TYushort16; break; 1468 case TYint: 1469 case TYlong: ty = x16 ? TYlong4 : TYlong8; break; 1470 case TYuint: 1471 case TYulong: ty = x16 ? TYulong4 : TYulong8; break; 1472 case TYllong: ty = x16 ? TYllong2 : TYllong4; break; 1473 case TYullong: ty = x16 ? TYullong2 : TYullong4; break; 1474 1475 default: 1476 assert(0); 1477 } 1478 1479 switch (ty) 1480 { 1481 case TYfloat4: 1482 case TYfloat8: 1483 if (config.avx && e1.Eoper == OPind && !e1.Ecount) 1484 { 1485 // VBROADCASTSS X/YMM,MEM 1486 getlvalue(cdb,&cs, e1, 0); // get addressing mode 1487 assert((cs.Irm & 0xC0) != 0xC0); // AVX1 doesn't have register source operands 1488 reg_t reg; 1489 allocreg(cdb,&retregs,®,ty); 1490 cs.Iop = VBROADCASTSS; 1491 cs.Irex &= ~REX_W; 1492 code_newreg(&cs,reg - XMM0); 1493 checkSetVex(&cs,ty); 1494 cdb.gen(&cs); 1495 } 1496 else 1497 { 1498 codelem(cdb,e1,&retregs,false); // eval left leaf 1499 const reg = cast(reg_t)(findreg(retregs) - XMM0); 1500 getregs(cdb,retregs); 1501 if (config.avx >= 2) 1502 { 1503 // VBROADCASTSS X/YMM,XMM 1504 cdb.gen2(VBROADCASTSS, modregxrmx(3,reg,reg)); 1505 checkSetVex(cdb.last(), ty); 1506 } 1507 else 1508 { 1509 // (V)SHUFPS XMM,XMM,0 1510 cdb.genc2(SHUFPS, modregxrmx(3,reg,reg), 0); 1511 checkSetVex(cdb.last(), ty); 1512 if (tysize(ty) == 32) 1513 { 1514 // VINSERTF128 YMM,YMM,XMM,1 1515 cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1); 1516 checkSetVex(cdb.last(), ty); 1517 } 1518 } 1519 } 1520 break; 1521 1522 case 
TYdouble2: 1523 case TYdouble4: 1524 if (config.avx && tysize(ty) == 32 && e1.Eoper == OPind && !e1.Ecount) 1525 { 1526 // VBROADCASTSD YMM,MEM 1527 getlvalue(cdb,&cs, e1, 0); // get addressing mode 1528 assert((cs.Irm & 0xC0) != 0xC0); // AVX1 doesn't have register source operands 1529 reg_t reg; 1530 allocreg(cdb,&retregs,®,ty); 1531 cs.Iop = VBROADCASTSD; 1532 cs.Irex &= ~REX_W; 1533 code_newreg(&cs,reg - XMM0); 1534 checkSetVex(&cs,ty); 1535 cdb.gen(&cs); 1536 } 1537 else 1538 { 1539 codelem(cdb,e1,&retregs,false); // eval left leaf 1540 const reg = cast(reg_t)(findreg(retregs) - XMM0); 1541 getregs(cdb,retregs); 1542 if (config.avx >= 2 && tysize(ty) == 32) 1543 { 1544 // VBROADCASTSD YMM,XMM 1545 cdb.gen2(VBROADCASTSD, modregxrmx(3,reg,reg)); 1546 checkSetVex(cdb.last(), ty); 1547 } 1548 else 1549 { 1550 // (V)UNPCKLPD XMM,XMM 1551 cdb.gen2(UNPCKLPD, modregxrmx(3,reg,reg)); 1552 checkSetVex(cdb.last(), TYdouble2); // AVX-128 1553 if (tysize(ty) == 32) 1554 { 1555 // VINSERTF128 YMM,YMM,XMM,1 1556 cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1); 1557 checkSetVex(cdb.last(), ty); 1558 } 1559 } 1560 } 1561 break; 1562 1563 case TYschar16: 1564 case TYuchar16: 1565 case TYschar32: 1566 case TYuchar32: 1567 if (config.avx >= 2 && e1.Eoper == OPind && !e1.Ecount) 1568 { 1569 // VPBROADCASTB X/YMM,MEM 1570 getlvalue(cdb,&cs, e1, 0); // get addressing mode 1571 assert((cs.Irm & 0xC0) != 0xC0); // AVX1 doesn't have register source operands 1572 reg_t reg; 1573 allocreg(cdb,&retregs,®,ty); 1574 cs.Iop = VPBROADCASTB; 1575 cs.Irex &= ~REX_W; 1576 code_newreg(&cs,reg - XMM0); 1577 checkSetVex(&cs,ty); 1578 cdb.gen(&cs); 1579 } 1580 else 1581 { 1582 regm_t regm = ALLREGS; 1583 codelem(cdb,e1,®m,true); // eval left leaf 1584 const r = findreg(regm); 1585 1586 reg_t reg; 1587 allocreg(cdb,&retregs,®, e.Ety); 1588 reg -= XMM0; 1589 // (V)MOVD reg,r 1590 cdb.gen2(LODD,modregxrmx(3,reg,r)); 1591 checkSetVex(cdb.last(), TYushort8); 1592 if (config.avx >= 2) 1593 { 1594 // 
VPBROADCASTB X/YMM,XMM 1595 cdb.gen2(VPBROADCASTB, modregxrmx(3,reg,reg)); 1596 checkSetVex(cdb.last(), ty); 1597 } 1598 else 1599 { 1600 if (config.avx) 1601 { 1602 reg_t zeroreg; 1603 regm = XMMREGS & ~retregs; 1604 // VPXOR XMM1,XMM1,XMM1 1605 allocreg(cdb,®m,&zeroreg, ty); 1606 zeroreg -= XMM0; 1607 cdb.gen2(PXOR, modregxrmx(3,zeroreg,zeroreg)); 1608 checkSetVex(cdb.last(), TYuchar16); // AVX-128 1609 // VPSHUFB XMM,XMM,XMM1 1610 cdb.gen2(PSHUFB, modregxrmx(3,reg,zeroreg)); 1611 checkSetVex(cdb.last(), TYuchar16); // AVX-128 1612 } 1613 else 1614 { 1615 // PUNPCKLBW XMM,XMM 1616 cdb.gen2(PUNPCKLBW, modregxrmx(3,reg,reg)); 1617 // PUNPCKLWD XMM,XMM 1618 cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg)); 1619 // PSHUFD XMM,XMM,0 1620 cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0); 1621 } 1622 if (tysize(ty) == 32) 1623 { 1624 // VINSERTF128 YMM,YMM,XMM,1 1625 cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1); 1626 checkSetVex(cdb.last(), ty); 1627 } 1628 } 1629 } 1630 break; 1631 1632 case TYshort8: 1633 case TYushort8: 1634 case TYshort16: 1635 case TYushort16: 1636 if (config.avx >= 2 && e1.Eoper == OPind && !e1.Ecount) 1637 { 1638 // VPBROADCASTW X/YMM,MEM 1639 getlvalue(cdb,&cs, e1, 0); // get addressing mode 1640 assert((cs.Irm & 0xC0) != 0xC0); // AVX1 doesn't have register source operands 1641 reg_t reg; 1642 allocreg(cdb,&retregs,®,ty); 1643 cs.Iop = VPBROADCASTW; 1644 cs.Irex &= ~REX_W; 1645 cs.Iflags &= ~CFopsize; 1646 code_newreg(&cs,reg - XMM0); 1647 checkSetVex(&cs,ty); 1648 cdb.gen(&cs); 1649 } 1650 else 1651 { 1652 regm_t regm = ALLREGS; 1653 codelem(cdb,e1,®m,true); // eval left leaf 1654 reg_t r = findreg(regm); 1655 1656 reg_t reg; 1657 allocreg(cdb,&retregs,®, e.Ety); 1658 reg -= XMM0; 1659 // (V)MOVD reg,r 1660 cdb.gen2(LODD,modregxrmx(3,reg,r)); 1661 checkSetVex(cdb.last(), TYushort8); 1662 if (config.avx >= 2) 1663 { 1664 // VPBROADCASTW X/YMM,XMM 1665 cdb.gen2(VPBROADCASTW, modregxrmx(3,reg,reg)); 1666 checkSetVex(cdb.last(), ty); 1667 } 1668 else 
1669 { 1670 // (V)PUNPCKLWD XMM,XMM 1671 cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg)); 1672 checkSetVex(cdb.last(), TYushort8); // AVX-128 1673 // (V)PSHUFD XMM,XMM,0 1674 cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0); 1675 checkSetVex(cdb.last(), TYushort8); // AVX-128 1676 if (tysize(ty) == 32) 1677 { 1678 // VINSERTF128 YMM,YMM,XMM,1 1679 cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1); 1680 checkSetVex(cdb.last(), ty); 1681 } 1682 } 1683 } 1684 break; 1685 1686 case TYlong8: 1687 case TYulong8: 1688 case TYlong4: 1689 case TYulong4: 1690 if (config.avx && e1.Eoper == OPind && !e1.Ecount) 1691 { 1692 // VPBROADCASTD/VBROADCASTSS X/YMM,MEM 1693 getlvalue(cdb,&cs, e1, 0); // get addressing mode 1694 assert((cs.Irm & 0xC0) != 0xC0); // AVX1 doesn't have register source operands 1695 reg_t reg; 1696 allocreg(cdb,&retregs,®,ty); 1697 cs.Iop = config.avx >= 2 ? VPBROADCASTD : VBROADCASTSS; 1698 cs.Irex &= ~REX_W; 1699 code_newreg(&cs,reg - XMM0); 1700 checkSetVex(&cs,ty); 1701 cdb.gen(&cs); 1702 } 1703 else 1704 { 1705 codelem(cdb,e1,&retregs,true); // eval left leaf 1706 const reg = cast(reg_t)(findreg(retregs) - XMM0); 1707 getregs(cdb,retregs); 1708 if (config.avx >= 2) 1709 { 1710 // VPBROADCASTD X/YMM,XMM 1711 cdb.gen2(VPBROADCASTD, modregxrmx(3,reg,reg)); 1712 checkSetVex(cdb.last(), ty); 1713 } 1714 else 1715 { 1716 // (V)PSHUFD XMM,XMM,0 1717 cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0); 1718 checkSetVex(cdb.last(), TYulong4); // AVX-128 1719 if (tysize(ty) == 32) 1720 { 1721 // VINSERTF128 YMM,YMM,XMM,1 1722 cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1); 1723 checkSetVex(cdb.last(), ty); 1724 } 1725 } 1726 } 1727 break; 1728 1729 case TYllong2: 1730 case TYullong2: 1731 case TYllong4: 1732 case TYullong4: 1733 if (e1.Eoper == OPind && !e1.Ecount) 1734 { 1735 // VPBROADCASTQ/VBROADCASTSD/(V)PUNPCKLQDQ X/YMM,MEM 1736 getlvalue(cdb,&cs, e1, 0); // get addressing mode 1737 assert((cs.Irm & 0xC0) != 0xC0); // AVX1 doesn't have register source operands 1738 reg_t 
reg; 1739 allocreg(cdb,&retregs,®,ty); 1740 cs.Iop = config.avx >= 2 ? VPBROADCASTQ : tysize(ty) == 32 ? VBROADCASTSD : PUNPCKLQDQ; 1741 cs.Irex &= ~REX_W; 1742 code_newreg(&cs,reg - XMM0); 1743 checkSetVex(&cs,ty); 1744 cdb.gen(&cs); 1745 } 1746 else 1747 { 1748 codelem(cdb,e1,&retregs,true); // eval left leaf 1749 const reg = cast(reg_t)(findreg(retregs) - XMM0); 1750 getregs(cdb,retregs); 1751 if (config.avx >= 2) 1752 { 1753 // VPBROADCASTQ X/YMM,XMM 1754 cdb.gen2(VPBROADCASTQ, modregxrmx(3,reg,reg)); 1755 checkSetVex(cdb.last(), ty); 1756 } 1757 else 1758 { 1759 // (V)PUNPCKLQDQ XMM,XMM 1760 cdb.genc2(PUNPCKLQDQ, modregxrmx(3,reg,reg), 0); 1761 checkSetVex(cdb.last(), TYullong2); // AVX-128 1762 if (tysize(ty) == 32) 1763 { 1764 // VINSERTF128 YMM,YMM,XMM,1 1765 cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1); 1766 checkSetVex(cdb.last(), ty); 1767 } 1768 } 1769 } 1770 break; 1771 1772 default: 1773 assert(0); 1774 } 1775 1776 fixresult(cdb,e,retregs,pretregs); 1777 } 1778 1779 /******************************************* 1780 * Determine if lvalue e is a vector aligned on a 16/32 byte boundary. 1781 * Assume it to be aligned unless can prove it is not. 1782 * Params: 1783 * e = lvalue 1784 * Returns: 1785 * false if definitely not aligned 1786 */ 1787 1788 @trusted 1789 bool xmmIsAligned(elem *e) 1790 { 1791 if (tyvector(e.Ety) && e.Eoper == OPvar) 1792 { 1793 Symbol *s = e.EV.Vsym; 1794 const alignsz = tyalignsize(e.Ety); 1795 if (Symbol_Salignsize(*s) < alignsz || 1796 e.EV.Voffset & (alignsz - 1) || 1797 alignsz > STACKALIGN 1798 ) 1799 return false; // definitely not aligned 1800 } 1801 return true; // assume aligned 1802 } 1803 1804 /************************************** 1805 * VEX prefixes can be 2 or 3 bytes. 1806 * If it must be 3 bytes, set the CFvex3 flag. 1807 */ 1808 1809 void checkSetVex3(code *c) 1810 { 1811 // See Intel Vol. 
2A 2.3.5.6 1812 if (c.Ivex.w || !c.Ivex.x || !c.Ivex.b || c.Ivex.mmmm > 0x1 || 1813 !I64 && (c.Ivex.r || !(c.Ivex.vvvv & 8)) 1814 ) 1815 { 1816 c.Iflags |= CFvex3; 1817 } 1818 } 1819 1820 /************************************* 1821 * Determine if operation should be rewritten as a VEX 1822 * operation; and do so. 1823 * Params: 1824 * c = code 1825 * ty = type of operand 1826 */ 1827 1828 @trusted 1829 void checkSetVex(code *c, tym_t ty) 1830 { 1831 //printf("checkSetVex() %d %x\n", tysize(ty), c.Iop); 1832 if (config.avx || tysize(ty) == 32) 1833 { 1834 uint vreg = (c.Irm >> 3) & 7; 1835 if (c.Irex & REX_R) 1836 vreg |= 8; 1837 1838 // TODO: This is too simplistic, depending on the instruction, vex.vvvv 1839 // encodes NDS, NDD, DDS, or no operand (NOO). The code below assumes 1840 // NDS (non-destructive source), except for the incomplete list of 2 1841 // operand instructions (NOO) handled by the switch. 1842 switch (c.Iop) 1843 { 1844 case LODSS: 1845 case LODSD: 1846 case STOSS: 1847 case STOSD: 1848 if ((c.Irm & 0xC0) == 0xC0) 1849 break; 1850 goto case LODAPS; 1851 1852 case LODAPS: 1853 case LODUPS: 1854 case LODAPD: 1855 case LODUPD: 1856 case LODDQA: 1857 case LODDQU: 1858 case LODD: 1859 case LODQ: 1860 case STOAPS: 1861 case STOUPS: 1862 case STOAPD: 1863 case STOUPD: 1864 case STODQA: 1865 case STODQU: 1866 case STOD: 1867 case STOQ: 1868 case COMISS: 1869 case COMISD: 1870 case UCOMISS: 1871 case UCOMISD: 1872 case MOVDDUP: 1873 case MOVSHDUP: 1874 case MOVSLDUP: 1875 case VBROADCASTSS: 1876 case PSHUFD: 1877 case PSHUFHW: 1878 case PSHUFLW: 1879 case VPBROADCASTB: 1880 case VPBROADCASTW: 1881 case VPBROADCASTD: 1882 case VPBROADCASTQ: 1883 vreg = 0; // for 2 operand vex instructions 1884 break; 1885 1886 case VBROADCASTSD: 1887 case VBROADCASTF128: 1888 case VBROADCASTI128: 1889 assert(tysize(ty) == 32); // AVX-256 only instructions 1890 vreg = 0; // for 2 operand vex instructions 1891 break; 1892 1893 case NOP: 1894 return; // ignore 1895 1896 
default: 1897 break; 1898 } 1899 1900 opcode_t op = 0xC4000000 | (c.Iop & 0xFF); 1901 switch (c.Iop & 0xFFFFFF00) 1902 { 1903 static uint MM_PP(uint mm, uint pp) { return (mm << 16) | (pp << 8); } 1904 case 0x00000F00: op |= MM_PP(1,0); break; 1905 case 0x00660F00: op |= MM_PP(1,1); break; 1906 case 0x00F30F00: op |= MM_PP(1,2); break; 1907 case 0x00F20F00: op |= MM_PP(1,3); break; 1908 case 0x660F3800: op |= MM_PP(2,1); break; 1909 case 0x660F3A00: op |= MM_PP(3,1); break; 1910 default: 1911 printf("Iop = %x\n", c.Iop); 1912 assert(0); 1913 } 1914 c.Iop = op; 1915 c.Ivex.pfx = 0xC4; 1916 c.Ivex.r = !(c.Irex & REX_R); 1917 c.Ivex.x = !(c.Irex & REX_X); 1918 c.Ivex.b = !(c.Irex & REX_B); 1919 c.Ivex.w = (c.Irex & REX_W) != 0; 1920 c.Ivex.l = tysize(ty) == 32; 1921 1922 c.Ivex.vvvv = cast(ushort)~vreg; 1923 1924 c.Iflags |= CFvex; 1925 checkSetVex3(c); 1926 } 1927 } 1928 1929 /************************************** 1930 * Load complex operand into XMM registers or flags or both. 1931 */ 1932 1933 @trusted 1934 void cloadxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs) 1935 { 1936 //printf("e = %p, *pretregs = %s)\n", e, regm_str(*pretregs)); 1937 //elem_print(e); 1938 assert(*pretregs & (XMMREGS | mPSW)); 1939 if (*pretregs == (mXMM0 | mXMM1) && 1940 e.Eoper != OPconst) 1941 { 1942 code cs = void; 1943 tym_t tym = tybasic(e.Ety); 1944 tym_t ty = tym == TYcdouble ? 
TYdouble : TYfloat; 1945 opcode_t opmv = xmmload(tym, xmmIsAligned(e)); 1946 1947 regm_t retregs0 = mXMM0; 1948 reg_t reg0; 1949 allocreg(cdb, &retregs0, ®0, ty); 1950 loadea(cdb, e, &cs, opmv, reg0, 0, RMload, 0); // MOVSS/MOVSD XMM0,data 1951 checkSetVex(cdb.last(), ty); 1952 1953 regm_t retregs1 = mXMM1; 1954 reg_t reg1; 1955 allocreg(cdb, &retregs1, ®1, ty); 1956 loadea(cdb, e, &cs, opmv, reg1, tysize(ty), RMload, mXMM0); // MOVSS/MOVSD XMM1,data+offset 1957 checkSetVex(cdb.last(), ty); 1958 1959 return; 1960 } 1961 1962 // See test/complex.d for cases winding up here 1963 cload87(cdb, e, pretregs); 1964 } 1965 1966 /*********************************** 1967 * Determine if we can load a constant into an XMM register 1968 * with instructions. 1969 * Params: 1970 * e = constant 1971 * Returns: 1972 * true if it can be done 1973 */ 1974 @trusted 1975 bool loadxmmconst(elem *e) 1976 { 1977 //printf("loadxmmconst() "); elem_print_const(e); printf("\n"); 1978 const sz = tysize(e.Ety); 1979 ubyte* p = cast(ubyte*)&e.EV; 1980 assert(sz >= 1); 1981 1982 if (config.avx < 2 && sz >= 32) 1983 return false; 1984 1985 // true only if all ones or all zeros 1986 const b = p[0]; 1987 if (b != 0 && b != 0xFF) 1988 return false; 1989 foreach (i; 1 .. sz) 1990 { 1991 if (p[i] != b) 1992 return false; 1993 } 1994 return true; 1995 }