/**
 * Instruction scheduler
 *
 * Compiler implementation of the
 * $(LINK2 https://www.dlang.org, D programming language).
 *
 * Copyright:   Copyright (C) 1995-1998 by Symantec
 *              Copyright (C) 2000-2023 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgsched.c, backend/cgsched.d)
 */

module dmd.backend.cgsched;

// The body of this module is only compiled for the SCPP and MARS builds.
version (SCPP)
    version = COMPILE;
version (MARS)
    version = COMPILE;

version (COMPILE)
{

import core.stdc.stdio;
import core.stdc.stdlib;
import core.stdc.string;

import dmd.backend.cc;
import dmd.backend.cdef;
import dmd.backend.code;
import dmd.backend.code_x86;
import dmd.backend.dlist;
import dmd.backend.global;
import dmd.backend.mem;
import dmd.backend.ty;
import dmd.backend.barray;

extern (C++):

nothrow:
@safe:

// Prototypes only; no bodies are given in this module.
int REGSIZE();
code *gen1(code *c, uint op);
code *gen2(code *c, uint op, uint rm);

/// Returns a value with only bit number `m` set (1 << m).
private uint mask(uint m) { return 1 << m; }

// is32bitaddr works correctly only when x is 0 or 1.  This is
// true today for the current definition of I32, but if the definition
// of I32 changes, this function will need to change as well
//
// Note: even for linux targets, CFaddrsize can be set by the inline
// assembler.
/// True in 64-bit mode (I64); otherwise true exactly when `x` and the
/// CFaddrsize bit of `Iflags` differ.
private bool is32bitaddr(bool x, uint Iflags)
{
    const bool addrSizeFlag = (Iflags & CFaddrsize) != 0;
    return I64 || (x != addrSizeFlag);
}

/// True when config.target_cpu is at least TARGET_PentiumPro, i.e. the
/// Pentium Pro flavour of the scheduler is to be used.
@trusted
private bool PRO()
{
    return config.target_cpu >= TARGET_PentiumPro;
}

/// Kind of floating point operation recorded in Cinfo.fp_op (0 means none).
private enum FP : ubyte
{
    fstp = 1,       /// FSTP mem
    fld  = 2,       /// FLD mem
    fop  = 3,       /// Fop ST0,mem or Fop ST0
}

/// Bit flags stored in Cinfo.flags.
private enum CIFL : ubyte
{
    arraybounds = 1,     /// this instruction is a jmp to array bounds
    ea          = 2,     /// this instruction has a memory-referencing
                         /// modregrm EA byte
    nostage     = 4,     /// don't stage these instructions
    push        = 8,     /// it's a push we can swap around
}

/// Per-instruction record filled in by getinfo().
struct Cinfo
{
    code *c;            // the instruction
    ubyte pair;         // pairing information
    ubyte sz;           // operand size
    ubyte isz;          // instruction size

    // For floating point scheduling
    ubyte fxch_pre;
    ubyte fxch_post;
    FP fp_op;           /// FPxxxx

    ubyte flags;         /// CIFLxxx

    uint r;             // read mask
    uint w;             // write mask
    uint a;             // registers used in addressing mode
    ubyte reg;          // reg field of modregrm byte
    ubyte uops;         // Pentium Pro micro-ops
    uint sibmodrm;      // (sib << 8) + mod__rm byte
    uint spadjust;      // if !=0, then amount ESP changes as a result of this
                        // instruction being executed
    int fpuadjust;      // if !=0, then amount FPU stack changes as a result
                        // of this instruction being executed

    /// Dump every field of this record to stdout, for debugging.
    @trusted
    nothrow void print()
    {
        Cinfo* self = &this;

        if (self is null)
        {
            printf("Cinfo 0\n");
            return;
        }

        printf("Cinfo %p: c %p, pair %x, sz %d, isz %d, flags - ",
                self, c, pair, sz, isz);

        // Spell out each known flag bit, then complain about any other bit
        const ubyte fl = flags;
        if (fl & CIFL.arraybounds)
            printf("arraybounds,");
        if (fl & CIFL.ea)
            printf("ea,");
        if (fl & CIFL.nostage)
            printf("nostage,");
        if (fl & CIFL.push)
            printf("push,");
        if (fl & ~(CIFL.arraybounds|CIFL.nostage|CIFL.push|CIFL.ea))
            printf("bad flag,");

        printf("\n\tr %x w %x a %x reg %x uops %x sibmodrm %x spadjust %d\n",
                cast(int)r, cast(int)w, cast(int)a, reg, uops, sibmodrm, cast(int)spadjust);

        if (!fp_op)
            return;

        // fp_op is 1-based, the name table is 0-based
        __gshared const(char*)[3] fpops = ["fstp","fld","fop"];

        printf("\tfp_op %s, fxch_pre %x, fxch_post %x\n",
                fpops[fp_op-1], fxch_pre, fxch_post);
    }

}


/*****************************************
 * Run the Pentium-oriented passes over one code list.
 * Does nothing unless config.target_scheduler is at least TARGET_80486.
 * Params:
 *      pc      = code list, updated in place
 *      scratch = scratch registers we can use
 */

@trusted
private void cgsched_pentium(code **pc,regm_t scratch)
{
    //printf("scratch = x%02x\n",scratch);
    if (config.target_scheduler < TARGET_80486)
        return;

    if (!I64)
        *pc = peephole(*pc, 0);

    if (!I32)                           // forget about 16 bit code
        return;

    const cpu = config.target_cpu;
    if (cpu == TARGET_Pentium || cpu == TARGET_PentiumMMX)
        *pc = simpleops(*pc, scratch);
    *pc = schedule(*pc, 0);
}

/************************************
 * Entry point: schedule the code of block `b`.
 * Only acts when CFG4speed is set, the target cpu is at least
 * TARGET_Pentium and the block is not a BCasm block.
 */
@trusted
public void cgsched_block(block* b)
{
    if (!(config.flags4 & CFG4speed))
        return;
    if (config.target_cpu < TARGET_Pentium)
        return;
    if (b.BC == BCasm)
        return;

    // Scratch registers: allregs minus everything the block's register
    // context records, minus mfuncreg.
    regm_t scratch = allregs;
    scratch &= ~(b.Bregcon.used | b.Bregcon.params | mfuncreg |
                 b.Bregcon.immed.mval | b.Bregcon.cse.mval);

    cgsched_pentium(&b.Bcode, scratch);
    //printf("after schedule:\n"); WRcodlst(b.Bcode);
}

/// Pairing classification bits, as stored in pentcycl[] and returned by pair_class().
enum
{
    NP    = 0,       /// not pairable
    PU    = 1,       /// pairable in U only, never executed in V
    PV    = 2,       /// pairable in V only
    UV    = (PU|PV), /// pairable in both U and V
    PE    = 4,       /// register contention exception
    PF    = 8,       /// flags contention exception
    FX    = 0x10,    /// pairable with FXCH instruction
}

/// Default pairing class for each one-byte opcode; pair_class() refines it.
extern (D) private immutable ubyte[256] pentcycl =
[
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 0
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 8
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 10
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 18
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 20
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 28
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 30
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 38

        UV,UV,UV,UV,    UV,UV,UV,UV,    // 40
        UV,UV,UV,UV,    UV,UV,UV,UV,    // 48
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 50  PUSH reg
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 58  POP reg
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 60
        PE|UV,NP,PE|UV,NP,      NP,NP,NP,NP,    // 68
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 70   Jcc rel8
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 78   Jcc rel8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // 80
        UV,UV,UV,UV,    NP,UV,NP,NP,    // 88
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 90
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 98
        UV,UV,UV,UV,    NP,NP,NP,NP,    // A0
        UV,UV,NP,NP,    NP,NP,NP,NP,    // A8
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B0
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // C0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // C8
        PU,PU,NP,NP,    NP,NP,NP,NP,    // D0
        FX,NP,FX,FX,    NP,NP,FX,NP,    // D8   all floating point
        NP,NP,NP,NP,    NP,NP,NP,NP,    // E0
        PE|PV,PV,NP,PV, NP,NP,NP,NP,    // E8
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F8
];

/********************************************
 * For each opcode, determine read [0] and written [1] masks.
 */

// Extra mask bits OR'd into the read/write masks alongside the register
// masks (mAX, mCX, ...) used in the tables below.
enum
{
    EA      = 0x100000,         /// presumably the operand selected by the modregrm EA - NOTE(review): confirm
    R       = 0x200000,         /// register (reg of modregrm field)
    N       = 0x400000,         /// other things modified, not swappable
    B       = 0x800000,         /// it's a byte operation
    C       = 0x1000000,        /// floating point flags
    mMEM    = 0x2000000,        /// memory
    S       = 0x4000000,        /// floating point stack
    F       = 0x8000000,        /// flags
}

// One [read, write] pair per one-byte opcode; getinfo() fetches
// oprw[op][0] and oprw[op][1] using the low opcode byte as index.
extern (D) private immutable uint[2][256] oprw =
[
      // 00
      [ EA|R|B, F|EA|B ],       // ADD
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // PUSH ES
      [ N,      N      ],       // POP  ES

      // 08
      [ EA|R|B, F|EA|B ],       // OR
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // PUSH CS
      [ N,      N      ],       // 2 byte escape

      // 10
      [ F|EA|R|B,F|EA|B ],      // ADC
      [ F|EA|R, F|EA    ],
      [ F|EA|R|B,F|R|B  ],
      [ F|EA|R, F|R     ],
      [ F|mAX,  F|mAX   ],
      [ F|mAX,  F|mAX   ],
      [ N,      N       ],      // PUSH SS
      [ N,      N       ],      // POP  SS

      // 18
      [ F|EA|R|B,F|EA|B ],      // SBB
      [ F|EA|R, F|EA    ],
      [ F|EA|R|B,F|R|B  ],
      [ F|EA|R, F|R     ],
      [ F|mAX,  F|mAX   ],
      [ F|mAX,  F|mAX   ],
      [ N,      N       ],      // PUSH DS
      [ N,      N       ],      // POP  DS

      // 20
      [ EA|R|B, F|EA|B ],       // AND
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG ES
      [ F|mAX,  F|mAX  ],       // DAA

      // 28
      [ EA|R|B, F|EA|B ],       // SUB
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG CS
      [ F|mAX,  F|mAX  ],       // DAS

      // 30
      [ EA|R|B, F|EA|B ],       // XOR
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG SS
      [ F|mAX,  F|mAX  ],       // AAA

      // 38
      [ EA|R|B, F ],            // CMP
      [ EA|R,   F ],
      [ EA|R|B, F ],
      [ EA|R,   F ],
      [ mAX,    F ],            // CMP AL,imm8
      [ mAX,    F ],            // CMP EAX,imm16/32
      [ N,      N ],            // SEG DS
      [ N,      N ],            // AAS

      // 40
      [ mAX,    F|mAX ],        // INC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 48
      [ mAX,    F|mAX ],        // DEC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 50
      [ mAX|mSP,        mSP|mMEM ],     // PUSH EAX
      [ mCX|mSP,        mSP|mMEM ],
      [ mDX|mSP,        mSP|mMEM ],
      [ mBX|mSP,        mSP|mMEM ],
      [ mSP|mSP,        mSP|mMEM ],
      [ mBP|mSP,        mSP|mMEM ],
      [ mSI|mSP,        mSP|mMEM ],
      [ mDI|mSP,        mSP|mMEM ],

      // 58
      [ mSP|mMEM,       mAX|mSP ],      // POP EAX
      [ mSP|mMEM,       mCX|mSP ],
      [ mSP|mMEM,       mDX|mSP ],
      [ mSP|mMEM,       mBX|mSP ],
      [ mSP|mMEM,       mSP|mSP ],
      [ mSP|mMEM,       mBP|mSP ],
      [ mSP|mMEM,       mSI|mSP ],
      [ mSP|mMEM,       mDI|mSP ],

      // 60
      [ N,      N ],            // PUSHA
      [ N,      N ],            // POPA
      [ N,      N ],            // BOUND Gv,Ma
      [ N,      N ],            // ARPL  Ew,Rw
      [ N,      N ],            // SEG FS
      [ N,      N ],            // SEG GS
      [ N,      N ],            // operand size prefix
      [ N,      N ],            // address size prefix

      // 68
      [ mSP,    mSP|mMEM ],     // PUSH immed16/32
      [ EA,     F|R      ],     // IMUL Gv,Ev,lv
      [ mSP,    mSP|mMEM ],     // PUSH immed8
      [ EA,     F|R      ],     // IMUL Gv,Ev,lb
      [ N,      N        ],     // INSB Yb,DX
      [ N,      N        ],     // INSW/D Yv,DX
      [ N,      N        ],     // OUTSB DX,Xb
      [ N,      N        ],     // OUTSW/D DX,Xv

      // 70
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 78
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 80
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ EA|R,   F    ],         // TEST EA,r8
      [ EA|R,   F    ],         // TEST EA,r16/32
      [ EA|R,   EA|R ],         // XCHG EA,r8
      [ EA|R,   EA|R ],         // XCHG EA,r16/32

      // 88
      [ R|B,            EA|B   ],       // MOV EA8,r8
      [ R,              EA     ],       // MOV EA,r16/32
      [ EA|B,           R|B    ],       // MOV r8,EA8
      [ EA,             R      ],       // MOV r16/32,EA
      [ N,              N      ],       // MOV EA,segreg
      [ EA,             R      ],       // LEA r16/32,EA
      [ N,              N      ],       // MOV segreg,EA
      [ mSP|mMEM,       EA|mSP ],       // POP mem16/32

      // 90
      [ 0,              0       ],      // NOP
      [ mAX|mCX,        mAX|mCX ],
      [ mAX|mDX,        mAX|mDX ],
      [ mAX|mBX,        mAX|mBX ],
      [ mAX|mSP,        mAX|mSP ],
      [ mAX|mBP,        mAX|mBP ],
      [ mAX|mSI,        mAX|mSI ],
      [ mAX|mDI,        mAX|mDI ],

      // 98
      [ mAX,            mAX      ],     // CBW
      [ mAX,            mDX      ],     // CWD
      [ N,              N|F      ],     // CALL far ptr
      [ N,              N        ],     // WAIT
      [ F|mSP,          mSP|mMEM ],     // PUSHF
      [ mSP|mMEM,       F|mSP    ],     // POPF
      [ mAX,            F        ],     // SAHF
      [ F,              mAX      ],     // LAHF

      // A0
      [ mMEM,           mAX  ],         // MOV AL,moffs8
      [ mMEM,           mAX  ],         // MOV EAX,moffs32
      [ mAX,            mMEM ],         // MOV moffs8,AL
      [ mAX,            mMEM ],         // MOV moffs32,EAX
      [ N,              N    ],         // MOVSB
      [ N,              N    ],         // MOVSW/D
      [ N,              N    ],         // CMPSB
      [ N,              N    ],         // CMPSW/D

      // A8
      [ mAX,    F ],                    // TEST AL,imm8
      [ mAX,    F ],                    // TEST AX,imm16
      [ N,      N ],                    // STOSB
      [ N,      N ],                    // STOSW/D
      [ N,      N ],                    // LODSB
      [ N,      N ],                    // LODSW/D
      [ N,      N ],                    // SCASB
      [ N,      N ],                    // SCASW/D

      // B0
      [ 0,      mAX ],                  // MOV AL,imm8
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mAX ],
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],

      // B8
      [ 0,      mAX ],                  // MOV AX,imm16
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mSP ],
      [ 0,      mBP ],
      [ 0,      mSI ],
      [ 0,      mDI ],

      // C0
      [ EA,     F|EA ],         // Shift Eb,Ib
      [ EA,     F|EA ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ 0,      EA|B ],         // MOV EA8,imm8
      [ 0,      EA   ],         // MOV EA,imm16

      // C8
      [ N,      N ],            // ENTER
      [ N,      N ],            // LEAVE
      [ N,      N ],            // RETF lw
      [ N,      N ],            // RETF
      [ N,      N ],            // INT 3
      [ N,      N ],            // INT lb
      [ N,      N ],            // INTO
      [ N,      N ],            // IRET

      // D0
      [ EA,             F|EA  ],        // Shift EA,1
      [ EA,             F|EA  ],
      [ EA|mCX,         F|EA  ],        // Shift EA,CL
      [ EA|mCX,         F|EA  ],
      [ mAX,            F|mAX ],        // AAM
      [ mAX,            F|mAX ],        // AAD
      [ N,              N     ],        // reserved
      [ mAX|mBX|mMEM,   mAX   ],        // XLAT

      // D8
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],

      // E0
      [ F|mCX|N,mCX|N ],        // LOOPNE jb
      [ F|mCX|N,mCX|N ],        // LOOPE  jb
      [ mCX|N,  mCX|N ],        // LOOP   jb
      [ mCX|N,  N     ],        // JCXZ   jb
      [ N,      N     ],        // IN AL,lb
      [ N,      N     ],        // IN EAX,lb
      [ N,      N     ],        // OUT lb,AL
      [ N,      N     ],        // OUT lb,EAX

      // E8
      [ N,      N|F   ],        // CALL jv
      [ N,      N     ],        // JMP Jv
      [ N,      N     ],        // JMP Ab
      [ N,      N     ],        // JMP jb
      [ N|mDX,  N|mAX ],        // IN AL,DX
      [ N|mDX,  N|mAX ],        // IN AX,DX
      [ N|mAX|mDX,N   ],        // OUT DX,AL
      [ N|mAX|mDX,N   ],        // OUT DX,AX

      // F0
      [ N,      N ],            // LOCK
      [ N,      N ],            // reserved
      [ N,      N ],            // REPNE
      [ N,      N ],            // REP,REPE
      [ N,      N ],            // HLT
      [ F,      F ],            // CMC
      [ N,      N ],
      [ N,      N ],

      // F8
      [ 0,      F    ],         // CLC
      [ 0,      F    ],         // STC
      [ N,      N    ],         // CLI
      [ N,      N    ],         // STI
      [ N,      N    ],         // CLD
      [ N,      N    ],         // STD
      [ EA,     F|EA ],         // INC/DEC
      [ N,      N    ],
];

/****************************************
 * Same thing, but for groups.
574 */ 575 576 extern (D) private immutable uint[2][8][8] grprw = 577 [ 578 [ 579 // Grp 1 580 [ EA, F|EA ], // ADD 581 [ EA, F|EA ], // OR 582 [ F|EA, F|EA ], // ADC 583 [ F|EA, F|EA ], // SBB 584 [ EA, F|EA ], // AND 585 [ EA, F|EA ], // SUB 586 [ EA, F|EA ], // XOR 587 [ EA, F ], // CMP 588 ], 589 [ 590 // Grp 3 591 [ EA, F ], // TEST EA,imm 592 [ N, N ], // reserved 593 [ EA, EA ], // NOT 594 [ EA, F|EA ], // NEG 595 [ mAX|EA, F|mAX|mDX ], // MUL 596 [ mAX|EA, F|mAX|mDX ], // IMUL 597 [ mAX|mDX|EA, F|mAX|mDX ], // DIV 598 599 // Could generate an exception we want to catch 600 //mAX|mDX|EA|N, F|mAX|mDX|N, // IDIV 601 602 [ mAX|mDX|EA, F|mAX|mDX ], // IDIV 603 ], 604 [ 605 // Grp 5 606 [ EA, F|EA ], // INC Ev 607 [ EA, F|EA ], // DEC Ev 608 [ N|EA, N ], // CALL Ev 609 [ N|EA, N ], // CALL eP 610 [ N|EA, N ], // JMP Ev 611 [ N|EA, N ], // JMP Ep 612 [ mSP|EA, mSP|mMEM ], // PUSH Ev 613 [ N, N ], // reserved 614 ], 615 [ 616 // Grp 3, byte version 617 [ EA|B, F ], // TEST EA,imm 618 [ N, N ], // reserved 619 [ EA|B, EA|B ], // NOT 620 [ EA|B, F|EA|B ], // NEG 621 [ mAX|EA, F|mAX ], // MUL 622 [ mAX|EA, F|mAX ], // IMUL 623 [ mAX|EA, F|mAX ], // DIV 624 625 // Could generate an exception we want to catch 626 //mAX|EA|N, F|mAX|N, // IDIV 627 628 [ mAX|EA, F|mAX ], // IDIV 629 ] 630 ]; 631 632 /******************************************** 633 * For floating point opcodes 0xD8..0xDF, with Irm < 0xC0. 
634 * [][][0] = read 635 * [1] = write 636 */ 637 638 extern (D) private immutable uint[2][8][8] grpf1 = 639 [ 640 [ 641 // 0xD8 642 [ EA|S, S|C ], // FADD float 643 [ EA|S, S|C ], // FMUL float 644 [ EA|S, C ], // FCOM float 645 [ EA|S, S|C ], // FCOMP float 646 [ EA|S, S|C ], // FSUB float 647 [ EA|S, S|C ], // FSUBR float 648 [ EA|S, S|C ], // FDIV float 649 [ EA|S, S|C ], // FDIVR float 650 ], 651 [ 652 // 0xD9 653 [ EA, S|C ], // FLD float 654 [ N, N ], // 655 [ S, EA|C ], // FST float 656 [ S, EA|S|C ], // FSTP float 657 [ N, N ], // FLDENV 658 [ N, N ], // FLDCW 659 [ N, N ], // FSTENV 660 [ N, N ], // FSTCW 661 ], 662 [ 663 // 0xDA 664 [ EA|S, S|C ], // FIADD long 665 [ EA|S, S|C ], // FIMUL long 666 [ EA|S, C ], // FICOM long 667 [ EA|S, S|C ], // FICOMP long 668 [ EA|S, S|C ], // FISUB long 669 [ EA|S, S|C ], // FISUBR long 670 [ EA|S, S|C ], // FIDIV long 671 [ EA|S, S|C ], // FIDIVR long 672 ], 673 [ 674 // 0xDB 675 [ EA, S|C ], // FILD long 676 [ S, EA|S|C ], // FISTTP int 677 [ S, EA|C ], // FIST long 678 [ S, EA|S|C ], // FISTP long 679 [ N, N ], // 680 [ EA, S|C ], // FLD real80 681 [ N, N ], // 682 [ S, EA|S|C ], // FSTP real80 683 ], 684 [ 685 // 0xDC 686 [ EA|S, S|C ], // FADD double 687 [ EA|S, S|C ], // FMUL double 688 [ EA|S, C ], // FCOM double 689 [ EA|S, S|C ], // FCOMP double 690 [ EA|S, S|C ], // FSUB double 691 [ EA|S, S|C ], // FSUBR double 692 [ EA|S, S|C ], // FDIV double 693 [ EA|S, S|C ], // FDIVR double 694 ], 695 [ 696 // 0xDD 697 [ EA, S|C ], // FLD double 698 [ S, EA|S|C ], // FISTTP long 699 [ S, EA|C ], // FST double 700 [ S, EA|S|C ], // FSTP double 701 [ N, N ], // FRSTOR 702 [ N, N ], // 703 [ N, N ], // FSAVE 704 [ C, EA ], // FSTSW 705 ], 706 [ 707 // 0xDE 708 [ EA|S, S|C ], // FIADD short 709 [ EA|S, S|C ], // FIMUL short 710 [ EA|S, C ], // FICOM short 711 [ EA|S, S|C ], // FICOMP short 712 [ EA|S, S|C ], // FISUB short 713 [ EA|S, S|C ], // FISUBR short 714 [ EA|S, S|C ], // FIDIV short 715 [ EA|S, S|C ], // FIDIVR 
short 716 ], 717 [ 718 // 0xDF 719 [ EA, S|C ], // FILD short 720 [ S, EA|S|C ], // FISTTP short 721 [ S, EA|C ], // FIST short 722 [ S, EA|S|C ], // FISTP short 723 [ EA, S|C ], // FBLD packed BCD 724 [ EA, S|C ], // FILD long long 725 [ S, EA|S|C ], // FBSTP packed BCD 726 [ S, EA|S|C ], // FISTP long long 727 ] 728 ]; 729 730 731 /******************************************** 732 * Micro-ops for floating point opcodes 0xD8..0xDF, with Irm < 0xC0. 733 */ 734 735 extern (D) private immutable ubyte[8][8] uopsgrpf1 = 736 [ 737 [ 738 // 0xD8 739 2, // FADD float 740 2, // FMUL float 741 2, // FCOM float 742 2, // FCOMP float 743 2, // FSUB float 744 2, // FSUBR float 745 2, // FDIV float 746 2, // FDIVR float 747 ], 748 [ 749 // 0xD9 750 1, // FLD float 751 0, // 752 2, // FST float 753 2, // FSTP float 754 5, // FLDENV 755 3, // FLDCW 756 5, // FSTENV 757 5, // FSTCW 758 ], 759 [ 760 // 0xDA 761 5, // FIADD long 762 5, // FIMUL long 763 5, // FICOM long 764 5, // FICOMP long 765 5, // FISUB long 766 5, // FISUBR long 767 5, // FIDIV long 768 5, // FIDIVR long 769 ], 770 [ 771 // 0xDB 772 4, // FILD long 773 0, // 774 4, // FIST long 775 4, // FISTP long 776 0, // 777 4, // FLD real80 778 0, // 779 5, // FSTP real80 780 ], 781 [ 782 // 0xDC 783 2, // FADD double 784 2, // FMUL double 785 2, // FCOM double 786 2, // FCOMP double 787 2, // FSUB double 788 2, // FSUBR double 789 2, // FDIV double 790 2, // FDIVR double 791 ], 792 [ 793 // 0xDD 794 1, // FLD double 795 0, // 796 2, // FST double 797 2, // FSTP double 798 5, // FRSTOR 799 0, // 800 5, // FSAVE 801 5, // FSTSW 802 ], 803 [ 804 // 0xDE 805 5, // FIADD short 806 5, // FIMUL short 807 5, // FICOM short 808 5, // FICOMP short 809 5, // FISUB short 810 5, // FISUBR short 811 5, // FIDIV short 812 5, // FIDIVR short 813 ], 814 [ 815 // 0xDF 816 4, // FILD short 817 0, // 818 4, // FIST short 819 4, // FISTP short 820 5, // FBLD packed BCD 821 4, // FILD long long 822 5, // FBSTP packed BCD 823 4, // FISTP long 
long 824 ] 825 ]; 826 827 /************************************************** 828 * Determine number of micro-ops for Pentium Pro and Pentium II processors. 829 * 0 means special case, 830 * 5 means 'complex' 831 */ 832 833 extern (D) private immutable ubyte[256] insuops = 834 [ 0,0,0,0, 1,1,4,5, /* 00 */ 835 0,0,0,0, 1,1,4,0, /* 08 */ 836 0,0,0,0, 2,2,4,5, /* 10 */ 837 0,0,0,0, 2,2,4,5, /* 18 */ 838 0,0,0,0, 1,1,0,1, /* 20 */ 839 0,0,0,0, 1,1,0,1, /* 28 */ 840 0,0,0,0, 1,1,0,1, /* 30 */ 841 0,0,0,0, 1,1,0,1, /* 38 */ 842 1,1,1,1, 1,1,1,1, /* 40 */ 843 1,1,1,1, 1,1,1,1, /* 48 */ 844 3,3,3,3, 3,3,3,3, /* 50 */ 845 2,2,2,2, 3,2,2,2, /* 58 */ 846 5,5,5,5, 0,0,0,0, /* 60 */ 847 3,3,0,0, 5,5,5,5, /* 68 */ 848 1,1,1,1, 1,1,1,1, /* 70 */ 849 1,1,1,1, 1,1,1,1, /* 78 */ 850 0,0,0,0, 0,0,0,0, /* 80 */ 851 0,0,0,0, 0,1,4,0, /* 88 */ 852 1,3,3,3, 3,3,3,3, /* 90 */ 853 1,1,5,0, 5,5,1,1, /* 98 */ 854 1,1,2,2, 5,5,5,5, /* A0 */ 855 1,1,3,3, 2,2,3,3, /* A8 */ 856 1,1,1,1, 1,1,1,1, /* B0 */ 857 1,1,1,1, 1,1,1,1, /* B8 */ 858 0,0,5,4, 0,0,0,0, /* C0 */ 859 5,3,5,5, 5,3,5,5, /* C8 */ 860 0,0,0,0, 4,3,0,2, /* D0 */ 861 0,0,0,0, 0,0,0,0, /* D8 */ 862 4,4,4,2, 5,5,5,5, /* E0 */ 863 4,1,5,1, 5,5,5,5, /* E8 */ 864 0,0,5,5, 5,1,0,0, /* F0 */ 865 1,1,5,5, 4,4,0,0, /* F8 */ 866 ]; 867 868 extern (D) private immutable ubyte[8] uopsx = [ 1,1,2,5,1,1,1,5 ]; 869 870 /************************************************ 871 * Determine number of micro-ops for Pentium Pro and Pentium II processors. 872 * 5 means 'complex'. 
 * Doesn't currently handle:
 *      floating point
 *      MMX
 *      0F opcodes
 *      prefix bytes
 *
 * Looks the opcode up in insuops[]; a 0 entry is resolved by the switch
 * below from the mod and reg fields of c.Irm. Anything still 0 afterwards
 * is reported as 5.
 * Params:
 *      c = instruction to classify; c.Iop and c.Irm are read
 * Returns:
 *      micro-op count, never 0
 */

private int uops(code *c)
{   int n;
    int op;
    int op2;

    op = c.Iop & 0xFF;
    if ((c.Iop & 0xFF00) == 0x0F00)     // two-byte opcode: use the 0x0F table slot
        op = 0x0F;
    n = insuops[op];
    if (!n)                             // if special case
    {   ubyte irm,mod,reg,rm;

        // Split the modregrm byte; mod == 3 means a register operand
        irm = c.Irm;
        mod = (irm >> 6) & 3;
        reg = (irm >> 3) & 7;
        rm = irm & 7;

        switch (op)
        {
            case 0x10:
            case 0x11:                  // ADC rm,r
            case 0x18:
            case 0x19:                  // SBB rm,r
                n = (mod == 3) ? 2 : 4;
                break;

            case 0x12:
            case 0x13:                  // ADC r,rm
            case 0x1A:
            case 0x1B:                  // SBB r,rm
                n = (mod == 3) ? 2 : 3;
                break;

            case 0x00:
            case 0x01:                  // ADD rm,r
            case 0x08:
            case 0x09:                  // OR rm,r
            case 0x20:
            case 0x21:                  // AND rm,r
            case 0x28:
            case 0x29:                  // SUB rm,r
            case 0x30:
            case 0x31:                  // XOR rm,r
                n = (mod == 3) ? 1 : 4;
                break;

            case 0x02:
            case 0x03:                  // ADD r,rm
            case 0x0A:
            case 0x0B:                  // OR r,rm
            case 0x22:
            case 0x23:                  // AND r,rm
            case 0x2A:
            case 0x2B:                  // SUB r,rm
            case 0x32:
            case 0x33:                  // XOR r,rm
            case 0x38:
            case 0x39:                  // CMP rm,r
            case 0x3A:
            case 0x3B:                  // CMP r,rm
            case 0x69:                  // IMUL rm,r,imm
            case 0x6B:                  // IMUL rm,r,imm8
            case 0x84:
            case 0x85:                  // TEST rm,r
                n = (mod == 3) ? 1 : 2;
                break;

            case 0x80:
            case 0x81:
            case 0x82:
            case 0x83:
                if (reg == 2 || reg == 3)       // ADC/SBB rm,imm
                    n = (mod == 3) ? 2 : 4;
                else if (reg == 7)              // CMP rm,imm
                    n = (mod == 3) ? 1 : 2;
                else
                    n = (mod == 3) ? 1 : 4;
                break;

            case 0x86:
            case 0x87:                          // XCHG rm,r
                n = (mod == 3) ? 3 : 5;
                break;

            case 0x88:
            case 0x89:                          // MOV rm,r
                n = (mod == 3) ? 1 : 2;
                break;

            case 0x8A:
            case 0x8B:                          // MOV r,rm
                n = 1;
                break;

            case 0x8C:                          // MOV rm,Sreg
                n = (mod == 3) ? 1 : 3;
                break;

            case 0x8F:
                if (reg == 0)                   // POP m
                    n = 5;
                break;

            case 0xC6:
            case 0xC7:
                if (reg == 0)                   // MOV rm,imm
                    n = (mod == 3) ? 1 : 2;
                break;

            case 0xD0:
            case 0xD1:
                if (reg == 2 || reg == 3)       // RCL/RCR rm,1
                    n = (mod == 3) ? 2 : 4;
                else
                    n = (mod == 3) ? 1 : 4;
                break;

            case 0xC0:
            case 0xC1:                          // RCL/RCR rm,imm8
            case 0xD2:
            case 0xD3:
                if (reg == 2 || reg == 3)       // RCL/RCR rm,CL
                    n = 5;
                else
                    n = (mod == 3) ? 1 : 4;
                break;

            case 0xD8:
            case 0xD9:
            case 0xDA:
            case 0xDB:
            case 0xDC:
            case 0xDD:
            case 0xDE:
            case 0xDF:
                // Floating point opcodes
                if (irm < 0xC0)
                {   n = uopsgrpf1[op - 0xD8][reg];      // memory operand form
                    break;
                }
                // Register form: start from the per-opcode default, then
                // override the specific encodings listed below
                n = uopsx[op - 0xD8];
                switch (op)
                {
                    case 0xD9:
                        switch (irm)
                        {
                            case 0xE0:          // FCHS
                                n = 3;
                                break;
                            case 0xE8:
                            case 0xE9:
                            case 0xEA:
                            case 0xEB:
                            case 0xEC:
                            case 0xED:
                                n = 2;
                                break;
                            case 0xF0:
                            case 0xF1:
                            case 0xF2:
                            case 0xF3:
                            case 0xF4:
                            case 0xF5:
                            case 0xF8:
                            case 0xF9:
                            case 0xFB:
                            case 0xFC:
                            case 0xFD:
                            case 0xFE:
                            case 0xFF:
                                n = 5;
                                break;

                            default:
                                break;
                        }
                        break;
                    case 0xDE:
                        if (irm == 0xD9)        // FCOMPP
                            n = 2;
                        break;

                    default:
                        break;
                }
                break;

            case 0xF6:
                if (reg == 6 || reg == 7)       // DIV AL,rm8
                    n = (mod == 3) ? 3 : 4;
                else if (reg == 4 || reg == 5 || reg == 0)      // MUL/IMUL/TEST rm8
                    n = (mod == 3) ? 1 : 2;
                else if (reg == 2 || reg == 3)  // NOT/NEG rm
                    n = (mod == 3) ? 1 : 4;
                break;

            case 0xF7:
                if (reg == 6 || reg == 7)       // DIV EAX,rm
                    n = 4;
                else if (reg == 4 || reg == 5)  // MUL/IMUL rm
                    n = (mod == 3) ? 3 : 4;
                else if (reg == 2 || reg == 3)  // NOT/NEG rm
                    n = (mod == 3) ? 1 : 4;
                break;

            case 0xFF:
                if (reg == 2 || reg == 3 ||     // CALL rm, CALL m,rm
                    reg == 5)                   // JMP seg:offset
                    n = 5;
                else if (reg == 4)
                    n = (mod == 3) ? 1 : 2;
                else if (reg == 0 || reg == 1)  // INC/DEC rm
                    n = (mod == 3) ? 1 : 4;
                else if (reg == 6)              // PUSH rm
                    n = (mod == 3) ? 3 : 4;
                break;

            case 0x0F:
                op2 = c.Iop & 0xFF;             // second byte of the two-byte opcode
                if ((op2 & 0xF0) == 0x80)       // Jcc
                {   n = 1;
                    break;
                }
                if ((op2 & 0xF0) == 0x90)       // SETcc
                {   n = (mod == 3) ? 1 : 3;
                    break;
                }
                if (op2 == 0xB6 || op2 == 0xB7 ||       // MOVZX
                    op2 == 0xBE || op2 == 0xBF)         // MOVSX
                {   n = 1;
                    break;
                }
                if (op2 == 0xAF)                        // IMUL r,m
                {   n = (mod == 3) ? 1 : 2;
                    break;
                }
                break;

            default:
                break;
        }
    }
    if (n == 0)
        n = 5;                  // copout for now
    return n;
}

/******************************************
 * Determine pairing classification.
 * Don't deal with floating point, just assume they are all NP (Not Pairable).
 *
 * Starts from pentcycl[] for the low opcode byte (index 0x0F for two-byte
 * opcodes) and then adjusts the class for specific opcodes using the
 * mod/reg/rm fields of c.Irm.
 * Params:
 *      c = instruction to classify; c.Iop, c.Irm, c.Isib and c.Iflags are read
 * Returns:
 *      NP,UV,PU,PV optionally OR'd with PE
 *      (the code below can also return values containing PF or FX)
 */

private int pair_class(code *c)
{   ubyte op;
    ubyte irm,mod,reg,rm;
    uint a32;
    int pc;

    // Of course, with Intel this is *never* simple, and Intel's
    // documentation is vague about the specifics.

    op = c.Iop & 0xFF;
    if ((c.Iop & 0xFF00) == 0x0F00)
        op = 0x0F;
    pc = pentcycl[op];
    // a32 starts as I32 and is toggled when the instruction carries CFaddrsize
    a32 = I32;
    if (c.Iflags & CFaddrsize)
        a32 ^= 1;
    irm = c.Irm;
    mod = (irm >> 6) & 3;
    reg = (irm >> 3) & 7;
    rm = irm & 7;
    switch (op)
    {
        case 0x0F:                      // 2 byte opcode
            if ((c.Iop & 0xF0) == 0x80) // if Jcc
                pc = PV | PF;
            break;

        case 0x80:
        case 0x81:
        case 0x83:
            // Both paths jump into the body of the 0xC6/0xC7 case below so
            // that the displacement check at L2 is applied to them too.
            if (reg == 2 ||             // ADC EA,immed
                reg == 3)               // SBB EA,immed
            {   pc = PU;
                goto L2;
            }
            goto L1;                    // AND/OR/XOR/ADD/SUB/CMP EA,immed

        case 0x84:
        case 0x85:                      // TEST EA,reg
            if (mod == 3)               // TEST reg,reg
                pc = UV;
            break;

        case 0xC0:
        case 0xC1:
            if (reg >= 4)
                pc = PU;
            break;

        case 0xC6:
        case 0xC7:
            if (reg == 0)               // MOV EA,immed
            {
        L1:                             // also entered from case 0x80/0x81/0x83
                pc = UV;
        L2:                             // entered with pc already set to PU for ADC/SBB
                // if EA contains a displacement then
                // can't execute in V, or pair in U
                switch (mod)
                {   case 0:
                        if (a32)
                        {   if (rm == 5 ||
                                (rm == 4 && (c.Isib & 7) == 5)
                               )
                                pc = NP;
                        }
                        else if (rm == 6)
                            pc = NP;
                        break;
                    case 1:
                    case 2:
                        pc = NP;
                        break;

                    default:
                        break;
                }
            }
            break;

        case 0xD9:
            if (irm < 0xC0)
            {
                if (reg == 0)
                    pc = FX;
            }
            else if (irm < 0xC8)
                pc = FX;
            else if (irm < 0xD0)
                pc = PV;
            else
            {
                switch (irm)
                {
                    case 0xE0:
                    case 0xE1:
                    case 0xE4:
                        pc = FX;
                        break;

                    default:
                        break;
                }
            }
            break;

        case 0xDB:
            if (irm < 0xC0 && (reg == 0 || reg == 5))
                pc = FX;
            break;

        case 0xDD:
            if (irm < 0xC0)
            {
                if (reg == 0)
                    pc = FX;
            }
            else if (irm >= 0xE0 && irm < 0xF0)
                pc = FX;
            break;

        case 0xDF:
            if (irm < 0xC0 && (reg == 0 || reg == 5))
                pc = FX;
            break;

        case 0xFE:
            if (reg == 0 || reg == 1)   // INC/DEC EA
                pc = UV;
            break;
        case 0xFF:
            if (reg == 0 || reg == 1)   // INC/DEC EA
                pc = UV;
            else if (reg == 2 || reg == 4) // CALL/JMP near ptr EA
                pc = PE|PV;
            else if (reg == 6 && mod == 3) // PUSH reg
                pc = PE | UV;
            break;

        default:
            break;
    }
    // A prefixed instruction that would otherwise be UV is restricted to U
    if (c.Iflags & CFPREFIX && pc == UV)   // if prefix byte
        pc = PU;
    return pc;
}

/******************************************
 * For an instruction, determine what is read
 * and what is written, and what is used for addressing.
 * Determine operand size if EA (larger is ok).
 */

@trusted
private void getinfo(Cinfo *ci,code *c)
{
    memset(ci,0,Cinfo.sizeof);
    if (!c)
        return;
    ci.c = c;

    if (PRO)
    {
        ci.uops = cast(ubyte)uops(c);
        ci.isz = cast(ubyte)calccodsize(c);
    }
    else
        ci.pair = cast(ubyte)pair_class(c);

    ubyte op;
    ubyte op2;
    ubyte irm,mod,reg,rm;
    uint a32;
    int pc;
    uint r,w;
    int sz = I32 ?
4 : 2; 1310 1311 ci.r = 0; 1312 ci.w = 0; 1313 ci.a = 0; 1314 op = c.Iop & 0xFF; 1315 if ((c.Iop & 0xFF00) == 0x0F00) 1316 op = 0x0F; 1317 //printf("\tgetinfo %x, op %x \n",c,op); 1318 pc = pentcycl[op]; 1319 a32 = I32; 1320 if (c.Iflags & CFaddrsize) 1321 a32 ^= 1; 1322 if (c.Iflags & CFopsize) 1323 sz ^= 2 | 4; 1324 irm = c.Irm; 1325 mod = (irm >> 6) & 3; 1326 reg = (irm >> 3) & 7; 1327 rm = irm & 7; 1328 1329 r = oprw[op][0]; 1330 w = oprw[op][1]; 1331 1332 switch (op) 1333 { 1334 case 0x50: 1335 case 0x51: 1336 case 0x52: 1337 case 0x53: 1338 case 0x55: 1339 case 0x56: 1340 case 0x57: // PUSH reg 1341 ci.flags |= CIFL.push; 1342 goto Lpush; 1343 1344 case 0x54: // PUSH ESP 1345 case 0x6A: // PUSH imm8 1346 case 0x68: // PUSH imm 1347 case 0x0E: 1348 case 0x16: 1349 case 0x1E: 1350 case 0x06: 1351 case 0x9C: 1352 Lpush: 1353 ci.spadjust = -sz; 1354 ci.a |= mSP; 1355 break; 1356 1357 case 0x58: 1358 case 0x59: 1359 case 0x5A: 1360 case 0x5B: 1361 case 0x5C: 1362 case 0x5D: 1363 case 0x5E: 1364 case 0x5F: // POP reg 1365 case 0x1F: 1366 case 0x07: 1367 case 0x17: 1368 case 0x9D: // POPF 1369 Lpop: 1370 ci.spadjust = sz; 1371 ci.a |= mSP; 1372 break; 1373 1374 case 0x80: 1375 if (reg == 7) // CMP 1376 c.Iflags |= CFpsw; 1377 r = B | grprw[0][reg][0]; // Grp 1 (byte) 1378 w = B | grprw[0][reg][1]; 1379 break; 1380 1381 case 0x81: 1382 case 0x83: 1383 if (reg == 7) // CMP 1384 c.Iflags |= CFpsw; 1385 else if (irm == modregrm(3,0,SP)) // ADD ESP,imm 1386 { 1387 assert(c.IFL2 == FLconst); 1388 ci.spadjust = (op == 0x81) ? c.IEV2.Vint : cast(byte)c.IEV2.Vint; 1389 } 1390 else if (irm == modregrm(3,5,SP)) // SUB ESP,imm 1391 { 1392 assert(c.IFL2 == FLconst); 1393 ci.spadjust = (op == 0x81) ? 
-c.IEV2.Vint : -cast(int)cast(byte)c.IEV2.Vint; 1394 } 1395 r = grprw[0][reg][0]; // Grp 1 1396 w = grprw[0][reg][1]; 1397 break; 1398 1399 case 0x8F: 1400 if (reg == 0) // POP rm 1401 goto Lpop; 1402 break; 1403 1404 case 0xA0: 1405 case 0xA1: 1406 case 0xA2: 1407 case 0xA3: 1408 // Fake having an EA to simplify code in conflict() 1409 ci.flags |= CIFL.ea; 1410 ci.reg = 0; 1411 ci.sibmodrm = a32 ? modregrm(0,0,5) : modregrm(0,0,6); 1412 c.IFL1 = c.IFL2; 1413 c.IEV1 = c.IEV2; 1414 break; 1415 1416 case 0xC2: 1417 case 0xC3: 1418 case 0xCA: 1419 case 0xCB: // RET 1420 ci.a |= mSP; 1421 break; 1422 1423 case 0xE8: 1424 if (c.Iflags & CFclassinit) // call to __j_classinit 1425 { r = 0; 1426 w = F; 1427 1428 version (CLASSINIT2) 1429 ci.pair = UV; // it is patched to CMP EAX,0 1430 else 1431 ci.pair = NP; 1432 1433 } 1434 break; 1435 1436 case 0xF6: 1437 r = grprw[3][reg][0]; // Grp 3, byte version 1438 w = grprw[3][reg][1]; 1439 break; 1440 1441 case 0xF7: 1442 r = grprw[1][reg][0]; // Grp 3 1443 w = grprw[1][reg][1]; 1444 break; 1445 1446 case 0x0F: 1447 op2 = c.Iop & 0xFF; 1448 if ((op2 & 0xF0) == 0x80) // if Jxx instructions 1449 { 1450 ci.r = F | N; 1451 ci.w = N; 1452 goto Lret; 1453 } 1454 ci.r = N; 1455 ci.w = N; // copout for now 1456 goto Lret; 1457 1458 case 0xD7: // XLAT 1459 ci.a = mAX | mBX; 1460 break; 1461 1462 case 0xFF: 1463 r = grprw[2][reg][0]; // Grp 5 1464 w = grprw[2][reg][1]; 1465 if (reg == 6) // PUSH rm 1466 goto Lpush; 1467 break; 1468 1469 case 0x38: 1470 case 0x39: 1471 case 0x3A: 1472 case 0x3B: 1473 case 0x3C: // CMP AL,imm8 1474 case 0x3D: // CMP EAX,imm32 1475 // For CMP opcodes, always test for flags 1476 c.Iflags |= CFpsw; 1477 break; 1478 1479 case ESCAPE: 1480 if (c.Iop == (ESCAPE | ESCadjfpu)) 1481 ci.fpuadjust = c.IEV1.Vint; 1482 break; 1483 1484 case 0xD0: 1485 case 0xD1: 1486 case 0xD2: 1487 case 0xD3: 1488 case 0xC0: 1489 case 0xC1: 1490 if (reg == 2 || reg == 3) // if RCL or RCR 1491 c.Iflags |= CFpsw; // always test for flags 
1492 break; 1493 1494 case 0xD8: 1495 case 0xD9: 1496 case 0xDA: 1497 case 0xDB: 1498 case 0xDC: 1499 case 0xDD: 1500 case 0xDE: 1501 case 0xDF: 1502 if (irm < 0xC0) 1503 { r = grpf1[op - 0xD8][reg][0]; 1504 w = grpf1[op - 0xD8][reg][1]; 1505 switch (op) 1506 { 1507 case 0xD8: 1508 if (reg == 3) // if FCOMP 1509 ci.fpuadjust = -1; 1510 else 1511 ci.fp_op = FP.fop; 1512 break; 1513 1514 case 0xD9: 1515 if (reg == 0) // if FLD float 1516 { ci.fpuadjust = 1; 1517 ci.fp_op = FP.fld; 1518 } 1519 else if (reg == 3) // if FSTP float 1520 { ci.fpuadjust = -1; 1521 ci.fp_op = FP.fstp; 1522 } 1523 else if (reg == 5 || reg == 7) 1524 sz = 2; 1525 else if (reg == 4 || reg == 6) 1526 sz = 28; 1527 break; 1528 case 0xDA: 1529 if (reg == 3) // if FICOMP 1530 ci.fpuadjust = -1; 1531 break; 1532 case 0xDB: 1533 if (reg == 0 || reg == 5) 1534 { ci.fpuadjust = 1; 1535 ci.fp_op = FP.fld; // FILD / FLD long double 1536 } 1537 if (reg == 3 || reg == 7) 1538 ci.fpuadjust = -1; 1539 if (reg == 7) 1540 ci.fp_op = FP.fstp; // FSTP long double 1541 if (reg == 5 || reg == 7) 1542 sz = 10; 1543 break; 1544 case 0xDC: 1545 sz = 8; 1546 if (reg == 3) // if FCOMP 1547 ci.fpuadjust = -1; 1548 else 1549 ci.fp_op = FP.fop; 1550 break; 1551 case 0xDD: 1552 if (reg == 0) // if FLD double 1553 { ci.fpuadjust = 1; 1554 ci.fp_op = FP.fld; 1555 } 1556 if (reg == 3) // if FSTP double 1557 { ci.fpuadjust = -1; 1558 ci.fp_op = FP.fstp; 1559 } 1560 if (reg == 7) 1561 sz = 2; 1562 else if (reg == 4 || reg == 6) 1563 sz = 108; 1564 else 1565 sz = 8; 1566 break; 1567 case 0xDE: 1568 sz = 2; 1569 if (reg == 3) // if FICOMP 1570 ci.fpuadjust = -1; 1571 break; 1572 case 0xDF: 1573 sz = 2; 1574 if (reg == 4 || reg == 6) 1575 sz = 10; 1576 else if (reg == 5 || reg == 7) 1577 sz = 8; 1578 if (reg == 0 || reg == 4 || reg == 5) 1579 ci.fpuadjust = 1; 1580 else if (reg == 3 || reg == 6 || reg == 7) 1581 ci.fpuadjust = -1; 1582 break; 1583 1584 default: 1585 break; 1586 } 1587 break; 1588 } 1589 else if (op == 0xDE) 1590 
{ ci.fpuadjust = -1; // pop versions of Fop's 1591 if (irm == 0xD9) 1592 ci.fpuadjust = -2; // FCOMPP 1593 } 1594 1595 // Most floating point opcodes aren't staged, but are 1596 // sent right through, in order to make use of the large 1597 // latencies with floating point instructions. 1598 if (ci.fp_op == FP.fld || 1599 (op == 0xD9 && (irm & 0xF8) == 0xC0)) 1600 { } // FLD ST(i) 1601 else 1602 ci.flags |= CIFL.nostage; 1603 1604 switch (op) 1605 { 1606 case 0xD8: 1607 r = S; 1608 w = C; 1609 if ((irm & ~7) == 0xD0) 1610 w |= S; 1611 break; 1612 case 0xD9: 1613 // FCHS or FABS or FSQRT 1614 if (irm == 0xE0 || irm == 0xE1 || irm == 0xFA) 1615 ci.fp_op = FP.fop; 1616 r = S; 1617 w = S|C; 1618 break; 1619 case 0xDA: 1620 if (irm == 0xE9) // FUCOMPP 1621 { r = S; 1622 w = S|C; 1623 break; 1624 } 1625 break; 1626 case 0xDB: 1627 if (irm == 0xE2) // FCLEX 1628 { r = 0; 1629 w = C; 1630 break; 1631 } 1632 if (irm == 0xE3) // FINIT 1633 { r = 0; 1634 w = S|C; 1635 break; 1636 } 1637 break; 1638 case 0xDC: 1639 case 0xDE: 1640 if ((irm & 0xF0) != 0xD0) 1641 { r = S; 1642 w = S|C; 1643 break; 1644 } 1645 break; 1646 case 0xDD: 1647 // Not entirely correct, but conservative 1648 r = S; 1649 w = S|C; 1650 break; 1651 case 0xDF: 1652 if (irm == 0xE0) // FSTSW AX 1653 { r = C; 1654 w = mAX; 1655 break; 1656 } 1657 break; 1658 1659 default: 1660 break; 1661 } 1662 break; 1663 1664 default: 1665 //printf("\t\tNo special case\n"); 1666 break; 1667 } 1668 1669 if ((r | w) & B) // if byte operation 1670 sz = 1; // operand size is 1 1671 1672 ci.r = r & ~(R | EA); 1673 ci.w = w & ~(R | EA); 1674 if (r & R) 1675 ci.r |= mask((r & B) ? (reg & 3) : reg); 1676 if (w & R) 1677 ci.w |= mask((w & B) ? 
(reg & 3) : reg); 1678 1679 // OR in bits for EA addressing mode 1680 if ((r | w) & EA) 1681 { ubyte sib; 1682 1683 sib = 0; 1684 switch (mod) 1685 { 1686 case 0: 1687 if (a32) 1688 { 1689 if (rm == 4) 1690 { 1691 sib = c.Isib; 1692 if ((sib & modregrm(0,7,0)) != modregrm(0,4,0)) 1693 ci.a |= mask((sib >> 3) & 7); // index register 1694 if ((sib & 7) != 5) 1695 ci.a |= mask(sib & 7); // base register 1696 } 1697 else if (rm != 5) 1698 ci.a |= mask(rm); 1699 } 1700 else 1701 { 1702 immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,0,mBX]; 1703 ci.a |= ea16[rm]; 1704 } 1705 goto Lmem; 1706 1707 case 1: 1708 case 2: 1709 if (a32) 1710 { 1711 if (rm == 4) 1712 { 1713 sib = c.Isib; 1714 if ((sib & modregrm(0,7,0)) != modregrm(0,4,0)) 1715 ci.a |= mask((sib >> 3) & 7); // index register 1716 ci.a |= mask(sib & 7); // base register 1717 } 1718 else 1719 ci.a |= mask(rm); 1720 } 1721 else 1722 { 1723 immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,mBP,mBX]; 1724 ci.a |= ea16[rm]; 1725 } 1726 1727 Lmem: 1728 if (r & EA) 1729 ci.r |= mMEM; 1730 if (w & EA) 1731 ci.w |= mMEM; 1732 ci.flags |= CIFL.ea; 1733 break; 1734 1735 case 3: 1736 if (r & EA) 1737 ci.r |= mask((r & B) ? (rm & 3) : rm); 1738 if (w & EA) 1739 ci.w |= mask((w & B) ? 
(rm & 3) : rm); 1740 break; 1741 1742 default: 1743 assert(0); 1744 } 1745 // Adjust sibmodrm so that addressing modes can be compared simply 1746 irm &= modregrm(3,0,7); 1747 if (a32) 1748 { 1749 if (irm != modregrm(0,0,5)) 1750 { 1751 switch (mod) 1752 { 1753 case 0: 1754 if ((sib & 7) != 5) // if not disp32[index] 1755 { 1756 c.IFL1 = FLconst; 1757 c.IEV1.Vpointer = 0; 1758 irm |= 0x80; 1759 } 1760 break; 1761 case 1: 1762 c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer; 1763 irm = modregrm(2, 0, rm); 1764 break; 1765 1766 default: 1767 break; 1768 } 1769 } 1770 } 1771 else 1772 { 1773 if (irm != modregrm(0,0,6)) 1774 { 1775 switch (mod) 1776 { 1777 case 0: 1778 c.IFL1 = FLconst; 1779 c.IEV1.Vpointer = 0; 1780 irm |= 0x80; 1781 break; 1782 case 1: 1783 c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer; 1784 irm = modregrm(2, 0, rm); 1785 break; 1786 1787 default: 1788 break; 1789 } 1790 } 1791 } 1792 1793 ci.r |= ci.a; 1794 ci.reg = reg; 1795 ci.sibmodrm = (sib << 8) | irm; 1796 } 1797 Lret: 1798 if (ci.w & mSP) // if stack pointer is modified 1799 ci.w |= mMEM; // then we are implicitly writing to memory 1800 if (op == LEA) // if LEA 1801 ci.r &= ~mMEM; // memory is not actually read 1802 ci.sz = cast(ubyte)sz; 1803 1804 //printf("\t\t"); ci.print(); 1805 } 1806 1807 /****************************************** 1808 * Determine if two instructions can pair. 1809 * Assume that in general, cu can pair in the U pipe and cv in the V. 1810 * Look for things like register contentions. 
* Input:
 *      cu      instruction for U pipe
 *      cv      instruction for V pipe
 * Returns:
 *      !=0 if they can pair
 *
 * The checks are made in this order:
 *  1. cu must carry PU; failing that, the only way to pair is for cu to
 *     carry FX while cv is opcode 0xD9 with Irm in 0xC8..0xCF (FXCH)
 *  2. cv must carry PV
 *  3. nothing cu writes (flags and memory aside) may be read or written
 *     by cv, except an overlap of exactly mSP when both carry PE
 *  4. flags written by cu and read by cv are only tolerated if cv carries PF
 */

private int pair_test(Cinfo *cu,Cinfo *cv)
{
    const uint upair = cu.pair;
    if (!(upair & PU))
    {
        // Not U-pairable on its own: only the "pairs with FXCH" case is left
        return (upair & FX) &&
               cv.c.Iop == 0xD9 &&
               (cv.c.Irm & ~7) == 0xC8;
    }

    const uint vpair = cv.pair;
    if (!(vpair & PV))
        return 0;

    // Registers written by cu that cv touches; F and mMEM are ignored here
    const uint clash = cu.w & (cv.r | cv.w) & ~(F|mMEM);
    const bool spException = clash == mSP && (upair & vpair & PE);
    if (clash && !spException)
        return 0;

    // Flags contention: cu writes flags that cv reads
    if ((cu.w & cv.r & F) && !(vpair & PF))
        return 0;

    return 1;
}

/******************************************
 * Determine if c2 computes an address from a register that c1 writes
 * (an AGI).
 * An overlap consisting of exactly mSP is not reported when both
 * instructions carry the PE pairing flag.
 * Returns:
 *      !=0 if they have an AGI
 */

private int pair_agi(Cinfo *c1, Cinfo *c2)
{
    const uint overlap = c1.w & c2.a;
    if (!overlap)
        return 0;
    if (overlap == mSP && (c1.pair & c2.pair & PE))
        return 0;
    return 1;
}

/********************************************
 * Determine if three instructions can decode simultaneously
 * in Pentium Pro and Pentium II.
 * The group is rejected when any instruction size (isz) exceeds 7, when
 * the sizes sum to more than 16, or when c1 or c2 needs more than one
 * micro-op (only decoder 0 may take a multi-uop instruction: 4-1-1).
 * Input:
 *      c0,c1,c2        candidates for decoders 0,1,2
 *                      c0 must not be null
 *                      c1 null yields 0
 *                      c2 can be null
 * Returns:
 *      !=0 if they can decode simultaneously
 */

private int triple_test(Cinfo *c0, Cinfo *c1, Cinfo *c2)
{
    assert(c0);
    if (c1 is null)
        return 0;

    const int sz0 = c0.isz;
    const int sz1 = c1.isz;
    const int sz2 = c2 ? c2.isz : 0;

    const bool anyTooLong = sz0 > 7 || sz1 > 7 || sz2 > 7;
    if (anyTooLong || sz0 + sz1 + sz2 > 16)
        return 0;

    // 4-1-1 decode
    const bool c1Simple = c1.uops <= 1;
    const bool c2Simple = !c2 || c2.uops <= 1;
    return c1Simple && c2Simple;
}

/********************************************
 * Get next instruction worth looking at for scheduling.
 * Walks forward with code_next(), passing over NOPs and
 * (ESCAPE | ESClinnum) entries; an instruction flagged CFtarg or
 * CFtarg2 is never passed over.
 * Returns:
 *      null    no more instructions
 */

private code * cnext(code *c)
{
    for (c = code_next(c); c; c = code_next(c))
    {
        const bool isTarget = (c.Iflags & (CFtarg | CFtarg2)) != 0;
        const bool isFiller = c.Iop == NOP ||
                              c.Iop == (ESCAPE | ESClinnum);
        if (isTarget || !isFiller)
            break;
    }
    return c;
}

/******************************************
 * Instruction scheduler.
 * Input:
 *      c               list of instructions to schedule
 *      scratch         scratch registers we can use
 * Returns:
 *      revised list of scheduled instructions
 */
// NOTE(review): the header above names parameters (c, scratch) that the
// function following it does not have; it looks like a leftover - confirm.

///////////////////////////////////
// Determine if c1 and c2 are swappable.
// c1 comes before c2.
// If they do not conflict
//      return 0
// If they do conflict
//      return 0x100 + delay_clocks
// Input:
//      ci1,ci2         gathered info for the earlier / later instruction
//      fpsched         if 0, FPU stack exchange handling is off
//                      if 1, then adjust fxch_pre and fxch_post to swap,
//                      then return 0
//                      if 2, then adjust ci1 as well as ci2
//                      (fpsched is forced to 0 unless both instructions
//                      have a non-zero fp_op)
// Notes:
//      Instructions flagged CFvolatile or CFvex always conflict, as does
//      anything whose read mask contains N.
//      When memory is the only thing in contention, a series of special
//      cases (PUSH vs. non-stack EA, distinct Irex, disjoint constant
//      offsets, CFunambig) decides whether the two references can overlap.

@trusted
private int conflict(Cinfo *ci1,Cinfo *ci2,int fpsched)
{
    code *c1;
    code *c2;
    uint r1,w1,a1;
    uint r2,w2,a2;
    int sz1,sz2;
    int i = 0;                          // set to 1 to get debug tracing
    int delay_clocks;

    c1 = ci1.c;
    c2 = ci2.c;

    //printf("conflict %x %x\n",c1,c2);

    r1 = ci1.r;
    w1 = ci1.w;
    a1 = ci1.a;
    sz1 = ci1.sz;

    r2 = ci2.r;
    w2 = ci2.w;
    a2 = ci2.a;
    sz2 = ci2.sz;

    //printf("r1 %lx w1 %lx a1 %lx sz1 %x\n",r1,w1,a1,sz1);
    //printf("r2 %lx w2 %lx a2 %lx sz2 %x\n",r2,w2,a2,sz2);

    if ((c1.Iflags | c2.Iflags) & (CFvolatile | CFvex))
        goto Lconflict;

    // Determine if we should handle FPU register conflicts separately
    //if (fpsched) printf("fp_op %d,%d:\n",ci1.fp_op,ci2.fp_op);
    if (fpsched && ci1.fp_op && ci2.fp_op)
    {
        w1 &= ~(S|C);
        r1 &= ~(S|C);
        w2 &= ~(S|C);
        r2 &= ~(S|C);
    }
    else
        fpsched = 0;

    if ((r1 | r2) & N)
    {
        goto Lconflict;
    }

    static if (0)
    {
        if (c1.Iop == 0xFF && c2.Iop == 0x8B)
        {   c1.print(); c2.print(); i = 1;
            printf("r1=%lx, w1=%lx, a1=%lx, sz1=%d, r2=%lx, w2=%lx, a2=%lx, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
        }
    }
L1:
    if (w1 & r2 || (r1 | w1) & w2)
    {   ubyte ifl1,ifl2;

        if (i) printf("test\n");

        static if (0)
        {
            if (c1.IFL1 != c2.IFL1) printf("t1\n");
            if ((c1.Irm & modregrm(3,0,7)) != (c2.Irm & modregrm(3,0,7))) printf("t2\n");
            if ((issib(c1.Irm) && c1.Isib != c2.Isib)) printf("t3\n");
            if (c1.IEV1.Vpointer + sz1 <= c2.IEV1.Vpointer) printf("t4\n");
            if (c2.IEV1.Vpointer + sz2 <= c1.IEV1.Vpointer) printf("t5\n");
        }

        // make sure CFpsw is reliably set
        if (w1 & w2 & F &&              // if both instructions write to flags
            w1 != F &&
            w2 != F &&
            !((r1 | r2) & F) &&         // but neither instruction reads them
            !((c1.Iflags | c2.Iflags) & CFpsw))       // and we don't care about flags
        {
            w1 &= ~F;
            w2 &= ~F;                   // remove conflict
            goto L1;                    // and try again
        }

        // If other than the memory reference is a conflict
        if (w1 & r2 & ~mMEM || (r1 | w1) & w2 & ~mMEM)
        {   if (i) printf("\t1\n");
            if (i) printf("r1=%x, w1=%x, a1=%x, sz1=%d, r2=%x, w2=%x, a2=%x, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
            goto Lconflict;
        }

        // If referring to distinct types, then no dependency
        if (c1.Irex && c2.Irex && c1.Irex != c2.Irex)
            goto Lswap;

        ifl1 = c1.IFL1;
        ifl2 = c2.IFL1;

        // Special case: Allow indexed references using registers other than
        // ESP and EBP to be swapped with PUSH instructions
        // (c1 is the PUSH, c2 is the memory reference)
        if (((c1.Iop & ~7) == 0x50 ||           // PUSH reg
             c1.Iop == 0x6A ||                  // PUSH imm8
             c1.Iop == 0x68 ||                  // PUSH imm16/imm32
             (c1.Iop == 0xFF && ci1.reg == 6)   // PUSH EA
            ) &&
            ci2.flags & CIFL.ea && !(a2 & mSP) &&
            !(a2 & mBP && cast(int)c2.IEV1.Vpointer < 0)
           )
        {
            if (c1.Iop == 0xFF)
            {
                if (!(w2 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // Mirror of the special case above: c2 is the PUSH and c1 is the
        // memory reference, so it is c1's addressing registers and c1's
        // offset that must be checked for a negative [EBP] displacement.
        if (((c2.Iop & ~7) == 0x50 ||           // PUSH reg
             c2.Iop == 0x6A ||                  // PUSH imm8
             c2.Iop == 0x68 ||                  // PUSH imm16/imm32
             (c2.Iop == 0xFF && ci2.reg == 6)   // PUSH EA
            ) &&
            ci1.flags & CIFL.ea && !(a1 & mSP) &&
            !(a1 & mBP && cast(int)c1.IEV1.Vpointer < 0)
           )
        {
            if (c2.Iop == 0xFF)
            {
                if (!(w1 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // If not both an EA addressing mode, conflict
        if (!(ci1.flags & ci2.flags & CIFL.ea))
        {   if (i) printf("\t2\n");
            goto Lconflict;
        }

        // Same addressing mode: the references are independent if they are
        // of different kinds or their [offset, offset+size) ranges are disjoint
        if (ci1.sibmodrm == ci2.sibmodrm)
        {   if (ifl1 != ifl2)
                goto Lswap;
            switch (ifl1)
            {
                case FLconst:
                    if (c1.IEV1.Vint != c2.IEV1.Vint &&
                        (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                         c2.IEV1.Vint + sz2 <= c1.IEV1.Vint))
                        goto Lswap;
                    break;
                case FLdatseg:
                    if (c1.IEV1.Vseg != c2.IEV1.Vseg ||
                        c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                        c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
                        goto Lswap;
                    break;

                default:
                    break;
            }
        }

        if ((c1.Iflags | c2.Iflags) & CFunambig &&
            (ifl1 != ifl2 ||
             ci1.sibmodrm != ci2.sibmodrm ||
             (c1.IEV1.Vint != c2.IEV1.Vint &&
              (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
               c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
             )
            )
           )
        {
            // Assume that [EBP] and [ESP] can point to the same location
            if (((a1 | a2) & (mBP | mSP)) == (mBP | mSP))
                goto Lconflict;
            goto Lswap;
        }

        if (i) printf("\t3\n");
        goto Lconflict;
    }

Lswap:
    if (fpsched)
    {
        //printf("\tfpsched %d,%d:\n",ci1.fp_op,ci2.fp_op);
        ubyte x1 = ci1.fxch_pre;
        ubyte y1 = ci1.fxch_post;
        ubyte x2 = ci2.fxch_pre;
        ubyte y2 = ci2.fxch_post;

        // Pack the two fp_op kinds into one value so they can be switched on
        static uint X(uint a, uint b) { return (a << 8) | b; }
        switch (X(ci1.fp_op,ci2.fp_op))
        {
            case X(FP.fstp, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2 == 0)
                    ci2.fxch_post++;
                else if (y2 == 1)
                {
                    ci2.fxch_pre++;
                    ci2.fxch_post++;
                }
                else
                {
                    goto Lconflict;
                }
                break;

            case X(FP.fstp, FP.fop):
                if (x1 || y1)
                    goto Lconflict;
                ci2.fxch_pre++;
                ci2.fxch_post++;
                break;

            case X(FP.fop, FP.fop):
                if (x1 == 0 && y1 == 1 && x2 == 0 && y2 == 0)
                {   ci2.fxch_pre = 1;
                    ci2.fxch_post = 1;
                    break;
                }
                if (x1 == 0 && y1 == 0 && x2 == 1 && y2 == 1)
                    break;
                goto Lconflict;

            case X(FP.fop, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2)
                    break;
                else if (fpsched == 2)
                    ci1.fxch_post = 1;
                // reached whenever y2 == 0, whatever fpsched is
                ci2.fxch_post = 1;
                break;

            default:
                goto Lconflict;
        }

        //printf("\tpre = %d, post = %d\n",ci2.fxch_pre,ci2.fxch_post);
    }

    //printf("w1 = x%x, w2 = x%x\n",w1,w2);
    if (i) printf("no conflict\n\n");
    return 0;

Lconflict:
    //printf("r1=%x, w1=%x, r2=%x, w2=%x\n",r1,w1,r2,w2);
    delay_clocks = 0;

    // Determine if AGI
    if (!PRO && pair_agi(ci1,ci2))
        delay_clocks = 1;

    // Special delays for floating point
    if (fpsched)
    {   if (ci1.fp_op == FP.fld && ci2.fp_op == FP.fstp)
            delay_clocks = 1;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fstp)
            delay_clocks = 3;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fop)
            delay_clocks = 2;
    }
    else if (PRO)
    {
        // Look for partial register write stalls
        if (w1 & r2 & ALLREGS && sz1 < sz2)
            delay_clocks = 7;
    }
    else if ((w1 | r1) & (w2 | r2) & (C | S))
    {
        int op = c1.Iop;
        if (ci1.fp_op == FP.fld ||
            (op == 0xD9 && (c1.Irm & 0xF8) == 0xC0)
           )
        { }                             // FLD
        else if (op == 0xD9 && (c1.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else if (c2.Iop == 0xD9 && (c2.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else
            delay_clocks = 3;
    }

    if (i) printf("conflict %d\n\n",delay_clocks);
    return 0x100 + delay_clocks;
}

enum TBLMAX = 2*3*20;           // must be divisible by both 2 and 3
                                // (U,V pipe in Pentium, 3 decode units
                                //  in Pentium Pro)

struct Schedule
{
nothrow:
    Cinfo*[TBLMAX] tbl;         // even numbers are U pipe, odd numbers are V
    int tblmax;                 // max number of slots used

    Cinfo[TBLMAX] cinfo;
    int cinfomax;

Barray!(Cinfo*) stagelist;  // list of instructions in staging area

    int fpustackused;           // number of slots in FPU stack that are used

    /******************************
     * Reset the scheduler to an empty state.
     * Every field is zeroed, then fpustackused is set.
     * NOTE(review): the memset also wipes stagelist; presumably any buffer
     * it held must already be released or empty - confirm against Barray.
     * Params:
     *  fpustackinit = number of FPU stack slots in use on entry
     */
    @trusted
    void initialize(int fpustackinit)
    {
        //printf("Schedule::initialize(fpustackinit = %d)\n", fpustackinit);
        memset(&this, 0, Schedule.sizeof);
        fpustackused = fpustackinit;
    }

    /******************************
     * Release the staging list.
     */
    @trusted
    void dtor()
    {
        stagelist.dtor();
    }

    /******************************
     * Reassemble the scheduled instructions into a linked list.
     * Staged instructions are first moved into the table for as long as
     * insert() accepts them; the table is then emitted slot by slot with
     * any FXCH prefix/postfix instructions generated around each entry;
     * finally whatever is still staged is appended in staging order.
     * fpustackused is advanced by the fpuadjust of everything emitted and
     * the staging list is emptied.
     * Params:
     *  pc = where to store the first instruction; *pc must be null
     * Returns:
     *  address of the (null) next field ending the emitted list
     */
    @trusted
    code **assemble(code **pc)
    {
        debug
        if (debugs) printf("assemble:\n");

        assert(!*pc);

        // Try to insert the rest of the staged instructions;
        // firstLeft stops at the first entry insert() turned down
        size_t firstLeft = 0;
        while (firstLeft < stagelist.length)
        {
            Cinfo* staged = stagelist[firstLeft];
            if (staged && !insert(staged))
                break;
            ++firstLeft;
        }

        // Get the instructions out of the schedule table
        assert(cast(uint)tblmax <= TBLMAX);
        foreach (i; 0 .. tblmax)
        {
            Cinfo* ci = tbl[i];

            debug
            if (debugs)
            {
                if (PRO)
                {
                    // one label per decode unit
                    immutable char[4][3] unitLabel = [ "0 "," 1 "," 2" ];

                    if (ci)
                        printf("%s %d ",unitLabel[i % 3].ptr,ci.uops);
                    else
                        printf("%s ",unitLabel[i % 3].ptr);
                }
                else
                {
                    printf((i & 1) ? " V " : "U ");
                }
                if (ci)
                    ci.c.print();
                else
                    printf("\n");
            }

            if (!ci)
                continue;               // empty slot

            fpustackused += ci.fpuadjust;
            //printf("stage()1: fpustackused = %d\n", fpustackused);

            code *c = ci.c;
            if (i == 0)
                c.Iflags |= CFtarg;     // by definition, first is always a jump target
            else
                c.Iflags &= ~CFtarg;    // the rest are not

            // Put in any FXCH prefix
            if (ci.fxch_pre)
            {
                assert(i);
                code *pre = gen2(null,0xD9,0xC8 + ci.fxch_pre);
                *pc = pre;
                pc = &pre.next;
            }

            // Link in c and advance pc to the end of whatever c drags along
            *pc = c;
            do
            {
                assert(*pc != code_next(*pc));
                pc = &(*pc).next;
            } while (*pc);

            // Put in any FXCH postfix
            if (ci.fxch_post)
            {
                // If the next occupied slot wants the same exchange as its
                // prefix, the two cancel each other out
                bool cancelled = false;
                foreach (j; i + 1 .. tblmax)
                {
                    Cinfo* following = tbl[j];
                    if (!following)
                        continue;
                    if (following.fxch_pre == ci.fxch_post)
                    {
                        following.fxch_pre = 0;
                        cancelled = true;
                    }
                    break;              // only the first occupied slot matters
                }

                if (!cancelled)
                {
                    code *post = gen2(null,0xD9,0xC8 + ci.fxch_post);
                    *pc = post;
                    pc = &post.next;
                }
            }
        }

        // Just append any instructions left in the staging area
        foreach (leftover; stagelist[firstLeft .. stagelist.length])
        {
            if (!leftover)
                continue;

            debug
            if (debugs) { printf("appending: "); leftover.c.print(); }

            *pc = leftover.c;
            do
            {
                pc = &(*pc).next;
            } while (*pc);
            fpustackused += leftover.fpuadjust;
            //printf("stage()2: fpustackused = %d\n", fpustackused);
        }
        stagelist.setLength(0);

        return pc;
    }

    /******************************
     * Insert c into scheduling table.
* Returns:
     *      0       could not be scheduled; have to start a new one
     *      1       ci was stored in tbl[]
     *
     * Outline:
     *  1. Reject if the table is full, or if ci is a jump target and the
     *     table is not empty; an empty table takes ci in slot 0.
     *  2. Walk backwards from tblmax until an entry conflicts with ci and
     *     derive imin, the first slot ci may occupy, from the delay count.
     *  3. Walk forwards from imin looking for a free slot that satisfies
     *     the pairing (Pentium) or decode-group (Pentium Pro) rules.
     *  4. Store ci, then fix up FXCH counts and [ESP] offsets.
     * On failure ci.c.IEV1.Vpointer is restored to its value on entry.
     */

    int insert(Cinfo *ci)
    {   code *c;
        int clocks;
        int i;
        int ic = 0;
        int imin;
        targ_size_t offset;         // assigned below but not read again in this function
        targ_size_t vpointer;       // saved IEV1.Vpointer, restored at Lnoinsert
        int movesp = 0;             // set when c is MOV reg2,offset[ESP]
        int reg2 = -1; // avoid "may be uninitialized" warning

        //printf("insert "); ci.c.print();
        //printf("insert() %d\n", fpustackused);
        c = ci.c;
        //printf("\tc.Iop %x\n",c.Iop);
        vpointer = c.IEV1.Vpointer;
        assert(cast(uint)tblmax <= TBLMAX);
        if (tblmax == TBLMAX) // if out of space
            goto Lnoinsert;
        if (tblmax == 0) // if table is empty
        {   // Just stuff it in the first slot
            i = tblmax;
            goto Linsert;
        }
        else if (c.Iflags & (CFtarg | CFtarg2))
            // Jump targets can only be first in the scheduler
            goto Lnoinsert;

        // Special case of:
        //      PUSH reg1
        //      MOV  reg2,x[ESP]
        if (c.Iop == 0x8B &&
            (c.Irm & modregrm(3,0,7)) == modregrm(1,0,4) &&
            c.Isib == modregrm(0,4,SP) &&
            c.IFL1 == FLconst &&
            (cast(byte)c.IEV1.Vpointer) >= REGSIZE
           )
        {
            movesp = 1;                 // this is a MOV reg2,offset[ESP]
            offset = cast(byte)c.IEV1.Vpointer;
            reg2 = (c.Irm >> 3) & 7;
        }


        // Start at tblmax, and back up until we get a conflict
        ic = -1;                        // -1: no conflicting entry found
        imin = 0;
        for (i = tblmax; i >= 0; i--)
        {
            Cinfo* cit = tbl[i];
            if (!cit)
                continue;

            // Look for special case swap
            // (moving the MOV above a PUSH of a different register: the
            // offset is adjusted by that PUSH's spadjust to compensate)
            if (movesp &&
                (cit.c.Iop & ~7) == 0x50 &&            // if PUSH reg1
                (cit.c.Iop & 7) != reg2 &&             // if reg1 != reg2
                (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
               )
            {
                c.IEV1.Vpointer += cit.spadjust;
                //printf("\t1, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
                continue;
            }

            // Same idea for stepping over a SUB ESP,offset
            if (movesp &&
                cit.c.Iop == 0x83 &&
                cit.c.Irm == modregrm(3,5,SP) &&       // if SUB ESP,offset
                cit.c.IFL2 == FLconst &&
                (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
               )
            {
                //printf("\t2, spadjust = %d\n",cit.spadjust);
                c.IEV1.Vpointer += cit.spadjust;
                continue;
            }

            clocks = conflict(cit,ci,1);
            if (clocks)
            {   int j;

                ic = i;                 // where the conflict occurred
                clocks &= 0xFF;         // convert to delay count

                // Move forward the delay clocks
                if (clocks == 0)
                    j = i + 1;
                else if (PRO)
                    // start of the next group of 3, plus 3 slots per clock
                    j = (((i + 3) / 3) * 3) + clocks * 3;
                else
                {   // next even (U pipe) slot, plus 2 slots per clock
                    j = ((i + 2) & ~1) + clocks * 2;

                    // It's possible we skipped over some AGI generating
                    // instructions due to movesp.
                    int k;
                    for (k = i + 1; k < j; k++)
                    {
                        if (k >= TBLMAX)
                            goto Lnoinsert;
                        if (tbl[k] && pair_agi(tbl[k],ci))
                        {
                            k = ((k + 2) & ~1) + 1;
                        }
                    }
                    j = k;
                }

                if (j >= TBLMAX)        // exceed table size?
                    goto Lnoinsert;
                imin = j;               // first possible slot c can go in
                break;
            }
        }


        // Scan forward looking for a hole to put it in
        for (i = imin; i < TBLMAX; i++)
        {
            if (tbl[i])
            {
                // In case, due to movesp, we skipped over some AGI instructions
                if (!PRO && pair_agi(tbl[i],ci))
                {
                    i = ((i + 2) & ~1) + 1;
                    if (i >= TBLMAX)
                        goto Lnoinsert;
                }
            }
            else
            {
                if (PRO)
                {   int i0 = (i / 3) * 3;       // index of decode unit 0
                    Cinfo *ci0;

                    assert(((TBLMAX / 3) * 3) == TBLMAX);
                    switch (i - i0)
                    {
                        case 0:                 // i0 can handle any instruction
                            goto Linsert;
                        case 1:
                            ci0 = tbl[i0];
                            if (ci.uops > 1)
                            {
                                // ci needs decode unit 0: if the single-uop
                                // occupant of i0 may be displaced, shift it
                                // into slot i via the code at L1
                                if (i0 >= imin && ci0.uops == 1)
                                    goto L1;
                                i++;            // also skip slot 2 of this group
                                break;
                            }
                            if (triple_test(ci0,ci,tbl[i0 + 2]))
                                goto Linsert;
                            break;
                        case 2:
                            ci0 = tbl[i0];
                            if (ci.uops > 1)
                            {
                                if (i0 >= imin && ci0.uops == 1)
                                {
                                    if (i >= tblmax)
                                    {   if (i + 1 >= TBLMAX)
                                            goto Lnoinsert;
                                        tblmax = i + 1;
                                    }
                                    // shift units 0 and 1 up by one so that
                                    // ci can take decode unit 0
                                    tbl[i0 + 2] = tbl[i0 + 1];
                                    tbl[i0 + 1] = ci0;
                                    i = i0;
                                    goto Linsert;
                                }
                                break;
                            }
                            if (triple_test(ci0,tbl[i0 + 1],ci))
                                goto Linsert;
                            break;
                        default:
                            assert(0);
                    }
                }
                else
                {
                    assert((TBLMAX & 1) == 0);
                    if (i & 1)                  // if V pipe
                    {
                        if (pair_test(tbl[i - 1],ci))
                        {
                            goto Linsert;
                        }
                        else if (i > imin && pair_test(ci,tbl[i - 1]))
                        {
                        // also the target of the Pentium Pro case 1 above:
                        // move the previous entry into slot i, take slot i - 1
                        L1:
                            tbl[i] = tbl[i - 1];
                            if (i >= tblmax)
                                tblmax = i + 1;
                            i--;
                            //printf("\tswapping with x%02x\n",tbl[i + 1].c.Iop);
                            goto Linsert;
                        }
                    }
                    else                // will always fit in U pipe
                    {
                        assert(!tbl[i + 1]);        // because V pipe should be empty
                        goto Linsert;
                    }
                }
            }
        }

    Lnoinsert:
        //printf("\tnoinsert\n");
        c.IEV1.Vpointer = vpointer;  // reset to original value
        return 0;

    Linsert:
        // Insert at location i
        assert(i < TBLMAX);
        assert(tblmax <= TBLMAX);
        tbl[i] = ci;
        //printf("\tinsert at location %d\n",i);

        // If it's a scheduled floating point code, we have to adjust
        // the FXCH values
        if (ci.fp_op)
        {
            ci.fxch_pre = 0;
            ci.fxch_post = 0;                  // start over again

            // NOTE(review): this scan stops at tblmax, which is only raised
            // to cover slot i further down, so when i >= tblmax ci's own
            // fpuadjust is not part of the sum - confirm this is intended.
            int fpu = fpustackused;
            for (int j = 0; j < tblmax; j++)
            {
                if (tbl[j])
                {
                    fpu += tbl[j].fpuadjust;
                    if (fpu >= 8)               // if FPU stack overflow
                    {   tbl[i] = null;
                        //printf("fpu stack overflow\n");
                        goto Lnoinsert;
                    }
                }
            }

            // Let conflict() (mode 2) recompute the FXCH counts against
            // every entry placed after slot i; its return value is ignored
            for (int j = tblmax; j > i; j--)
            {
                if (j < TBLMAX && tbl[j])
                    conflict(tbl[j],ci,2);
            }
        }

        if (movesp)
        {   // Adjust [ESP] offsets
            // (undo the spadjust of entries between the conflict point and
            // slot i, which now come before the MOV)

            //printf("\tic = %d, inserting at %d\n",ic,i);
            assert(cast(uint)tblmax <= TBLMAX);
            for (int j = ic + 1; j < i; j++)
            {
                Cinfo* cit = tbl[j];
                if (cit)
                {
                    c.IEV1.Vpointer -= cit.spadjust;
                    //printf("\t3, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
                }
            }
        }
        if (i >= tblmax)
            tblmax = i + 1;

        // Now do a hack. Look back at immediately preceding instructions,
        // and see if we can swap with a push.
        // (disabled: the leading 0 makes this condition always false)
        if (0 && movesp)
        {
            while (1)
            {
                int j;
                for (j = 1; i > j; j++)
                    if (tbl[i - j])
                        break;

                if (i >= j && tbl[i - j] &&
                    (tbl[i - j].c.Iop & ~7) == 0x50 &&       // if PUSH reg1
                    (tbl[i - j].c.Iop & 7) != reg2 &&  // if reg1 != reg2
                    cast(byte)c.IEV1.Vpointer >= REGSIZE)
                {
                    //printf("\t-4 prec, i-j=%d, i=%d\n",i-j,i);
                    assert(cast(uint)i < TBLMAX);
                    assert(cast(uint)(i - j) < TBLMAX);
                    tbl[i] = tbl[i - j];
                    tbl[i - j] = ci;
                    i -= j;
                    c.IEV1.Vpointer -= REGSIZE;
                }
                else
                    break;
            }
        }

        //printf("\tinsert\n");
        return 1;
    }

    /******************************
     * Insert c into staging area.
     * A Cinfo slot is taken from cinfo[] and filled in by getinfo().
     * Jump targets and CFvolatile/CFvex instructions flush the whole
     * staging list into the table and are then inserted directly.
     * Otherwise every staged instruction that has an AGI or a conflict
     * with c is moved into the table first, and c is either inserted
     * straight away (CIFL.nostage, provided no AGI was flushed) or
     * appended to the staging list.
     * Params:
     *  c = instruction to stage
     * Returns:
     *  false if could not be scheduled; have to start a new one
     */

    @trusted
    bool stage(code *c)
    {
        //printf("stage: "); c.print();
        if (cinfomax == TBLMAX)         // if out of space
            return false;
        auto ci = &cinfo[cinfomax++];
        getinfo(ci,c);

        if (c.Iflags & (CFtarg | CFtarg2 | CFvolatile | CFvex))
        {
            // Insert anything in stagelist
            foreach (ref cs; stagelist[])
            {
                if (cs)
                {
                    if (!insert(cs))
                        return false;
                    cs = null;          // entry now lives in tbl[]
                }
            }
            return insert(ci) != 0;
        }

        // Look through stagelist, and insert any AGI conflicting instructions
        bool agi = false;
        foreach (ref cs; stagelist[])
        {
            if (cs)
            {
                if (pair_agi(cs,ci))
                {
                    if (!insert(cs))
                        goto Lnostage;
                    cs = null;
                    agi = true;                    // we put out an AGI
                }
            }
        }

        // Look through stagelist, and insert any other conflicting instructions
        foreach (i, ref cs; stagelist[])
        {
            if (!cs)
                continue;
            if (conflict(cs,ci,0) &&            // if conflict
                !(cs.flags & ci.flags & CIFL.push))   // unless both are swappable pushes
            {
                if (cs.spadjust)
                {
                    // We need to insert all previous adjustments to ESP
                    foreach (ref ca; stagelist[0 .. i])
                    {
                        if (ca && ca.spadjust)
                        {
                            if (!insert(ca))
                                goto Lnostage;
                            ca = null;
                        }
                    }
                }

                if (!insert(cs))
                    goto Lnostage;
                cs = null;
            }
        }

        // If floating point opcode, don't stage it, send it right out
        if (!agi && ci.flags & CIFL.nostage)
        {
            if (!insert(ci))
                goto Lnostage;
            return true;
        }

        stagelist.push(ci);         // append to staging list
        return true;

    Lnostage:
        return false;
    }

}



/********************************************
 * Snip off tail of instruction sequence.
 * The list is cut in front of the first following instruction that is a
 * jump target (CFtarg/CFtarg2), or that is neither a NOP, nor an
 * (ESCAPE | ESClinnum) entry, nor flagged CFclassinit like c itself.
 * Everything before the cut stays attached to c.
 * Returns:
 *      next instruction (the tail) or
 *      null for no more instructions
 */

private code * csnip(code *c)
{
    if (c)
    {
        // non-zero only if c itself carries CFclassinit
        uint iflags = c.Iflags & CFclassinit;
        code **pc;
        while (1)
        {
            pc = &c.next;
            c = *pc;
            if (!c)
                break;
            if (c.Iflags & (CFtarg | CFtarg2))
                break;
            if (!(c.Iop == NOP ||
                  c.Iop == (ESCAPE | ESClinnum) ||
                  c.Iflags & iflags))
                break;
        }
        *pc = null;                 // terminate the head part of the list
    }
    return c;
}


/******************************
 * Schedule Pentium instructions,
 * based on Steve Russell's algorithm.
*
 * The list is consumed front to back. NOPs, ESCAPE pseudo-ops other than
 * (ESCAPE | ESCadjfpu) and CFclassinit instructions that are not jump
 * targets are passed straight through. Everything else is fed to a
 * Schedule one instruction at a time until stage() refuses one, at which
 * point the table is reassembled onto the result and a fresh table is
 * started, carrying over the FPU stack depth.
 * Params:
 *      c = list of instructions to schedule
 *      scratch = not referenced by this function
 * Returns:
 *      head of the rescheduled instruction list
 */

@trusted
private code *schedule(code *c,regm_t scratch)
{
    code *head = null;                  // first instruction of the result
    code **tail = &head;                // where the next instruction is linked
    Schedule sch = void;

    sch.initialize(0);                  // initialize scheduling table
    while (c)
    {
        const bool isTarget = (c.Iflags & (CFtarg | CFtarg2)) != 0;
        const bool unscheduled =
            c.Iop == NOP ||
            ((c.Iop & ESCAPEmask) == ESCAPE && c.Iop != (ESCAPE | ESCadjfpu)) ||
            (c.Iflags & CFclassinit);

        if (unscheduled && !isTarget)
        {
            // Just move this instruction onto the result and go to the next one
            code *following = code_next(c);
            *tail = c;
            c.next = null;
            tail = &c.next;
            c = following;
            continue;
        }

        //printf("init\n");
        sch.initialize(sch.fpustackused);       // initialize scheduling table

        // Stage instructions until one is refused (or the list runs out);
        // csnip() detaches each staged instruction from the remainder
        while (c && sch.stage(c))
        {
            //printf("insert %p\n",c);
            c = csnip(c);
        }

        //printf("assem %d\n",sch.tblmax);
        tail = sch.assemble(tail);      // reassemble instruction stream
    }
    sch.dtor();

    return head;
}

/**************************************************************************/

/********************************************
 * Replace any occurrence of r1 in EA with r2.
* Nothing is changed when the operand is register r1 itself (mod 3),
 * when the address is not a 32 bit one, or when it is a plain disp32.
 * Params:
 *      c = instruction whose Irm / Isib bytes are rewritten in place
 *      r1 = register number to look for
 *      r2 = register number to put in its place
 */

private void repEA(code *c,uint r1,uint r2)
{
    const uint modrm = c.Irm;
    const uint mod = modrm & 0xC0;
    const uint reg = modrm & modregrm(0,7,0);
    const uint rm  = modrm & 7;

    if (mod == 0xC0 && rm == r1)
        return;                         //c.Irm = mod | reg | r2;

    if (!is32bitaddr(I32,c.Iflags))
        return;

    // If disp32, there is no register to replace
    if ((modrm & modregrm(3,0,7)) == modregrm(0,0,5))
        return;

    if (rm == 4)
    {
        // SIB byte addressing
        const uint sib = c.Isib;
        uint base = sib & 7;
        uint index = (sib >> 3) & 7;

        // With mod 0 a base field of 5 means "no base register", so the
        // base is left alone if either register is 5 in that mode
        const bool keepBase = mod == 0 && (r1 == 5 || r2 == 5);
        if (base == r1 && !keepBase)
            base = r2;
        if (index == r1)
            index = r2;

        c.Isib = cast(ubyte)((sib & 0xC0) | (index << 3) | base);
        return;
    }

    if (rm != r1)
        return;

    if (r1 == BP && r2 == SP)
    {
        // Replace [EBP] with [ESP]
        c.Irm = cast(ubyte)(mod | reg | 4);
        c.Isib = modregrm(0,4,SP);
    }
    else if (r2 == BP && mod == 0)
    {
        // Switch to the mod 1 form with a constant 0 offset
        c.Irm = cast(ubyte)(modregrm(1,0,0) | reg | r2);
        c.IFL1 = FLconst;
        c.IEV1.Vint = 0;
    }
    else
        c.Irm = cast(ubyte)(mod | reg | r2);
}

/******************************************
 * Instruction scheduler.
 * Input:
 *      c               list of instructions to schedule
 *      scratch         scratch registers we can use
 * Returns:
 *      revised list of scheduled instructions
 */

/******************************************
 * Swap c1 and c2.
 * c1 comes before c2.
2941 * Swap in place to not disturb addresses of jmp targets 2942 */ 2943 2944 private void code_swap(code *c1,code *c2) 2945 { code cs; 2946 2947 // Special case of: 2948 // PUSH reg1 2949 // MOV reg2,x[ESP] 2950 //printf("code_swap(%x, %x)\n",c1,c2); 2951 if ((c1.Iop & ~7) == 0x50 && 2952 c2.Iop == 0x8B && 2953 (c2.Irm & modregrm(3,0,7)) == modregrm(1,0,4) && 2954 c2.Isib == modregrm(0,4,SP) && 2955 c2.IFL1 == FLconst && 2956 (cast(byte)c2.IEV1.Vpointer) >= REGSIZE && 2957 (c1.Iop & 7) != ((c2.Irm >> 3) & 7) 2958 ) 2959 c2.IEV1.Vpointer -= REGSIZE; 2960 2961 2962 cs = *c2; 2963 *c2 = *c1; 2964 *c1 = cs; 2965 // Retain original CFtarg 2966 c1.Iflags = (c1.Iflags & ~(CFtarg | CFtarg2)) | (c2.Iflags & (CFtarg | CFtarg2)); 2967 c2.Iflags = (c2.Iflags & ~(CFtarg | CFtarg2)) | (cs.Iflags & (CFtarg | CFtarg2)); 2968 2969 c1.next = c2.next; 2970 c2.next = cs.next; 2971 } 2972 2973 private code *peephole(code *cstart,regm_t scratch) 2974 { 2975 // Look for cases of: 2976 // MOV r1,r2 2977 // OP ?,r1 2978 // we can replace with: 2979 // MOV r1,r2 2980 // OP ?,r2 2981 // to improve pairing 2982 code *c1; 2983 uint r1,r2; 2984 uint mod,reg,rm; 2985 2986 //printf("peephole\n"); 2987 for (code *c = cstart; c; c = c1) 2988 { 2989 ubyte rmn; 2990 2991 //c.print(); 2992 c1 = cnext(c); 2993 Ln: 2994 if (!c1) 2995 break; 2996 if (c1.Iflags & (CFtarg | CFtarg2)) 2997 continue; 2998 2999 // Do: 3000 // PUSH reg 3001 if (I32 && (c.Iop & ~7) == 0x50) 3002 { 3003 uint regx = c.Iop & 7; 3004 3005 // MOV [ESP],regx => NOP 3006 if (c1.Iop == 0x8B && 3007 c1.Irm == modregrm(0,regx,4) && 3008 c1.Isib == modregrm(0,4,SP)) 3009 { c1.Iop = NOP; 3010 continue; 3011 } 3012 3013 // PUSH [ESP] => PUSH regx 3014 if (c1.Iop == 0xFF && 3015 c1.Irm == modregrm(0,6,4) && 3016 c1.Isib == modregrm(0,4,SP)) 3017 { c1.Iop = 0x50 + regx; 3018 continue; 3019 } 3020 3021 // CMP [ESP],imm => CMP regx,i,, 3022 if (c1.Iop == 0x83 && 3023 c1.Irm == modregrm(0,7,4) && 3024 c1.Isib == modregrm(0,4,SP)) 3025 { c1.Irm 
= modregrm(3,7,regx); 3026 if (c1.IFL2 == FLconst && cast(byte)c1.IEV2.Vuns == 0) 3027 { // to TEST regx,regx 3028 c1.Iop = (c1.Iop & 1) | 0x84; 3029 c1.Irm = modregrm(3,regx,regx); 3030 } 3031 continue; 3032 } 3033 3034 } 3035 3036 // Do: 3037 // MOV reg,[ESP] => PUSH reg 3038 // ADD ESP,4 => NOP 3039 if (I32 && c.Iop == 0x8B && (c.Irm & 0xC7) == modregrm(0,0,4) && 3040 c.Isib == modregrm(0,4,SP) && 3041 c1.Iop == 0x83 && (c1.Irm & 0xC7) == modregrm(3,0,SP) && 3042 !(c1.Iflags & CFpsw) && c1.IFL2 == FLconst && c1.IEV2.Vint == 4) 3043 { 3044 uint regx = (c.Irm >> 3) & 7; 3045 c.Iop = 0x58 + regx; 3046 c1.Iop = NOP; 3047 continue; 3048 } 3049 3050 // Combine two SUBs of the same register 3051 if (c.Iop == c1.Iop && 3052 c.Iop == 0x83 && 3053 (c.Irm & 0xC0) == 0xC0 && 3054 (c.Irm & modregrm(3,0,7)) == (c1.Irm & modregrm(3,0,7)) && 3055 !(c1.Iflags & CFpsw) && 3056 c.IFL2 == FLconst && c1.IFL2 == FLconst 3057 ) 3058 { int i = cast(byte)c.IEV2.Vint; 3059 int i1 = cast(byte)c1.IEV2.Vint; 3060 switch ((c.Irm & modregrm(0,7,0)) | ((c1.Irm & modregrm(0,7,0)) >> 3)) 3061 { 3062 case (0 << 3) | 0: // ADD, ADD 3063 case (5 << 3) | 5: // SUB, SUB 3064 i += i1; 3065 goto Laa; 3066 case (0 << 3) | 5: // ADD, SUB 3067 case (5 << 3) | 0: // SUB, ADD 3068 i -= i1; 3069 goto Laa; 3070 Laa: 3071 if (cast(byte)i != i) 3072 c.Iop &= ~2; 3073 c.IEV2.Vint = i; 3074 c1.Iop = NOP; 3075 if (i == 0) 3076 c.Iop = NOP; 3077 continue; 3078 3079 default: 3080 break; 3081 } 3082 } 3083 3084 if (c.Iop == 0x8B && (c.Irm & 0xC0) == 0xC0) // MOV r1,r2 3085 { r1 = (c.Irm >> 3) & 7; 3086 r2 = c.Irm & 7; 3087 } 3088 else if (c.Iop == 0x89 && (c.Irm & 0xC0) == 0xC0) // MOV r1,r2 3089 { r1 = c.Irm & 7; 3090 r2 = (c.Irm >> 3) & 7; 3091 } 3092 else 3093 { 3094 continue; 3095 } 3096 3097 rmn = c1.Irm; 3098 mod = rmn & 0xC0; 3099 reg = rmn & modregrm(0,7,0); 3100 rm = rmn & 7; 3101 if (cod3_EA(c1)) 3102 repEA(c1,r1,r2); 3103 switch (c1.Iop) 3104 { 3105 case 0x50: 3106 case 0x51: 3107 case 0x52: 3108 case 
0x53: 3109 case 0x54: 3110 case 0x55: 3111 case 0x56: 3112 case 0x57: // PUSH reg 3113 if ((c1.Iop & 7) == r1) 3114 { c1.Iop = 0x50 | r2; 3115 //printf("schedule PUSH reg\n"); 3116 } 3117 break; 3118 3119 case 0x81: 3120 case 0x83: 3121 // Look for CMP EA,imm 3122 if (reg == modregrm(0,7,0)) 3123 { 3124 if (mod == 0xC0 && rm == r1) 3125 c1.Irm = cast(ubyte)(mod | reg | r2); 3126 } 3127 break; 3128 3129 case 0x84: // TEST reg,byte ptr EA 3130 if (r1 >= 4 || r2 >= 4) // if not a byte register 3131 break; 3132 if ((rmn & 0xC0) == 0xC0) 3133 { 3134 if ((rmn & 3) == r1) 3135 { c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,4)) | r2); 3136 //printf("schedule 1\n"); 3137 } 3138 } 3139 if ((rmn & modregrm(0,3,0)) == modregrm(0,r1,0)) 3140 { c1.Irm = (rmn & modregrm(3,4,7)) | modregrm(0,r2,0); 3141 //printf("schedule 2\n"); 3142 } 3143 break; 3144 case 0x85: // TEST reg,word ptr EA 3145 if ((rmn & 0xC0) == 0xC0) 3146 { 3147 if ((rmn & 7) == r1) 3148 { c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,0)) | r2); 3149 //printf("schedule 3\n"); 3150 } 3151 } 3152 if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0)) 3153 { c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0); 3154 //printf("schedule 4\n"); 3155 } 3156 break; 3157 3158 case 0x89: // MOV EA,reg 3159 if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0)) 3160 { c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0); 3161 //printf("schedule 5\n"); 3162 if (c1.Irm == modregrm(3,r2,r2)) 3163 goto Lnop; 3164 } 3165 break; 3166 3167 case 0x8B: // MOV reg,EA 3168 if ((rmn & 0xC0) == 0xC0 && 3169 (rmn & 7) == r1) // if EA == r1 3170 { c1.Irm = cast(ubyte)((rmn & modregrm(3,7,0)) | r2); 3171 //printf("schedule 6\n"); 3172 if (c1.Irm == modregrm(3,r2,r2)) 3173 goto Lnop; 3174 } 3175 break; 3176 3177 case 0x3C: // CMP AL,imm8 3178 if (r1 == AX && r2 < 4) 3179 { c1.Iop = 0x80; 3180 c1.Irm = modregrm(3,7,r2); 3181 //printf("schedule 7, r2 = %d\n", r2); 3182 } 3183 break; 3184 3185 case 0x3D: // CMP AX,imm16 3186 if (r1 == AX) 3187 { c1.Iop = 
0x81; 3188 c1.Irm = modregrm(3,7,r2); 3189 if (c1.IFL2 == FLconst && 3190 c1.IEV2.Vuns == cast(byte)c1.IEV2.Vuns) 3191 c1.Iop = 0x83; 3192 //printf("schedule 8\n"); 3193 } 3194 break; 3195 3196 default: 3197 break; 3198 } 3199 continue; 3200 Lnop: 3201 c1.Iop = NOP; 3202 c1 = cnext(c1); 3203 goto Ln; 3204 } 3205 return cstart; 3206 } 3207 3208 /*****************************************************************/ 3209 3210 /********************************************** 3211 * Replace complex instructions with simple ones more conducive 3212 * to scheduling. 3213 */ 3214 3215 @trusted 3216 code *simpleops(code *c,regm_t scratch) 3217 { code *cstart; 3218 uint reg; 3219 code *c2; 3220 3221 // Worry about using registers not saved yet by prolog 3222 scratch &= ~fregsaved; 3223 3224 if (!(scratch & (scratch - 1))) // if 0 or 1 registers 3225 return c; 3226 3227 reg = findreg(scratch); 3228 3229 cstart = c; 3230 for (code** pc = &cstart; *pc; pc = &(*pc).next) 3231 { 3232 c = *pc; 3233 if (c.Iflags & (CFtarg | CFtarg2 | CFopsize)) 3234 continue; 3235 if (c.Iop == 0x83 && 3236 (c.Irm & modregrm(0,7,0)) == modregrm(0,7,0) && 3237 (c.Irm & modregrm(3,0,0)) != modregrm(3,0,0) 3238 ) 3239 { // Replace CMP mem,imm with: 3240 // MOV reg,mem 3241 // CMP reg,imm 3242 targ_long imm; 3243 3244 //printf("replacing CMP\n"); 3245 c.Iop = 0x8B; 3246 c.Irm = (c.Irm & modregrm(3,0,7)) | modregrm(0,reg,0); 3247 3248 c2 = code_calloc(); 3249 if (reg == AX) 3250 c2.Iop = 0x3D; 3251 else 3252 { c2.Iop = 0x83; 3253 c2.Irm = modregrm(3,7,reg); 3254 } 3255 c2.IFL2 = c.IFL2; 3256 c2.IEV2 = c.IEV2; 3257 3258 // See if c2 should be replaced by a TEST 3259 imm = c2.IEV2.Vuns; 3260 if (!(c2.Iop & 1)) 3261 imm &= 0xFF; 3262 else if (I32 ? 
c.Iflags & CFopsize : !(c.Iflags & CFopsize)) 3263 imm = cast(short) imm; 3264 if (imm == 0) 3265 { 3266 c2.Iop = 0x85; // TEST reg,reg 3267 c2.Irm = modregrm(3,reg,reg); 3268 } 3269 goto L1; 3270 } 3271 else if (c.Iop == 0xFF && 3272 (c.Irm & modregrm(0,7,0)) == modregrm(0,6,0) && 3273 (c.Irm & modregrm(3,0,0)) != modregrm(3,0,0) 3274 ) 3275 { // Replace PUSH mem with: 3276 // MOV reg,mem 3277 // PUSH reg 3278 3279 // printf("replacing PUSH\n"); 3280 c.Iop = 0x8B; 3281 c.Irm = (c.Irm & modregrm(3,0,7)) | modregrm(0,reg,0); 3282 3283 c2 = gen1(null,0x50 + reg); 3284 L1: 3285 //c.print(); 3286 //c2.print(); 3287 c2.next = c.next; 3288 c.next = c2; 3289 3290 // Switch to another reg 3291 if (scratch & ~mask(reg)) 3292 reg = findreg(scratch & ~mask(reg)); 3293 } 3294 } 3295 return cstart; 3296 } 3297 3298 }