1 /**
2  * Instruction scheduler
3  *
4  * Compiler implementation of the
5  * $(LINK2 https://www.dlang.org, D programming language).
6  *
7  * Copyright:   Copyright (C) 1995-1998 by Symantec
8  *              Copyright (C) 2000-2023 by The D Language Foundation, All Rights Reserved
9  * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
10  * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
11  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgsched.c, backend/cgsched.d)
12  */
13 
14 module dmd.backend.cgsched;
15 
16 version (SCPP)
17     version = COMPILE;
18 version (MARS)
19     version = COMPILE;
20 
21 version (COMPILE)
22 {
23 
24 import core.stdc.stdio;
25 import core.stdc.stdlib;
26 import core.stdc.string;
27 
28 import dmd.backend.cc;
29 import dmd.backend.cdef;
30 import dmd.backend.code;
31 import dmd.backend.code_x86;
32 import dmd.backend.dlist;
33 import dmd.backend.global;
34 import dmd.backend.mem;
35 import dmd.backend.ty;
36 import dmd.backend.barray;
37 
38 extern (C++):
39 
40 nothrow:
41 @safe:
42 
// Prototypes only: none of these three functions has a body in this part of
// the file, so their behavior has to be looked up at their definitions.
int REGSIZE();
code *gen1(code *c, uint op);
code *gen2(code *c, uint op, uint rm);
46 
/// Returns: a mask in which only bit number `m` is set (1 shifted left by `m`).
private uint mask(uint m)
{
    enum uint lowBit = 1;
    return lowBit << m;
}
48 
/**
 * Decide whether an instruction uses 32 bit addressing.
 *
 * The result is always true when I64 is set. Otherwise it is `x` flipped
 * whenever the CFaddrsize bit is set in `Iflags`.
 *
 * Note: even for linux targets, CFaddrsize can be set by the inline
 * assembler.
 *
 * Params:
 *      x = base answer before the CFaddrsize bit is applied (the historical
 *          comment on this function ties it to I32 - TODO confirm at call sites)
 *      Iflags = instruction flags; only the CFaddrsize bit is examined
 * Returns:
 *      true if I64 is set or if `x` differs from the state of CFaddrsize
 */
private bool is32bitaddr(bool x, uint Iflags)
{
    if (I64)
        return true;

    const bool addrSizeToggled = (Iflags & CFaddrsize) != 0;
    return x != addrSizeToggled;
}
56 
/// Returns: true if the Pentium Pro scheduler is to be used, i.e. when
/// config.target_cpu compares >= TARGET_PentiumPro.
@trusted
private bool PRO()
{
    const cpu = config.target_cpu;
    return cpu >= TARGET_PentiumPro;
}
60 
/// Kind of floating point operation recorded in Cinfo.fp_op.
/// The members start at 1; Cinfo.print() treats a zero fp_op as "nothing to
/// report" and uses `fp_op - 1` to index its table of names.
private enum FP : ubyte
{
    fstp = 1,       /// FSTP mem
    fld  = 2,       /// FLD mem
    fop  = 3,       /// Fop ST0,mem or Fop ST0
}
67 
/// Bit flags stored in Cinfo.flags. Each member is a distinct bit so that
/// several of them can be set at once; Cinfo.print() reports any other bit as
/// "bad flag".
private enum CIFL : ubyte
{
    arraybounds = 1,     /// this instruction is a jmp to array bounds
    ea          = 2,     /// this instruction has a memory-referencing
                             /// modregrm EA byte
    nostage     = 4,     /// don't stage these instructions
    push        = 8,     /// it's a push we can swap around
}
76 
/// Everything the scheduler records about a single instruction
struct Cinfo
{
    code *c;            /// the instruction
    ubyte pair;         /// pairing information
    ubyte sz;           /// operand size
    ubyte isz;          /// instruction size

    // Floating point scheduling
    ubyte fxch_pre;
    ubyte fxch_post;
    FP fp_op;           /// FP.xxx, 0 if none

    ubyte flags;        /// CIFL.xxx bits

    uint r;             /// read mask
    uint w;             /// write mask
    uint a;             /// registers used in addressing mode
    ubyte reg;          /// reg field of modregrm byte
    ubyte uops;         /// Pentium Pro micro-ops
    uint sibmodrm;      /// (sib << 8) + mod__rm byte
    uint spadjust;      /// if !=0, then amount ESP changes as a result of this
                        /// instruction being executed
    int fpuadjust;      /// if !=0, then amount FPU stack changes as a result
                        /// of this instruction being executed

    /**
     * Debugging aid: print the contents of this record with printf.
     * When invoked through a null pointer only "Cinfo 0" is printed.
     */
    @trusted
    nothrow void print()
    {
        Cinfo *self = &this;

        if (self is null)
        {
            printf("Cinfo 0\n");
            return;
        }

        // Header line, followed by the names of the flags that are set
        printf("Cinfo %p:  c %p, pair %x, sz %d, isz %d, flags - ",
               self,c,pair,sz,isz);

        enum recognized = CIFL.arraybounds | CIFL.ea | CIFL.nostage | CIFL.push;

        if (flags & CIFL.arraybounds)
            printf("arraybounds,");
        if (flags & CIFL.ea)
            printf("ea,");
        if (flags & CIFL.nostage)
            printf("nostage,");
        if (flags & CIFL.push)
            printf("push,");
        if (flags & ~recognized)        // any bit that is not a CIFL member
            printf("bad flag,");

        printf("\n\tr %x w %x a %x reg %x uops %x sibmodrm %x spadjust %d\n",
                cast(int)r,cast(int)w,cast(int)a,reg,uops,sibmodrm,cast(int)spadjust);

        // The floating point line is only printed for floating point operations
        if (!fp_op)
            return;

        __gshared const(char*)[3] fpops = ["fstp","fld","fop"];
        const(char)* fpname = fpops[fp_op-1];   // FP members start at 1

        printf("\tfp_op %s, fxch_pre %x, fxch_post %x\n",
                fpname,fxch_pre,fxch_post);
    }

}
138 
139 
/*****************************************
 * Do Pentium optimizations.
 *
 * Nothing is done unless config.target_scheduler is at least TARGET_80486.
 * peephole() is applied when not I64; for I32 code, simpleops() is applied
 * for the Pentium and Pentium MMX target CPUs, and then schedule() is run.
 *
 * Params:
 *      pc = address of the code pointer to optimize; it is overwritten with
 *           the result of each pass
 *      scratch = scratch registers we can use
 */

@trusted
private void cgsched_pentium(code **pc,regm_t scratch)
{
    //printf("scratch = x%02x\n",scratch);
    if (config.target_scheduler < TARGET_80486)
        return;

    if (!I64)
        *pc = peephole(*pc,0);

    if (!I32)                           // forget about 16 bit code
        return;

    const cpu = config.target_cpu;
    if (cpu == TARGET_Pentium || cpu == TARGET_PentiumMMX)
        *pc = simpleops(*pc,scratch);

    *pc = schedule(*pc,0);
}
163 
/************************************
 * Entry point
 *
 * Schedules the code of block `b` when optimizing for speed (CFG4speed),
 * the target CPU is at least TARGET_Pentium, and the block is not a BCasm
 * block. The scratch registers handed to the scheduler are allregs minus
 * the registers recorded in b.Bregcon (used, params, immed, cse) and
 * mfuncreg.
 *
 * Params:
 *      b = block whose Bcode is rescheduled in place
 */
@trusted
public void cgsched_block(block* b)
{
    if (!(config.flags4 & CFG4speed))
        return;
    if (config.target_cpu < TARGET_Pentium)
        return;
    if (b.BC == BCasm)
        return;

    // Registers that must not be handed out as scratch
    const occupied = b.Bregcon.used | b.Bregcon.params | mfuncreg;
    const tracked  = b.Bregcon.immed.mval | b.Bregcon.cse.mval;

    regm_t scratch = allregs;
    scratch &= ~occupied;
    scratch &= ~tracked;

    cgsched_pentium(&b.Bcode,scratch);
    //printf("after schedule:\n"); WRcodlst(b.Bcode);
}
182 
/// Pairing attributes, used as the entries of the `pentcycl` table.
/// NP/PU/PV/UV say which pipe(s) an instruction is pairable in; PE, PF and FX
/// are extra bits that are OR'ed in (e.g. `PE|UV`, `PV|PF`).
enum
{
    NP    = 0,       /// not pairable
    PU    = 1,       /// pairable in U only, never executed in V
    PV    = 2,       /// pairable in V only
    UV    = (PU|PV), /// pairable in both U and V
    PE    = 4,       /// register contention exception
    PF    = 8,       /// flags contention exception
    FX    = 0x10,    /// pairable with FXCH instruction
}
193 
/**
 * Pairing attributes for each one-byte opcode, expressed as a combination of
 * the NP/PU/PV/UV/PE/PF/FX values. There is one entry per opcode value
 * 0x00..0xFF; the comment at the end of each row is the (hex) opcode of the
 * row's first entry.
 */
extern (D) private immutable ubyte[256] pentcycl =
[
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 0
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 8
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 10
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 18
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 20
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 28
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 30
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 38

        UV,UV,UV,UV,    UV,UV,UV,UV,    // 40
        UV,UV,UV,UV,    UV,UV,UV,UV,    // 48
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 50  PUSH reg
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 58  POP reg
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 60
        PE|UV,NP,PE|UV,NP,      NP,NP,NP,NP,    // 68
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 70   Jcc rel8
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 78   Jcc rel8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // 80
        UV,UV,UV,UV,    NP,UV,NP,NP,    // 88
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 90
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 98
        UV,UV,UV,UV,    NP,NP,NP,NP,    // A0
        UV,UV,NP,NP,    NP,NP,NP,NP,    // A8
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B0
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // C0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // C8
        PU,PU,NP,NP,    NP,NP,NP,NP,    // D0
        FX,NP,FX,FX,    NP,NP,FX,NP,    // D8   all floating point
        NP,NP,NP,NP,    NP,NP,NP,NP,    // E0
        PE|PV,PV,NP,PV, NP,NP,NP,NP,    // E8
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F8
];
232 
233 /********************************************
234  * For each opcode, determine read [0] and written [1] masks.
235  */
236 
/// Extra bits used in the read/write masks of the opcode tables (oprw, grprw,
/// grpf1). In those tables they are OR'ed together with register masks such as
/// mAX or mSP, e.g. `F|mAX`.
/// NOTE(review): the values start at 0x100000, presumably to stay clear of the
/// register mask bits - confirm against the mXX definitions.
enum
{
    EA    = 0x100000,
    R     = 0x200000,       /// register (reg of modregrm field)
    N     = 0x400000,       /// other things modified, not swappable
    B     = 0x800000,       /// it's a byte operation
    C     = 0x1000000,      /// floating point flags
    mMEM  = 0x2000000,      /// memory
    S     = 0x4000000,      /// floating point stack
    F     = 0x8000000,      /// flags
}
248 
/**
 * Read/write masks for each one-byte opcode, indexed by opcode 0x00..0xFF.
 *      [op][0] = read
 *      [op][1] = written
 * Each mask is an OR of register masks (mAX, mCX, mSP, ...) and the
 * EA/R/N/B/C/mMEM/S/F bits. The `// xx` comments inside the table give the
 * (hex) opcode of the entry that follows.
 */
extern (D) private immutable uint[2][256] oprw =
[
      // 00
      [ EA|R|B, F|EA|B ],       // ADD
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // PUSH ES
      [ N,      N      ],       // POP  ES

      // 08
      [ EA|R|B, F|EA|B ],       // OR
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // PUSH CS
      [ N,      N      ],       // 2 byte escape

      // 10
      [ F|EA|R|B,F|EA|B ],      // ADC
      [ F|EA|R, F|EA    ],
      [ F|EA|R|B,F|R|B  ],
      [ F|EA|R, F|R     ],
      [ F|mAX,  F|mAX   ],
      [ F|mAX,  F|mAX   ],
      [ N,      N       ],      // PUSH SS
      [ N,      N       ],      // POP  SS

      // 18
      [ F|EA|R|B,F|EA|B ],      // SBB
      [ F|EA|R, F|EA    ],
      [ F|EA|R|B,F|R|B  ],
      [ F|EA|R, F|R     ],
      [ F|mAX,  F|mAX   ],
      [ F|mAX,  F|mAX   ],
      [ N,      N       ],      // PUSH DS
      [ N,      N       ],      // POP  DS

      // 20
      [ EA|R|B, F|EA|B ],       // AND
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG ES
      [ F|mAX,  F|mAX  ],       // DAA

      // 28
      [ EA|R|B, F|EA|B ],       // SUB
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG CS
      [ F|mAX,  F|mAX  ],       // DAS

      // 30
      [ EA|R|B, F|EA|B ],       // XOR
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG SS
      [ F|mAX,  F|mAX  ],       // AAA

      // 38
      [ EA|R|B, F ],            // CMP
      [ EA|R,   F ],
      [ EA|R|B, F ],
      [ EA|R,   F ],
      [ mAX,    F ],            // CMP AL,imm8
      [ mAX,    F ],            // CMP EAX,imm16/32
      [ N,      N ],            // SEG DS
      [ N,      N ],            // AAS

      // 40
      [ mAX,    F|mAX ],        // INC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 48
      [ mAX,    F|mAX ],        // DEC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 50
      [ mAX|mSP,        mSP|mMEM ],             // PUSH EAX
      [ mCX|mSP,        mSP|mMEM ],
      [ mDX|mSP,        mSP|mMEM ],
      [ mBX|mSP,        mSP|mMEM ],
      [ mSP|mSP,        mSP|mMEM ],
      [ mBP|mSP,        mSP|mMEM ],
      [ mSI|mSP,        mSP|mMEM ],
      [ mDI|mSP,        mSP|mMEM ],

      // 58
      [ mSP|mMEM,       mAX|mSP ],              // POP EAX
      [ mSP|mMEM,       mCX|mSP ],
      [ mSP|mMEM,       mDX|mSP ],
      [ mSP|mMEM,       mBX|mSP ],
      [ mSP|mMEM,       mSP|mSP ],
      [ mSP|mMEM,       mBP|mSP ],
      [ mSP|mMEM,       mSI|mSP ],
      [ mSP|mMEM,       mDI|mSP ],

      // 60
      [ N,      N ],            // PUSHA
      [ N,      N ],            // POPA
      [ N,      N ],            // BOUND Gv,Ma
      [ N,      N ],            // ARPL  Ew,Rw
      [ N,      N ],            // SEG FS
      [ N,      N ],            // SEG GS
      [ N,      N ],            // operand size prefix
      [ N,      N ],            // address size prefix

      // 68
      [ mSP,    mSP|mMEM ],     // PUSH immed16/32
      [ EA,     F|R      ],     // IMUL Gv,Ev,lv
      [ mSP,    mSP|mMEM ],     // PUSH immed8
      [ EA,     F|R      ],     // IMUL Gv,Ev,lb
      [ N,      N        ],     // INSB Yb,DX
      [ N,      N        ],     // INSW/D Yv,DX
      [ N,      N        ],     // OUTSB DX,Xb
      [ N,      N        ],     // OUTSW/D DX,Xv

      // 70
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 78
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 80
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ EA|R,   F    ],         // TEST EA,r8
      [ EA|R,   F    ],         // TEST EA,r16/32
      [ EA|R,   EA|R ],         // XCHG EA,r8
      [ EA|R,   EA|R ],         // XCHG EA,r16/32

      // 88
      [ R|B,    EA|B ],         // MOV EA8,r8
      [ R,      EA ],           // MOV EA,r16/32
      [ EA|B,   R|B ],          // MOV r8,EA8
      [ EA,     R ],            // MOV r16/32,EA
      [ N,      N ],            // MOV EA,segreg
      [ EA,     R ],            // LEA r16/32,EA
      [ N,      N ],            // MOV segreg,EA
      [ mSP|mMEM, EA|mSP ],     // POP mem16/32

      // 90
      [ 0,              0       ],      // NOP
      [ mAX|mCX,        mAX|mCX ],
      [ mAX|mDX,        mAX|mDX ],
      [ mAX|mBX,        mAX|mBX ],
      [ mAX|mSP,        mAX|mSP ],
      [ mAX|mBP,        mAX|mBP ],
      [ mAX|mSI,        mAX|mSI ],
      [ mAX|mDI,        mAX|mDI ],

      // 98
      [ mAX,            mAX      ],     // CBW
      [ mAX,            mDX      ],     // CWD
      [ N,              N|F      ],     // CALL far ptr
      [ N,              N        ],     // WAIT
      [ F|mSP,          mSP|mMEM ],     // PUSHF
      [ mSP|mMEM,       F|mSP    ],     // POPF
      [ mAX,            F        ],     // SAHF
      [ F,              mAX      ],     // LAHF

      // A0
      [ mMEM,           mAX  ],         // MOV AL,moffs8
      [ mMEM,           mAX  ],         // MOV EAX,moffs32
      [ mAX,            mMEM ],         // MOV moffs8,AL
      [ mAX,            mMEM ],         // MOV moffs32,EAX
      [ N,              N    ],         // MOVSB
      [ N,              N    ],         // MOVSW/D
      [ N,              N    ],         // CMPSB
      [ N,              N    ],         // CMPSW/D

      // A8
      [ mAX,    F ],                    // TEST AL,imm8
      [ mAX,    F ],                    // TEST AX,imm16
      [ N,      N ],                    // STOSB
      [ N,      N ],                    // STOSW/D
      [ N,      N ],                    // LODSB
      [ N,      N ],                    // LODSW/D
      [ N,      N ],                    // SCASB
      [ N,      N ],                    // SCASW/D

      // B0
      [ 0,      mAX ],                  // MOV AL,imm8
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mAX ],
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],

      // B8
      [ 0,      mAX ],                  // MOV AX,imm16
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mSP ],
      [ 0,      mBP ],
      [ 0,      mSI ],
      [ 0,      mDI ],

      // C0
      [ EA,     F|EA ],         // Shift Eb,Ib
      [ EA,     F|EA ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ 0,      EA|B ],         // MOV EA8,imm8
      [ 0,      EA   ],         // MOV EA,imm16

      // C8
      [ N,      N ],            // ENTER
      [ N,      N ],            // LEAVE
      [ N,      N ],            // RETF lw
      [ N,      N ],            // RETF
      [ N,      N ],            // INT 3
      [ N,      N ],            // INT lb
      [ N,      N ],            // INTO
      [ N,      N ],            // IRET

      // D0
      [ EA,             F|EA  ],        // Shift EA,1
      [ EA,             F|EA  ],
      [ EA|mCX,         F|EA  ],        // Shift EA,CL
      [ EA|mCX,         F|EA  ],
      [ mAX,            F|mAX ],        // AAM
      [ mAX,            F|mAX ],        // AAD
      [ N,              N     ],        // reserved
      [ mAX|mBX|mMEM,   mAX   ],        // XLAT

      // D8
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],

      // E0
      [ F|mCX|N,mCX|N ],        // LOOPNE jb
      [ F|mCX|N,mCX|N ],        // LOOPE  jb
      [ mCX|N,  mCX|N ],        // LOOP   jb
      [ mCX|N,  N     ],        // JCXZ   jb
      [ N,      N     ],        // IN AL,lb
      [ N,      N     ],        // IN EAX,lb
      [ N,      N     ],        // OUT lb,AL
      [ N,      N     ],        // OUT lb,EAX

      // E8
      [ N,      N|F   ],        // CALL jv
      [ N,      N     ],        // JMP Jv
      [ N,      N     ],        // JMP Ab
      [ N,      N     ],        // JMP jb
      [ N|mDX,  N|mAX ],        // IN AL,DX
      [ N|mDX,  N|mAX ],        // IN AX,DX
      [ N|mAX|mDX,N   ],        // OUT DX,AL
      [ N|mAX|mDX,N   ],        // OUT DX,AX

      // F0
      [ N,      N ],            // LOCK
      [ N,      N ],            // reserved
      [ N,      N ],            // REPNE
      [ N,      N ],            // REP,REPE
      [ N,      N ],            // HLT
      [ F,      F ],            // CMC
      [ N,      N ],
      [ N,      N ],

      // F8
      [ 0,      F    ],         // CLC
      [ 0,      F    ],         // STC
      [ N,      N    ],         // CLI
      [ N,      N    ],         // STI
      [ N,      N    ],         // CLD
      [ N,      N    ],         // STD
      [ EA,     F|EA ],         // INC/DEC
      [ N,      N    ],
];
571 
/****************************************
 * Same thing, but for groups.
 *
 * Read/write masks for the opcodes whose operation is selected by a
 * sub-opcode:
 *      [grp][n][0] = read
 *          [1] = written
 * `grp` selects one of the four groups below (0 = Grp 1, 1 = Grp 3,
 * 2 = Grp 5, 3 = Grp 3 byte version) and `n` is the position of the
 * operation within its group.
 * NOTE(review): `n` is presumably the reg field of the modregrm byte -
 * confirm at the use sites.
 *
 * The outer dimension is exactly 4, matching the four groups that are
 * initialized. A larger dimension would leave zero-filled groups ("reads
 * nothing, writes nothing") behind, which would hide an out-of-range group
 * index instead of letting it be caught.
 */

extern (D) private immutable uint[2][8][4] grprw =
[
    [
        // Grp 1
      [ EA,     F|EA ],           // ADD
      [ EA,     F|EA ],           // OR
      [ F|EA,   F|EA ],           // ADC
      [ F|EA,   F|EA ],           // SBB
      [ EA,     F|EA ],           // AND
      [ EA,     F|EA ],           // SUB
      [ EA,     F|EA ],           // XOR
      [ EA,     F    ],           // CMP
    ],
    [
        // Grp 3
      [ EA,     F ],              // TEST EA,imm
      [ N,      N ],              // reserved
      [ EA,     EA ],             // NOT
      [ EA,     F|EA ],           // NEG
      [ mAX|EA, F|mAX|mDX ],      // MUL
      [ mAX|EA, F|mAX|mDX ],      // IMUL
      [ mAX|mDX|EA, F|mAX|mDX ],  // DIV

        // Could generate an exception we want to catch
        //mAX|mDX|EA|N,   F|mAX|mDX|N,    // IDIV

      [ mAX|mDX|EA,     F|mAX|mDX ],      // IDIV
    ],
    [
        // Grp 5
      [ EA,     F|EA ],           // INC Ev
      [ EA,     F|EA ],           // DEC Ev
      [ N|EA,   N ],              // CALL Ev
      [ N|EA,   N ],              // CALL eP
      [ N|EA,   N ],              // JMP Ev
      [ N|EA,   N ],              // JMP Ep
      [ mSP|EA, mSP|mMEM ],       // PUSH Ev
      [ N,      N ],              // reserved
    ],
    [
        // Grp 3, byte version
      [ EA|B,   F ],              // TEST EA,imm
      [ N,      N ],              // reserved
      [ EA|B,   EA|B ],           // NOT
      [ EA|B,   F|EA|B ],         // NEG
      [ mAX|EA, F|mAX ],          // MUL
      [ mAX|EA, F|mAX ],          // IMUL
      [ mAX|EA, F|mAX ],          // DIV

        // Could generate an exception we want to catch
        //mAX|EA|N,       F|mAX|N,        // IDIV

      [ mAX|EA, F|mAX ],          // IDIV
    ]
];
631 
/********************************************
 * For floating point opcodes 0xD8..0xDF, with Irm < 0xC0.
 *      [][][0] = read
 *          [1] = write
 *
 * The first index selects the opcode, in the order 0xD8, 0xD9, ... 0xDF (see
 * the comment at the top of each group). The second index selects the
 * operation within that opcode.
 * NOTE(review): the second index is presumably the reg field of Irm - confirm
 * at the use sites.
 * Entries of [ N, N ] are the slots without a (schedulable) operation.
 */

extern (D) private immutable uint[2][8][8] grpf1 =
[
    [
        // 0xD8
      [ EA|S,   S|C ],    // FADD  float
      [ EA|S,   S|C ],    // FMUL  float
      [ EA|S,   C ],      // FCOM  float
      [ EA|S,   S|C ],    // FCOMP float
      [ EA|S,   S|C ],    // FSUB  float
      [ EA|S,   S|C ],    // FSUBR float
      [ EA|S,   S|C ],    // FDIV  float
      [ EA|S,   S|C ],    // FDIVR float
    ],
    [
        // 0xD9
      [ EA,     S|C ],    // FLD  float
      [ N,      N ],      //
      [ S,      EA|C ],   // FST  float
      [ S,      EA|S|C ], // FSTP float
      [ N,      N ],      // FLDENV
      [ N,      N ],      // FLDCW
      [ N,      N ],      // FSTENV
      [ N,      N ],      // FSTCW
    ],
    [
        // 0xDA
      [ EA|S,   S|C ],    // FIADD  long
      [ EA|S,   S|C ],    // FIMUL  long
      [ EA|S,   C ],      // FICOM  long
      [ EA|S,   S|C ],    // FICOMP long
      [ EA|S,   S|C ],    // FISUB  long
      [ EA|S,   S|C ],    // FISUBR long
      [ EA|S,   S|C ],    // FIDIV  long
      [ EA|S,   S|C ],    // FIDIVR long
    ],
    [
        // 0xDB
      [ EA,     S|C ],    // FILD long
      [ S,      EA|S|C ], // FISTTP int
      [ S,      EA|C ],   // FIST long
      [ S,      EA|S|C ], // FISTP long
      [ N,      N ],      //
      [ EA,     S|C ],    // FLD real80
      [ N,      N ],      //
      [ S,      EA|S|C ], // FSTP real80
    ],
    [
        // 0xDC
      [ EA|S,   S|C ],    // FADD  double
      [ EA|S,   S|C ],    // FMUL  double
      [ EA|S,   C ],      // FCOM  double
      [ EA|S,   S|C ],    // FCOMP double
      [ EA|S,   S|C ],    // FSUB  double
      [ EA|S,   S|C ],    // FSUBR double
      [ EA|S,   S|C ],    // FDIV  double
      [ EA|S,   S|C ],    // FDIVR double
    ],
    [
        // 0xDD
      [ EA,     S|C ],    // FLD double
      [ S,      EA|S|C ], // FISTTP long
      [ S,      EA|C ],   // FST double
      [ S,      EA|S|C ], // FSTP double
      [ N,      N ],      // FRSTOR
      [ N,      N ],      //
      [ N,      N ],      // FSAVE
      [ C,      EA ],     // FSTSW
    ],
    [
        // 0xDE
      [ EA|S,   S|C ],    // FIADD  short
      [ EA|S,   S|C ],    // FIMUL  short
      [ EA|S,   C ],      // FICOM  short
      [ EA|S,   S|C ],    // FICOMP short
      [ EA|S,   S|C ],    // FISUB  short
      [ EA|S,   S|C ],    // FISUBR short
      [ EA|S,   S|C ],    // FIDIV  short
      [ EA|S,   S|C ],    // FIDIVR short
    ],
    [
        // 0xDF
      [ EA,     S|C ],    // FILD short
      [ S,      EA|S|C ], // FISTTP short
      [ S,      EA|C ],   // FIST short
      [ S,      EA|S|C ], // FISTP short
      [ EA,     S|C ],    // FBLD packed BCD
      [ EA,     S|C ],    // FILD long long
      [ S,      EA|S|C ], // FBSTP packed BCD
      [ S,      EA|S|C ], // FISTP long long
    ]
];
729 
730 
/********************************************
 * Micro-ops for floating point opcodes 0xD8..0xDF, with Irm < 0xC0.
 *
 * Laid out like grpf1: the first index selects the opcode (0xD8 first), the
 * second selects the operation within that opcode. Slots that have no
 * operation named in their comment hold 0.
 * NOTE(review): per the insuops/uops() comments a count of 5 means 'complex';
 * presumably the same convention applies here - confirm.
 */

extern (D) private immutable ubyte[8][8] uopsgrpf1 =
[
    [
        // 0xD8
        2,              // FADD  float
        2,              // FMUL  float
        2,              // FCOM  float
        2,              // FCOMP float
        2,              // FSUB  float
        2,              // FSUBR float
        2,              // FDIV  float
        2,              // FDIVR float
    ],
    [
        // 0xD9
        1,              // FLD  float
        0,              //
        2,              // FST  float
        2,              // FSTP float
        5,              // FLDENV
        3,              // FLDCW
        5,              // FSTENV
        5,              // FSTCW
    ],
    [
        // 0xDA
        5,              // FIADD  long
        5,              // FIMUL  long
        5,              // FICOM  long
        5,              // FICOMP long
        5,              // FISUB  long
        5,              // FISUBR long
        5,              // FIDIV  long
        5,              // FIDIVR long
    ],
    [
        // 0xDB
        4,              // FILD long
        0,              //
        4,              // FIST long
        4,              // FISTP long
        0,              //
        4,              // FLD real80
        0,              //
        5,              // FSTP real80
    ],
    [
        // 0xDC
        2,              // FADD  double
        2,              // FMUL  double
        2,              // FCOM  double
        2,              // FCOMP double
        2,              // FSUB  double
        2,              // FSUBR double
        2,              // FDIV  double
        2,              // FDIVR double
    ],
    [
        // 0xDD
        1,              // FLD double
        0,              //
        2,              // FST double
        2,              // FSTP double
        5,              // FRSTOR
        0,              //
        5,              // FSAVE
        5,              // FSTSW
    ],
    [
        // 0xDE
        5,              // FIADD  short
        5,              // FIMUL  short
        5,              // FICOM  short
        5,              // FICOMP short
        5,              // FISUB  short
        5,              // FISUBR short
        5,              // FIDIV  short
        5,              // FIDIVR short
    ],
    [
        // 0xDF
        4,              // FILD short
        0,              //
        4,              // FIST short
        4,              // FISTP short
        5,              // FBLD packed BCD
        4,              // FILD long long
        5,              // FBSTP packed BCD
        4,              // FISTP long long
    ]
];
826 
/**************************************************
 * Determine number of micro-ops for Pentium Pro and Pentium II processors.
 * 0 means special case,
 * 5 means 'complex'
 *
 * Indexed by one-byte opcode 0x00..0xFF; the comment at the end of each row
 * is the (hex) opcode of the row's first entry. uops() looks an opcode up
 * here first and only runs its switch statement when the entry is 0.
 *
 * NOTE(review): row 68 reads 3,3,0,0, so the entry for 0x69 is non-zero and
 * the entry for 0x6A is 0, yet the switch in uops() lists 0x69 and 0x6B (the
 * IMUL forms) among its special cases and oprw labels 0x6A as PUSH immed8.
 * As written, `case 0x69` can never be reached. Possibly the row was meant to
 * be 3,0,3,0 - confirm before changing any data.
 */

extern (D) private immutable ubyte[256] insuops =
[       0,0,0,0,        1,1,4,5,                /* 00 */
        0,0,0,0,        1,1,4,0,                /* 08 */
        0,0,0,0,        2,2,4,5,                /* 10 */
        0,0,0,0,        2,2,4,5,                /* 18 */
        0,0,0,0,        1,1,0,1,                /* 20 */
        0,0,0,0,        1,1,0,1,                /* 28 */
        0,0,0,0,        1,1,0,1,                /* 30 */
        0,0,0,0,        1,1,0,1,                /* 38 */
        1,1,1,1,        1,1,1,1,                /* 40 */
        1,1,1,1,        1,1,1,1,                /* 48 */
        3,3,3,3,        3,3,3,3,                /* 50 */
        2,2,2,2,        3,2,2,2,                /* 58 */
        5,5,5,5,        0,0,0,0,                /* 60 */
        3,3,0,0,        5,5,5,5,                /* 68 */
        1,1,1,1,        1,1,1,1,                /* 70 */
        1,1,1,1,        1,1,1,1,                /* 78 */
        0,0,0,0,        0,0,0,0,                /* 80 */
        0,0,0,0,        0,1,4,0,                /* 88 */
        1,3,3,3,        3,3,3,3,                /* 90 */
        1,1,5,0,        5,5,1,1,                /* 98 */
        1,1,2,2,        5,5,5,5,                /* A0 */
        1,1,3,3,        2,2,3,3,                /* A8 */
        1,1,1,1,        1,1,1,1,                /* B0 */
        1,1,1,1,        1,1,1,1,                /* B8 */
        0,0,5,4,        0,0,0,0,                /* C0 */
        5,3,5,5,        5,3,5,5,                /* C8 */
        0,0,0,0,        4,3,0,2,                /* D0 */
        0,0,0,0,        0,0,0,0,                /* D8 */
        4,4,4,2,        5,5,5,5,                /* E0 */
        4,1,5,1,        5,5,5,5,                /* E8 */
        0,0,5,5,        5,1,0,0,                /* F0 */
        1,1,5,5,        4,4,0,0,                /* F8 */
];
867 
// Eight micro-op counts.
// NOTE(review): how this table is indexed is not visible in this part of the
// file; presumably it is consulted by uops() for one of its special cases -
// confirm at the use site.
extern (D) private immutable ubyte[8] uopsx = [ 1,1,2,5,1,1,1,5 ];
869 
870 /************************************************
871  * Determine number of micro-ops for Pentium Pro and Pentium II processors.
872  * 5 means 'complex'.
873  * Doesn't currently handle:
874  *      floating point
875  *      MMX
876  *      0F opcodes
877  *      prefix bytes
878  */
879 
880 private int uops(code *c)
881 {   int n;
882     int op;
883     int op2;
884 
885     op = c.Iop & 0xFF;
886     if ((c.Iop & 0xFF00) == 0x0F00)
887         op = 0x0F;
888     n = insuops[op];
889     if (!n)                             // if special case
890     {   ubyte irm,mod,reg,rm;
891 
892         irm = c.Irm;
893         mod = (irm >> 6) & 3;
894         reg = (irm >> 3) & 7;
895         rm = irm & 7;
896 
897         switch (op)
898         {
899             case 0x10:
900             case 0x11:                  // ADC rm,r
901             case 0x18:
902             case 0x19:                  // SBB rm,r
903                 n = (mod == 3) ? 2 : 4;
904                 break;
905 
906             case 0x12:
907             case 0x13:                  // ADC r,rm
908             case 0x1A:
909             case 0x1B:                  // SBB r,rm
910                 n = (mod == 3) ? 2 : 3;
911                 break;
912 
913             case 0x00:
914             case 0x01:                  // ADD rm,r
915             case 0x08:
916             case 0x09:                  // OR rm,r
917             case 0x20:
918             case 0x21:                  // AND rm,r
919             case 0x28:
920             case 0x29:                  // SUB rm,r
921             case 0x30:
922             case 0x31:                  // XOR rm,r
923                 n = (mod == 3) ? 1 : 4;
924                 break;
925 
926             case 0x02:
927             case 0x03:                  // ADD r,rm
928             case 0x0A:
929             case 0x0B:                  // OR r,rm
930             case 0x22:
931             case 0x23:                  // AND r,rm
932             case 0x2A:
933             case 0x2B:                  // SUB r,rm
934             case 0x32:
935             case 0x33:                  // XOR r,rm
936             case 0x38:
937             case 0x39:                  // CMP rm,r
938             case 0x3A:
939             case 0x3B:                  // CMP r,rm
940             case 0x69:                  // IMUL rm,r,imm
941             case 0x6B:                  // IMUL rm,r,imm8
942             case 0x84:
943             case 0x85:                  // TEST rm,r
944                 n = (mod == 3) ? 1 : 2;
945                 break;
946 
947             case 0x80:
948             case 0x81:
949             case 0x82:
950             case 0x83:
951                 if (reg == 2 || reg == 3)       // ADC/SBB rm,imm
952                     n = (mod == 3) ? 2 : 4;
953                 else if (reg == 7)              // CMP rm,imm
954                     n = (mod == 3) ? 1 : 2;
955                 else
956                     n = (mod == 3) ? 1 : 4;
957                 break;
958 
959             case 0x86:
960             case 0x87:                          // XCHG rm,r
961                 n = (mod == 3) ? 3 : 5;
962                 break;
963 
964             case 0x88:
965             case 0x89:                          // MOV rm,r
966                 n = (mod == 3) ? 1 : 2;
967                 break;
968 
969             case 0x8A:
970             case 0x8B:                          // MOV r,rm
971                 n = 1;
972                 break;
973 
974             case 0x8C:                          // MOV Sreg,rm
975                 n = (mod == 3) ? 1 : 3;
976                 break;
977 
978             case 0x8F:
979                 if (reg == 0)                   // POP m
980                     n = 5;
981                 break;
982 
983             case 0xC6:
984             case 0xC7:
985                 if (reg == 0)                   // MOV rm,imm
986                     n = (mod == 3) ? 1 : 2;
987                 break;
988 
989             case 0xD0:
990             case 0xD1:
991                 if (reg == 2 || reg == 3)       // RCL/RCR rm,1
992                     n = (mod == 3) ? 2 : 4;
993                 else
994                     n = (mod == 3) ? 1 : 4;
995                 break;
996 
997             case 0xC0:
998             case 0xC1:                          // RCL/RCR rm,imm8
999             case 0xD2:
1000             case 0xD3:
1001                 if (reg == 2 || reg == 3)       // RCL/RCR rm,CL
1002                     n = 5;
1003                 else
1004                     n = (mod == 3) ? 1 : 4;
1005                 break;
1006 
1007             case 0xD8:
1008             case 0xD9:
1009             case 0xDA:
1010             case 0xDB:
1011             case 0xDC:
1012             case 0xDD:
1013             case 0xDE:
1014             case 0xDF:
1015                 // Floating point opcodes
1016                 if (irm < 0xC0)
1017                 {   n = uopsgrpf1[op - 0xD8][reg];
1018                     break;
1019                 }
1020                 n = uopsx[op - 0xD8];
1021                 switch (op)
1022                 {
1023                     case 0xD9:
1024                         switch (irm)
1025                         {
1026                             case 0xE0:          // FCHS
1027                                 n = 3;
1028                                 break;
1029                             case 0xE8:
1030                             case 0xE9:
1031                             case 0xEA:
1032                             case 0xEB:
1033                             case 0xEC:
1034                             case 0xED:
1035                                 n = 2;
1036                                 break;
1037                             case 0xF0:
1038                             case 0xF1:
1039                             case 0xF2:
1040                             case 0xF3:
1041                             case 0xF4:
1042                             case 0xF5:
1043                             case 0xF8:
1044                             case 0xF9:
1045                             case 0xFB:
1046                             case 0xFC:
1047                             case 0xFD:
1048                             case 0xFE:
1049                             case 0xFF:
1050                                 n = 5;
1051                                 break;
1052 
1053                             default:
1054                                 break;
1055                         }
1056                         break;
1057                     case 0xDE:
1058                         if (irm == 0xD9)        // FCOMPP
1059                             n = 2;
1060                         break;
1061 
1062                     default:
1063                         break;
1064                 }
1065                 break;
1066 
1067             case 0xF6:
1068                 if (reg == 6 || reg == 7)       // DIV AL,rm8
1069                     n = (mod == 3) ? 3 : 4;
1070                 else if (reg == 4 || reg == 5 || reg == 0)      // MUL/IMUL/TEST rm8
1071                     n = (mod == 3) ? 1 : 2;
1072                 else if (reg == 2 || reg == 3)  // NOT/NEG rm
1073                     n = (mod == 3) ? 1 : 4;
1074                 break;
1075 
1076             case 0xF7:
1077                 if (reg == 6 || reg == 7)       // DIV EAX,rm
1078                     n = 4;
1079                 else if (reg == 4 || reg == 5)  // MUL/IMUL rm
1080                     n = (mod == 3) ? 3 : 4;
1081                 else if (reg == 2 || reg == 3)  // NOT/NEG rm
1082                     n = (mod == 3) ? 1 : 4;
1083                 break;
1084 
1085             case 0xFF:
1086                 if (reg == 2 || reg == 3 ||     // CALL rm, CALL m,rm
1087                     reg == 5)                   // JMP seg:offset
1088                     n = 5;
1089                 else if (reg == 4)
1090                     n = (mod == 3) ? 1 : 2;
1091                 else if (reg == 0 || reg == 1)  // INC/DEC rm
1092                     n = (mod == 3) ? 1 : 4;
1093                 else if (reg == 6)              // PUSH rm
1094                     n = (mod == 3) ? 3 : 4;
1095                 break;
1096 
1097             case 0x0F:
1098                 op2 = c.Iop & 0xFF;
1099                 if ((op2 & 0xF0) == 0x80)       // Jcc
1100                 {   n = 1;
1101                     break;
1102                 }
1103                 if ((op2 & 0xF0) == 0x90)       // SETcc
1104                 {   n = (mod == 3) ? 1 : 3;
1105                     break;
1106                 }
1107                 if (op2 == 0xB6 || op2 == 0xB7 ||       // MOVZX
1108                     op2 == 0xBE || op2 == 0xBF)         // MOVSX
1109                 {   n = 1;
1110                     break;
1111                 }
1112                 if (op2 == 0xAF)                        // IMUL r,m
1113                 {   n = (mod == 3) ? 1 : 2;
1114                     break;
1115                 }
1116                 break;
1117 
1118             default:
1119                 break;
1120         }
1121     }
1122     if (n == 0)
1123         n = 5;                                  // copout for now
1124     return n;
1125 }
1126 
1127 /******************************************
1128  * Determine pairing classification.
1129  * Don't deal with floating point, just assume they are all NP (Not Pairable).
1130  * Returns:
1131  *      NP,UV,PU,PV optionally OR'd with PE
1132  */
1133 
1134 private int pair_class(code *c)
1135 {   ubyte op;
1136     ubyte irm,mod,reg,rm;
1137     uint a32;
1138     int pc;
1139 
1140     // Of course, with Intel this is *never* simple, and Intel's
1141     // documentation is vague about the specifics.
1142 
1143     op = c.Iop & 0xFF;
1144     if ((c.Iop & 0xFF00) == 0x0F00)
1145         op = 0x0F;
1146     pc = pentcycl[op];
1147     a32 = I32;
1148     if (c.Iflags & CFaddrsize)
1149         a32 ^= 1;
1150     irm = c.Irm;
1151     mod = (irm >> 6) & 3;
1152     reg = (irm >> 3) & 7;
1153     rm = irm & 7;
1154     switch (op)
1155     {
1156         case 0x0F:                              // 2 byte opcode
1157             if ((c.Iop & 0xF0) == 0x80)        // if Jcc
1158                 pc = PV | PF;
1159             break;
1160 
1161         case 0x80:
1162         case 0x81:
1163         case 0x83:
1164             if (reg == 2 ||                     // ADC EA,immed
1165                 reg == 3)                       // SBB EA,immed
1166             {   pc = PU;
1167                 goto L2;
1168             }
1169             goto L1;                            // AND/OR/XOR/ADD/SUB/CMP EA,immed
1170 
1171         case 0x84:
1172         case 0x85:                              // TEST EA,reg
1173             if (mod == 3)                       // TEST reg,reg
1174                 pc = UV;
1175             break;
1176 
1177         case 0xC0:
1178         case 0xC1:
1179             if (reg >= 4)
1180                 pc = PU;
1181             break;
1182 
1183         case 0xC6:
1184         case 0xC7:
1185             if (reg == 0)                       // MOV EA,immed
1186             {
1187         L1:
1188                 pc = UV;
1189         L2:
1190                 // if EA contains a displacement then
1191                 // can't execute in V, or pair in U
1192                 switch (mod)
1193                 {   case 0:
1194                         if (a32)
1195                         {   if (rm == 5 ||
1196                                 (rm == 4 && (c.Isib & 7) == 5)
1197                                )
1198                                 pc = NP;
1199                         }
1200                         else if (rm == 6)
1201                             pc = NP;
1202                         break;
1203                     case 1:
1204                     case 2:
1205                         pc = NP;
1206                         break;
1207 
1208                     default:
1209                         break;
1210                 }
1211             }
1212             break;
1213 
1214         case 0xD9:
1215             if (irm < 0xC0)
1216             {
1217                 if (reg == 0)
1218                     pc = FX;
1219             }
1220             else if (irm < 0xC8)
1221                 pc = FX;
1222             else if (irm < 0xD0)
1223                 pc = PV;
1224             else
1225             {
1226                 switch (irm)
1227                 {
1228                     case 0xE0:
1229                     case 0xE1:
1230                     case 0xE4:
1231                         pc = FX;
1232                         break;
1233 
1234                     default:
1235                         break;
1236                 }
1237             }
1238             break;
1239 
1240         case 0xDB:
1241             if (irm < 0xC0 && (reg == 0 || reg == 5))
1242                 pc = FX;
1243             break;
1244 
1245         case 0xDD:
1246             if (irm < 0xC0)
1247             {
1248                 if (reg == 0)
1249                     pc = FX;
1250             }
1251             else if (irm >= 0xE0 && irm < 0xF0)
1252                 pc = FX;
1253             break;
1254 
1255         case 0xDF:
1256             if (irm < 0xC0 && (reg == 0 || reg == 5))
1257                 pc = FX;
1258             break;
1259 
1260         case 0xFE:
1261             if (reg == 0 || reg == 1)           // INC/DEC EA
1262                 pc = UV;
1263             break;
1264         case 0xFF:
1265             if (reg == 0 || reg == 1)           // INC/DEC EA
1266                 pc = UV;
1267             else if (reg == 2 || reg == 4)      // CALL/JMP near ptr EA
1268                 pc = PE|PV;
1269             else if (reg == 6 && mod == 3)      // PUSH reg
1270                 pc = PE | UV;
1271             break;
1272 
1273         default:
1274             break;
1275     }
1276     if (c.Iflags & CFPREFIX && pc == UV)       // if prefix byte
1277         pc = PU;
1278     return pc;
1279 }
1280 
1281 /******************************************
1282  * For an instruction, determine what is read
1283  * and what is written, and what is used for addressing.
1284  * Determine operand size if EA (larger is ok).
1285  */
1286 
1287 @trusted
1288 private void getinfo(Cinfo *ci,code *c)
1289 {
1290     memset(ci,0,Cinfo.sizeof);
1291     if (!c)
1292         return;
1293     ci.c = c;
1294 
1295     if (PRO)
1296     {
1297         ci.uops = cast(ubyte)uops(c);
1298         ci.isz = cast(ubyte)calccodsize(c);
1299     }
1300     else
1301         ci.pair = cast(ubyte)pair_class(c);
1302 
1303     ubyte op;
1304     ubyte op2;
1305     ubyte irm,mod,reg,rm;
1306     uint a32;
1307     int pc;
1308     uint r,w;
1309     int sz = I32 ? 4 : 2;
1310 
1311     ci.r = 0;
1312     ci.w = 0;
1313     ci.a = 0;
1314     op = c.Iop & 0xFF;
1315     if ((c.Iop & 0xFF00) == 0x0F00)
1316         op = 0x0F;
1317     //printf("\tgetinfo %x, op %x \n",c,op);
1318     pc = pentcycl[op];
1319     a32 = I32;
1320     if (c.Iflags & CFaddrsize)
1321         a32 ^= 1;
1322     if (c.Iflags & CFopsize)
1323         sz ^= 2 | 4;
1324     irm = c.Irm;
1325     mod = (irm >> 6) & 3;
1326     reg = (irm >> 3) & 7;
1327     rm = irm & 7;
1328 
1329     r = oprw[op][0];
1330     w = oprw[op][1];
1331 
1332     switch (op)
1333     {
1334         case 0x50:
1335         case 0x51:
1336         case 0x52:
1337         case 0x53:
1338         case 0x55:
1339         case 0x56:
1340         case 0x57:                              // PUSH reg
1341             ci.flags |= CIFL.push;
1342             goto Lpush;
1343 
1344         case 0x54:                              // PUSH ESP
1345         case 0x6A:                              // PUSH imm8
1346         case 0x68:                              // PUSH imm
1347         case 0x0E:
1348         case 0x16:
1349         case 0x1E:
1350         case 0x06:
1351         case 0x9C:
1352         Lpush:
1353             ci.spadjust = -sz;
1354             ci.a |= mSP;
1355             break;
1356 
1357         case 0x58:
1358         case 0x59:
1359         case 0x5A:
1360         case 0x5B:
1361         case 0x5C:
1362         case 0x5D:
1363         case 0x5E:
1364         case 0x5F:                              // POP reg
1365         case 0x1F:
1366         case 0x07:
1367         case 0x17:
1368         case 0x9D:                              // POPF
1369         Lpop:
1370             ci.spadjust = sz;
1371             ci.a |= mSP;
1372             break;
1373 
1374         case 0x80:
1375             if (reg == 7)                       // CMP
1376                 c.Iflags |= CFpsw;
1377             r = B | grprw[0][reg][0];           // Grp 1 (byte)
1378             w = B | grprw[0][reg][1];
1379             break;
1380 
1381         case 0x81:
1382         case 0x83:
1383             if (reg == 7)                       // CMP
1384                 c.Iflags |= CFpsw;
1385             else if (irm == modregrm(3,0,SP))   // ADD ESP,imm
1386             {
1387                 assert(c.IFL2 == FLconst);
1388                 ci.spadjust = (op == 0x81) ? c.IEV2.Vint : cast(byte)c.IEV2.Vint;
1389             }
1390             else if (irm == modregrm(3,5,SP))   // SUB ESP,imm
1391             {
1392                 assert(c.IFL2 == FLconst);
1393                 ci.spadjust = (op == 0x81) ? -c.IEV2.Vint : -cast(int)cast(byte)c.IEV2.Vint;
1394             }
1395             r = grprw[0][reg][0];               // Grp 1
1396             w = grprw[0][reg][1];
1397             break;
1398 
1399         case 0x8F:
1400             if (reg == 0)                       // POP rm
1401                 goto Lpop;
1402             break;
1403 
1404         case 0xA0:
1405         case 0xA1:
1406         case 0xA2:
1407         case 0xA3:
1408             // Fake having an EA to simplify code in conflict()
1409             ci.flags |= CIFL.ea;
1410             ci.reg = 0;
1411             ci.sibmodrm = a32 ? modregrm(0,0,5) : modregrm(0,0,6);
1412             c.IFL1 = c.IFL2;
1413             c.IEV1 = c.IEV2;
1414             break;
1415 
1416         case 0xC2:
1417         case 0xC3:
1418         case 0xCA:
1419         case 0xCB:                              // RET
1420             ci.a |= mSP;
1421             break;
1422 
1423         case 0xE8:
1424             if (c.Iflags & CFclassinit)        // call to __j_classinit
1425             {   r = 0;
1426                 w = F;
1427 
1428 version (CLASSINIT2)
1429                 ci.pair = UV;                  // it is patched to CMP EAX,0
1430 else
1431                 ci.pair = NP;
1432 
1433             }
1434             break;
1435 
1436         case 0xF6:
1437             r = grprw[3][reg][0];               // Grp 3, byte version
1438             w = grprw[3][reg][1];
1439             break;
1440 
1441         case 0xF7:
1442             r = grprw[1][reg][0];               // Grp 3
1443             w = grprw[1][reg][1];
1444             break;
1445 
1446         case 0x0F:
1447             op2 = c.Iop & 0xFF;
1448             if ((op2 & 0xF0) == 0x80)           // if Jxx instructions
1449             {
1450                 ci.r = F | N;
1451                 ci.w = N;
1452                 goto Lret;
1453             }
1454             ci.r = N;
1455             ci.w = N;          // copout for now
1456             goto Lret;
1457 
1458         case 0xD7:                              // XLAT
1459             ci.a = mAX | mBX;
1460             break;
1461 
1462         case 0xFF:
1463             r = grprw[2][reg][0];               // Grp 5
1464             w = grprw[2][reg][1];
1465             if (reg == 6)                       // PUSH rm
1466                 goto Lpush;
1467             break;
1468 
1469         case 0x38:
1470         case 0x39:
1471         case 0x3A:
1472         case 0x3B:
1473         case 0x3C:                              // CMP AL,imm8
1474         case 0x3D:                              // CMP EAX,imm32
1475             // For CMP opcodes, always test for flags
1476             c.Iflags |= CFpsw;
1477             break;
1478 
1479         case ESCAPE:
1480             if (c.Iop == (ESCAPE | ESCadjfpu))
1481                 ci.fpuadjust = c.IEV1.Vint;
1482             break;
1483 
1484         case 0xD0:
1485         case 0xD1:
1486         case 0xD2:
1487         case 0xD3:
1488         case 0xC0:
1489         case 0xC1:
1490             if (reg == 2 || reg == 3)           // if RCL or RCR
1491                 c.Iflags |= CFpsw;             // always test for flags
1492             break;
1493 
1494         case 0xD8:
1495         case 0xD9:
1496         case 0xDA:
1497         case 0xDB:
1498         case 0xDC:
1499         case 0xDD:
1500         case 0xDE:
1501         case 0xDF:
1502             if (irm < 0xC0)
1503             {   r = grpf1[op - 0xD8][reg][0];
1504                 w = grpf1[op - 0xD8][reg][1];
1505                 switch (op)
1506                 {
1507                     case 0xD8:
1508                         if (reg == 3)           // if FCOMP
1509                             ci.fpuadjust = -1;
1510                         else
1511                             ci.fp_op = FP.fop;
1512                         break;
1513 
1514                     case 0xD9:
1515                         if (reg == 0)           // if FLD float
1516                         {   ci.fpuadjust = 1;
1517                             ci.fp_op = FP.fld;
1518                         }
1519                         else if (reg == 3)      // if FSTP float
1520                         {   ci.fpuadjust = -1;
1521                             ci.fp_op = FP.fstp;
1522                         }
1523                         else if (reg == 5 || reg == 7)
1524                             sz = 2;
1525                         else if (reg == 4 || reg == 6)
1526                             sz = 28;
1527                         break;
1528                     case 0xDA:
1529                         if (reg == 3)           // if FICOMP
1530                             ci.fpuadjust = -1;
1531                         break;
1532                     case 0xDB:
1533                         if (reg == 0 || reg == 5)
1534                         {   ci.fpuadjust = 1;
1535                             ci.fp_op = FP.fld;  // FILD / FLD long double
1536                         }
1537                         if (reg == 3 || reg == 7)
1538                             ci.fpuadjust = -1;
1539                         if (reg == 7)
1540                             ci.fp_op = FP.fstp; // FSTP long double
1541                         if (reg == 5 || reg == 7)
1542                             sz = 10;
1543                         break;
1544                     case 0xDC:
1545                         sz = 8;
1546                         if (reg == 3)           // if FCOMP
1547                             ci.fpuadjust = -1;
1548                         else
1549                             ci.fp_op = FP.fop;
1550                         break;
1551                     case 0xDD:
1552                         if (reg == 0)           // if FLD double
1553                         {   ci.fpuadjust = 1;
1554                             ci.fp_op = FP.fld;
1555                         }
1556                         if (reg == 3)           // if FSTP double
1557                         {   ci.fpuadjust = -1;
1558                             ci.fp_op = FP.fstp;
1559                         }
1560                         if (reg == 7)
1561                             sz = 2;
1562                         else if (reg == 4 || reg == 6)
1563                             sz = 108;
1564                         else
1565                             sz = 8;
1566                         break;
1567                     case 0xDE:
1568                         sz = 2;
1569                         if (reg == 3)           // if FICOMP
1570                             ci.fpuadjust = -1;
1571                         break;
1572                     case 0xDF:
1573                         sz = 2;
1574                         if (reg == 4 || reg == 6)
1575                             sz = 10;
1576                         else if (reg == 5 || reg == 7)
1577                             sz = 8;
1578                         if (reg == 0 || reg == 4 || reg == 5)
1579                             ci.fpuadjust = 1;
1580                         else if (reg == 3 || reg == 6 || reg == 7)
1581                             ci.fpuadjust = -1;
1582                         break;
1583 
1584                     default:
1585                         break;
1586                 }
1587                 break;
1588             }
1589             else if (op == 0xDE)
1590             {   ci.fpuadjust = -1;             // pop versions of Fop's
1591                 if (irm == 0xD9)
1592                     ci.fpuadjust = -2;         // FCOMPP
1593             }
1594 
1595             // Most floating point opcodes aren't staged, but are
1596             // sent right through, in order to make use of the large
1597             // latencies with floating point instructions.
1598             if (ci.fp_op == FP.fld ||
1599                 (op == 0xD9 && (irm & 0xF8) == 0xC0))
1600             { }                                // FLD ST(i)
1601             else
1602                 ci.flags |= CIFL.nostage;
1603 
1604             switch (op)
1605             {
1606                 case 0xD8:
1607                     r = S;
1608                     w = C;
1609                     if ((irm & ~7) == 0xD0)
1610                         w |= S;
1611                     break;
1612                 case 0xD9:
1613                     // FCHS or FABS or FSQRT
1614                     if (irm == 0xE0 || irm == 0xE1 || irm == 0xFA)
1615                         ci.fp_op = FP.fop;
1616                     r = S;
1617                     w = S|C;
1618                     break;
1619                 case 0xDA:
1620                     if (irm == 0xE9)    // FUCOMPP
1621                     {   r = S;
1622                         w = S|C;
1623                         break;
1624                     }
1625                     break;
1626                 case 0xDB:
1627                     if (irm == 0xE2)    // FCLEX
1628                     {   r = 0;
1629                         w = C;
1630                         break;
1631                     }
1632                     if (irm == 0xE3)    // FINIT
1633                     {   r = 0;
1634                         w = S|C;
1635                         break;
1636                     }
1637                     break;
1638                 case 0xDC:
1639                 case 0xDE:
1640                     if ((irm & 0xF0) != 0xD0)
1641                     {   r = S;
1642                         w = S|C;
1643                         break;
1644                     }
1645                     break;
1646                 case 0xDD:
1647                     // Not entirely correct, but conservative
1648                     r = S;
1649                     w = S|C;
1650                     break;
1651                 case 0xDF:
1652                     if (irm == 0xE0)    // FSTSW AX
1653                     {   r = C;
1654                         w = mAX;
1655                         break;
1656                     }
1657                     break;
1658 
1659                 default:
1660                     break;
1661             }
1662             break;
1663 
1664         default:
1665             //printf("\t\tNo special case\n");
1666             break;
1667     }
1668 
1669     if ((r | w) & B)                            // if byte operation
1670         sz = 1;                                 // operand size is 1
1671 
1672     ci.r = r & ~(R | EA);
1673     ci.w = w & ~(R | EA);
1674     if (r & R)
1675         ci.r |= mask((r & B) ? (reg & 3) : reg);
1676     if (w & R)
1677         ci.w |= mask((w & B) ? (reg & 3) : reg);
1678 
1679     // OR in bits for EA addressing mode
1680     if ((r | w) & EA)
1681     {   ubyte sib;
1682 
1683         sib = 0;
1684         switch (mod)
1685         {
1686             case 0:
1687                 if (a32)
1688                 {
1689                     if (rm == 4)
1690                     {
1691                         sib = c.Isib;
1692                         if ((sib & modregrm(0,7,0)) != modregrm(0,4,0))
1693                             ci.a |= mask((sib >> 3) & 7);      // index register
1694                         if ((sib & 7) != 5)
1695                             ci.a |= mask(sib & 7);             // base register
1696                     }
1697                     else if (rm != 5)
1698                         ci.a |= mask(rm);
1699                 }
1700                 else
1701                 {
1702                     immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,0,mBX];
1703                     ci.a |= ea16[rm];
1704                 }
1705                 goto Lmem;
1706 
1707             case 1:
1708             case 2:
1709                 if (a32)
1710                 {
1711                     if (rm == 4)
1712                     {
1713                         sib = c.Isib;
1714                         if ((sib & modregrm(0,7,0)) != modregrm(0,4,0))
1715                             ci.a |= mask((sib >> 3) & 7);      // index register
1716                         ci.a |= mask(sib & 7);                 // base register
1717                     }
1718                     else
1719                         ci.a |= mask(rm);
1720                 }
1721                 else
1722                 {
1723                     immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,mBP,mBX];
1724                     ci.a |= ea16[rm];
1725                 }
1726 
1727             Lmem:
1728                 if (r & EA)
1729                     ci.r |= mMEM;
1730                 if (w & EA)
1731                     ci.w |= mMEM;
1732                 ci.flags |= CIFL.ea;
1733                 break;
1734 
1735             case 3:
1736                 if (r & EA)
1737                     ci.r |= mask((r & B) ? (rm & 3) : rm);
1738                 if (w & EA)
1739                     ci.w |= mask((w & B) ? (rm & 3) : rm);
1740                 break;
1741 
1742             default:
1743                 assert(0);
1744         }
1745         // Adjust sibmodrm so that addressing modes can be compared simply
1746         irm &= modregrm(3,0,7);
1747         if (a32)
1748         {
1749             if (irm != modregrm(0,0,5))
1750             {
1751                 switch (mod)
1752                 {
1753                 case 0:
1754                     if ((sib & 7) != 5)     // if not disp32[index]
1755                     {
1756                         c.IFL1 = FLconst;
1757                         c.IEV1.Vpointer = 0;
1758                         irm |= 0x80;
1759                     }
1760                     break;
1761                 case 1:
1762                     c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer;
1763                     irm = modregrm(2, 0, rm);
1764                     break;
1765 
1766                 default:
1767                     break;
1768                 }
1769             }
1770         }
1771         else
1772         {
1773             if (irm != modregrm(0,0,6))
1774             {
1775                 switch (mod)
1776                 {
1777                     case 0:
1778                         c.IFL1 = FLconst;
1779                         c.IEV1.Vpointer = 0;
1780                         irm |= 0x80;
1781                         break;
1782                     case 1:
1783                         c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer;
1784                         irm = modregrm(2, 0, rm);
1785                         break;
1786 
1787                     default:
1788                         break;
1789                 }
1790             }
1791         }
1792 
1793         ci.r |= ci.a;
1794         ci.reg = reg;
1795         ci.sibmodrm = (sib << 8) | irm;
1796     }
1797 Lret:
1798     if (ci.w & mSP)                    // if stack pointer is modified
1799         ci.w |= mMEM;                  // then we are implicitly writing to memory
1800     if (op == LEA)                     // if LEA
1801         ci.r &= ~mMEM;                 // memory is not actually read
1802     ci.sz = cast(ubyte)sz;
1803 
1804     //printf("\t\t"); ci.print();
1805 }
1806 
1807 /******************************************
1808  * Determine if two instructions can pair.
1809  * Assume that in general, cu can pair in the U pipe and cv in the V.
1810  * Look for things like register contentions.
1811  * Input:
1812  *      cu      instruction for U pipe
1813  *      cv      instruction for V pipe
1814  * Returns:
1815  *      !=0 if they can pair
1816  */
1817 
1818 private int pair_test(Cinfo *cu,Cinfo *cv)
1819 {
1820     uint pcu;
1821     uint pcv;
1822     uint r1,w1;
1823     uint r2,w2;
1824     uint x;
1825 
1826     pcu = cu.pair;
1827     if (!(pcu & PU))
1828     {
1829         // See if pairs with FXCH and cv is FXCH
1830         if (pcu & FX && cv.c.Iop == 0xD9 && (cv.c.Irm & ~7) == 0xC8)
1831             goto Lpair;
1832         goto Lnopair;
1833     }
1834     pcv = cv.pair;
1835     if (!(pcv & PV))
1836         goto Lnopair;
1837 
1838     r1 = cu.r;
1839     w1 = cu.w;
1840     r2 = cv.r;
1841     w2 = cv.w;
1842 
1843     x = w1 & (r2 | w2) & ~(F|mMEM);     // register contention
1844     if (x &&                            // if register contention
1845         !(x == mSP && pcu & pcv & PE)   // and not exception
1846        )
1847         goto Lnopair;
1848 
1849     // Look for flags contention
1850     if (w1 & r2 & F && !(pcv & PF))
1851         goto Lnopair;
1852 
1853 Lpair:
1854     return 1;
1855 
1856 Lnopair:
1857     return 0;
1858 }
1859 
/******************************************
 * Determine if two instructions have an AGI or register contention:
 * c1 writes something that c2 uses to form an address.
 * Contention on SP alone is not counted when both instructions carry
 * the PE pairing attribute.
 * Returns:
 *      !=0 if they have an AGI
 */

private int pair_agi(Cinfo *c1, Cinfo *c2)
{
    const uint overlap = c1.w & c2.a;   // written by c1, addressed through by c2
    if (!overlap)
        return 0;
    if (overlap == mSP && c1.pair & c2.pair & PE)
        return 0;
    return 1;
}
1871 
/********************************************
 * Determine if three instructions can decode simultaneously
 * in Pentium Pro and Pentium II.
 * Input:
 *      c0,c1,c2        candidates for decoders 0,1,2
 *                      c0 must not be null
 *                      c1 null means the answer is always 0
 *                      c2 can be null
 * Returns:
 *      1 if they can decode simultaneously, 0 if not
 */

private int triple_test(Cinfo *c0, Cinfo *c1, Cinfo *c2)
{
    assert(c0);
    if (c1 is null)
        return 0;

    const int isz0 = c0.isz;
    const int isz1 = c1.isz;
    const int isz2 = c2 ? c2.isz : 0;

    // No single isz may exceed 7, and the three together may not exceed 16
    const bool oversized = isz0 > 7 || isz1 > 7 || isz2 > 7;
    if (oversized || isz0 + isz1 + isz2 > 16)
        return 0;

    // 4-1-1 decode: only decoder 0 takes an instruction of more than one uop
    const bool c2single = c2 is null || c2.uops <= 1;
    return (c1.uops <= 1 && c2single) ? 1 : 0;
}
1899 
/********************************************
 * Get next instruction worth looking at for scheduling.
 * Steps past NOPs and line number escapes that follow c, but never past
 * an instruction flagged CFtarg or CFtarg2.
 * Returns:
 *      the instruction found, or
 *      null    no more instructions
 */

private code * cnext(code *c)
{
    for (c = code_next(c); c; c = code_next(c))
    {
        // jump targets are always worth looking at
        if (c.Iflags & (CFtarg | CFtarg2))
            break;

        const bool filler = c.Iop == NOP ||
                            c.Iop == (ESCAPE | ESClinnum);
        if (!filler)
            break;
    }
    return c;
}
1921 
1922 /******************************************
1923  * Instruction scheduler.
1924  * Input:
1925  *      c               list of instructions to schedule
1926  *      scratch         scratch registers we can use
1927  * Returns:
1928  *      revised list of scheduled instructions
1929  */
1930 
/******************************************
 * Determine if c1 and c2 are swappable.
 * c1 comes before c2.
 *
 * The read/write/address register masks recorded in ci1 and ci2 are
 * compared. When the only overlap is a memory reference, a series of
 * special cases tries to prove that the two references cannot touch the
 * same location.
 *
 * When fpsched is set and both instructions are FPU operations, the FPU
 * stack masks (S|C) are left out of the comparison and the dependency is
 * instead resolved by adjusting the fxch_pre/fxch_post counts.
 *
 * Input:
 *      ci1             info for the earlier instruction
 *      ci2             info for the later instruction
 *      fpsched         if 1, then adjust fxch_pre and fxch_post of ci2 to
 *                      swap, then return 0
 *                      if 2, then adjust ci1 as well as ci2
 * Returns:
 *      0                       if they do not conflict
 *      0x100 + delay_clocks    if they do conflict
 */

@trusted
private int conflict(Cinfo *ci1,Cinfo *ci2,int fpsched)
{
    code *c1;
    code *c2;
    uint r1,w1,a1;
    uint r2,w2,a2;
    int sz1,sz2;
    int i = 0;                  // debugging aid: non-zero enables tracing
    int delay_clocks;

    c1 = ci1.c;
    c2 = ci2.c;

    //printf("conflict %x %x\n",c1,c2);

    r1 = ci1.r;
    w1 = ci1.w;
    a1 = ci1.a;
    sz1 = ci1.sz;

    r2 = ci2.r;
    w2 = ci2.w;
    a2 = ci2.a;
    sz2 = ci2.sz;

    //printf("r1 %lx w1 %lx a1 %lx sz1 %x\n",r1,w1,a1,sz1);
    //printf("r2 %lx w2 %lx a2 %lx sz2 %x\n",r2,w2,a2,sz2);

    // Volatile and VEX instructions are never reordered
    if ((c1.Iflags | c2.Iflags) & (CFvolatile | CFvex))
        goto Lconflict;

    // Determine if we should handle FPU register conflicts separately
    //if (fpsched) printf("fp_op %d,%d:\n",ci1.fp_op,ci2.fp_op);
    if (fpsched && ci1.fp_op && ci2.fp_op)
    {
        w1 &= ~(S|C);
        r1 &= ~(S|C);
        w2 &= ~(S|C);
        r2 &= ~(S|C);
    }
    else
        fpsched = 0;

    if ((r1 | r2) & N)
    {
        goto Lconflict;
    }

static if (0)
{
    if (c1.Iop == 0xFF && c2.Iop == 0x8B)
    {   c1.print(); c2.print(); i = 1;
        printf("r1=%lx, w1=%lx, a1=%lx, sz1=%d, r2=%lx, w2=%lx, a2=%lx, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
    }
}
L1:
    // Write-after-read, read-after-write or write-after-write overlap?
    if (w1 & r2 || (r1 | w1) & w2)
    {   ubyte ifl1,ifl2;

        if (i) printf("test\n");

static if (0)
{
if (c1.IFL1 != c2.IFL1) printf("t1\n");
if ((c1.Irm & modregrm(3,0,7)) != (c2.Irm & modregrm(3,0,7))) printf("t2\n");
if ((issib(c1.Irm) && c1.Isib != c2.Isib)) printf("t3\n");
if (c1.IEV1.Vpointer + sz1 <= c2.IEV1.Vpointer) printf("t4\n");
if (c2.IEV1.Vpointer + sz2 <= c1.IEV1.Vpointer) printf("t5\n");
}

        // make sure CFpsw is reliably set
        if (w1 & w2 & F &&              // if both instructions write to flags
            w1 != F &&
            w2 != F &&
            !((r1 | r2) & F) &&         // but neither instruction reads them
            !((c1.Iflags | c2.Iflags) & CFpsw))       // and we don't care about flags
        {
            w1 &= ~F;
            w2 &= ~F;                   // remove conflict
            goto L1;                    // and try again
        }

        // If other than the memory reference is a conflict
        if (w1 & r2 & ~mMEM || (r1 | w1) & w2 & ~mMEM)
        {   if (i) printf("\t1\n");
            if (i) printf("r1=%x, w1=%x, a1=%x, sz1=%d, r2=%x, w2=%x, a2=%x, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
            goto Lconflict;
        }

        // If referring to distinct types, then no dependency
        if (c1.Irex && c2.Irex && c1.Irex != c2.Irex)
            goto Lswap;

        ifl1 = c1.IFL1;
        ifl2 = c2.IFL1;

        // Special case: Allow indexed references using registers other than
        // ESP and EBP to be swapped with PUSH instructions
        // (here c1 is the PUSH and c2 is the indexed reference)
        if (((c1.Iop & ~7) == 0x50 ||          // PUSH reg
             c1.Iop == 0x6A ||                 // PUSH imm8
             c1.Iop == 0x68 ||                 // PUSH imm16/imm32
             (c1.Iop == 0xFF && ci1.reg == 6) // PUSH EA
            ) &&
            ci2.flags & CIFL.ea && !(a2 & mSP) &&
            !(a2 & mBP && cast(int)c2.IEV1.Vpointer < 0)
           )
        {
            if (c1.Iop == 0xFF)
            {
                if (!(w2 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // Same special case with the roles reversed: c2 is the PUSH and
        // c1 is the indexed reference. The EBP/negative-offset exclusion
        // therefore has to look at c1's address registers and displacement,
        // not at those of the PUSH.
        if (((c2.Iop & ~7) == 0x50 ||          // PUSH reg
             c2.Iop == 0x6A ||                 // PUSH imm8
             c2.Iop == 0x68 ||                 // PUSH imm16/imm32
             (c2.Iop == 0xFF && ci2.reg == 6) // PUSH EA
            ) &&
            ci1.flags & CIFL.ea && !(a1 & mSP) &&
            !(a1 & mBP && cast(int)c1.IEV1.Vpointer < 0)
           )
        {
            if (c2.Iop == 0xFF)
            {
                if (!(w1 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // If not both an EA addressing mode, conflict
        if (!(ci1.flags & ci2.flags & CIFL.ea))
        {   if (i) printf("\t2\n");
            goto Lconflict;
        }

        // Same addressing mode: the references are independent if they are
        // of different kinds or their [offset, offset+size) ranges are disjoint
        if (ci1.sibmodrm == ci2.sibmodrm)
        {   if (ifl1 != ifl2)
                goto Lswap;
            switch (ifl1)
            {
                case FLconst:
                    if (c1.IEV1.Vint != c2.IEV1.Vint &&
                        (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                         c2.IEV1.Vint + sz2 <= c1.IEV1.Vint))
                        goto Lswap;
                    break;
                case FLdatseg:
                    if (c1.IEV1.Vseg != c2.IEV1.Vseg ||
                        c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                        c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
                        goto Lswap;
                    break;

                default:
                    break;
            }
        }

        if ((c1.Iflags | c2.Iflags) & CFunambig &&
            (ifl1 != ifl2 ||
             ci1.sibmodrm != ci2.sibmodrm ||
             (c1.IEV1.Vint != c2.IEV1.Vint &&
              (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
               c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
             )
            )
           )
        {
            // Assume that [EBP] and [ESP] can point to the same location
            if (((a1 | a2) & (mBP | mSP)) == (mBP | mSP))
                goto Lconflict;
            goto Lswap;
        }

        if (i) printf("\t3\n");
        goto Lconflict;
    }

Lswap:
    if (fpsched)
    {
        // Both are FPU operations: see whether the swap can be expressed
        // by adjusting the FXCH counts, otherwise it is a conflict
        //printf("\tfpsched %d,%d:\n",ci1.fp_op,ci2.fp_op);
        ubyte x1 = ci1.fxch_pre;
        ubyte y1 = ci1.fxch_post;
        ubyte x2 = ci2.fxch_pre;
        ubyte y2 = ci2.fxch_post;

        static uint X(uint a, uint b) { return (a << 8) | b; }
        switch (X(ci1.fp_op,ci2.fp_op))
        {
            case X(FP.fstp, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2 == 0)
                    ci2.fxch_post++;
                else if (y2 == 1)
                {
                    ci2.fxch_pre++;
                    ci2.fxch_post++;
                }
                else
                {
                    goto Lconflict;
                }
                break;

            case X(FP.fstp, FP.fop):
                if (x1 || y1)
                    goto Lconflict;
                ci2.fxch_pre++;
                ci2.fxch_post++;
                break;

            case X(FP.fop, FP.fop):
                if (x1 == 0 && y1 == 1 && x2 == 0 && y2 == 0)
                {   ci2.fxch_pre = 1;
                    ci2.fxch_post = 1;
                    break;
                }
                if (x1 == 0 && y1 == 0 && x2 == 1 && y2 == 1)
                    break;
                goto Lconflict;

            case X(FP.fop, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2)
                    break;
                else if (fpsched == 2)
                    ci1.fxch_post = 1;
                ci2.fxch_post = 1;
                break;

            default:
                goto Lconflict;
        }

        //printf("\tpre = %d, post = %d\n",ci2.fxch_pre,ci2.fxch_post);
    }

    //printf("w1 = x%x, w2 = x%x\n",w1,w2);
    if (i) printf("no conflict\n\n");
    return 0;

Lconflict:
    //printf("r1=%x, w1=%x, r2=%x, w2=%x\n",r1,w1,r2,w2);
    delay_clocks = 0;

    // Determine if AGI
    if (!PRO && pair_agi(ci1,ci2))
        delay_clocks = 1;

    // Special delays for floating point
    if (fpsched)
    {   if (ci1.fp_op == FP.fld && ci2.fp_op == FP.fstp)
            delay_clocks = 1;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fstp)
            delay_clocks = 3;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fop)
            delay_clocks = 2;
    }
    else if (PRO)
    {
        // Look for partial register write stalls
        if (w1 & r2 & ALLREGS && sz1 < sz2)
            delay_clocks = 7;
    }
    else if ((w1 | r1) & (w2 | r2) & (C | S))
    {
        int op = c1.Iop;
        if (ci1.fp_op == FP.fld ||
            (op == 0xD9 && (c1.Irm & 0xF8) == 0xC0)
           )
        { }                             // FLD
        else if (op == 0xD9 && (c1.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else if (c2.Iop == 0xD9 && (c2.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else
            delay_clocks = 3;
    }

    if (i) printf("conflict %d\n\n",delay_clocks);
    return 0x100 + delay_clocks;
}
2241 
// Number of slots in the scheduling table.
// Must be divisible by both 2 (U,V pipe in Pentium) and
// 3 (3 decode units in Pentium Pro).
enum int TBLMAX = 2 * 3 * 20;
2245 
/******************************
 * State of the scheduler for one run of instructions.
 *
 * Instructions are first handed to stage(), which either parks them in
 * the staging list or places them into the slot table with insert().
 * assemble() then reads the slot table back out as a linked list.
 */
struct Schedule
{
nothrow:
    Cinfo*[TBLMAX] tbl;         // even numbers are U pipe, odd numbers are V
                                // (for Pentium Pro, slots are taken in groups
                                //  of 3, one per decode unit)
    int tblmax;                 // max number of slots used

    Cinfo[TBLMAX] cinfo;        // storage for the Cinfo's handed out by stage()
    int cinfomax;               // number of entries of cinfo[] in use

    Barray!(Cinfo*) stagelist;  // list of instructions in staging area

    int fpustackused;           // number of slots in FPU stack that are used

    /******************************
     * Reset the scheduler to empty.
     * Every field is zeroed, then fpustackused is set.
     * Params:
     *      fpustackinit = initial value for fpustackused
     */
    @trusted
    void initialize(int fpustackinit)          // initialize scheduler
    {
        //printf("Schedule::initialize(fpustackinit = %d)\n", fpustackinit);
        // NOTE(review): this also overwrites stagelist without calling its
        // dtor() - confirm stagelist owns no storage at this point
        memset(&this, 0, Schedule.sizeof);
        fpustackused = fpustackinit;
    }

    /******************************
     * Release the staging list.
     */
    @trusted
    void dtor()
    {
        stagelist.dtor();
    }

/******************************
 * Reassemble the scheduled instructions into a linked list.
 * Staged instructions are inserted into the table as far as possible,
 * the table is emitted slot by slot (with any FXCH prefix/postfix
 * instructions generated along the way), and whatever is left in the
 * staging area is appended in order. The staging list is emptied.
 * Params:
 *      pc = where to store the first instruction; *pc must be null
 * Returns:
 *      pointer to the (null) next-link at the end of the emitted list
 */
@trusted
code **assemble(code **pc)  // reassemble scheduled instructions
{
    code *c;

    debug
    if (debugs) printf("assemble:\n");

    assert(!*pc);

    // Try to insert the rest of the staged instructions
    // (sli is left at the first entry that could not be inserted)
    size_t sli;
    for (sli = 0; sli < stagelist.length; ++sli)
    {
        Cinfo* ci = stagelist[sli];
        if (!ci)
            continue;
        if (!insert(ci))
            break;
    }

    // Get the instructions out of the schedule table
    assert(cast(uint)tblmax <= TBLMAX);
    for (int i = 0; i < tblmax; i++)
    {
        Cinfo* ci = tbl[i];

        debug
        if (debugs)
        {
            if (PRO)
            {   immutable char[4][3] tbl = [ "0  "," 1 ","  2" ];

                // i - ((i / 3) * 3) is the decode unit number of slot i
                if (ci)
                    printf("%s %d ",tbl[i - ((i / 3) * 3)].ptr,ci.uops);
                else
                    printf("%s   ",tbl[i - ((i / 3) * 3)].ptr);
            }
            else
            {
                printf((i & 1) ? " V " : "U  ");
            }
            if (ci)
                ci.c.print();
            else
                printf("\n");
        }

        if (!ci)
            continue;
        fpustackused += ci.fpuadjust;
        //printf("stage()1: fpustackused = %d\n", fpustackused);
        c = ci.c;
        if (i == 0)
            c.Iflags |= CFtarg;        // by definition, first is always a jump target
        else
            c.Iflags &= ~CFtarg;       // the rest are not

        // Put in any FXCH prefix
        if (ci.fxch_pre)
        {   code *cf;
            assert(i);
            cf = gen2(null,0xD9,0xC8 + ci.fxch_pre);
            *pc = cf;
            pc = &cf.next;
        }

        // Append c, then advance pc to the end of whatever is linked to it
        *pc = c;
        do
        {
            assert(*pc != code_next(*pc));
            pc = &(*pc).next;
        } while (*pc);

        // Put in any FXCH postfix
        if (ci.fxch_post)
        {
            // Look at the next occupied slot: if its FXCH prefix is the same
            // exchange, drop both instead of emitting them back to back
            for (int j = i + 1; j < tblmax; j++)
            {   if (tbl[j])
                {   if (tbl[j].fxch_pre == ci.fxch_post)
                    {
                        tbl[j].fxch_pre = 0;           // they cancel each other out
                        goto L1;
                    }
                    break;
                }
            }
            {   code *cf;
                cf = gen2(null,0xD9,0xC8 + ci.fxch_post);
                *pc = cf;
                pc = &cf.next;
            }
        }
    L1:
    }

    // Just append any instructions left in the staging area
    foreach (ci; stagelist[sli .. stagelist.length])
    {
        if (!ci)
            continue;

        debug
        if (debugs) { printf("appending: "); ci.c.print(); }

        *pc = ci.c;
        do
        {
            pc = &(*pc).next;

        } while (*pc);
        fpustackused += ci.fpuadjust;
        //printf("stage()2: fpustackused = %d\n", fpustackused);
    }
    stagelist.setLength(0);

    return pc;
}

/******************************
 * Insert c into scheduling table.
 *
 * The table is scanned backwards from tblmax for the last instruction
 * that conflicts with ci; ci is then placed in the first suitable free
 * slot after it (taking any delay clocks into account).
 * A MOV reg2,offset[ESP] gets special treatment: it may be moved ahead of
 * PUSH reg1 and SUB ESP,imm instructions, with its offset adjusted by
 * their spadjust.
 * Returns:
 *      0       could not be scheduled; have to start a new one
 *      1       ci was placed into the table
 */

int insert(Cinfo *ci)
{   code *c;
    int clocks;
    int i;
    int ic = 0;                 // slot where the conflict occurred, -1 if none
    int imin;                   // first slot ci may go in
    targ_size_t offset;
    targ_size_t vpointer;       // original displacement, restored on failure
    int movesp = 0;             // set if ci is MOV reg2,offset[ESP]
    int reg2 = -1;              // avoid "may be uninitialized" warning

    //printf("insert "); ci.c.print();
    //printf("insert() %d\n", fpustackused);
    c = ci.c;
    //printf("\tc.Iop %x\n",c.Iop);
    vpointer = c.IEV1.Vpointer;
    assert(cast(uint)tblmax <= TBLMAX);
    if (tblmax == TBLMAX)               // if out of space
        goto Lnoinsert;
    if (tblmax == 0)                    // if table is empty
    {   // Just stuff it in the first slot
        i = tblmax;
        goto Linsert;
    }
    else if (c.Iflags & (CFtarg | CFtarg2))
        // Jump targets can only be first in the scheduler
        goto Lnoinsert;

    // Special case of:
    //  PUSH reg1
    //  MOV  reg2,x[ESP]
    if (c.Iop == 0x8B &&
        (c.Irm & modregrm(3,0,7)) == modregrm(1,0,4) &&
        c.Isib == modregrm(0,4,SP) &&
        c.IFL1 == FLconst &&
        (cast(byte)c.IEV1.Vpointer) >= REGSIZE
       )
    {
        movesp = 1;                     // this is a MOV reg2,offset[ESP]
        offset = cast(byte)c.IEV1.Vpointer;
        reg2 = (c.Irm >> 3) & 7;
    }


    // Start at tblmax, and back up until we get a conflict
    ic = -1;
    imin = 0;
    for (i = tblmax; i >= 0; i--)
    {
        Cinfo* cit = tbl[i];
        if (!cit)
            continue;

        // Look for special case swap
        if (movesp &&
            (cit.c.Iop & ~7) == 0x50 &&               // if PUSH reg1
            (cit.c.Iop & 7) != reg2 &&                // if reg1 != reg2
            (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
           )
        {
            // step over the PUSH, adjusting the [ESP] offset to match
            c.IEV1.Vpointer += cit.spadjust;
            //printf("\t1, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
            continue;
        }

        if (movesp &&
            cit.c.Iop == 0x83 &&
            cit.c.Irm == modregrm(3,5,SP) &&          // if SUB ESP,offset
            cit.c.IFL2 == FLconst &&
            (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
           )
        {
            //printf("\t2, spadjust = %d\n",cit.spadjust);
            c.IEV1.Vpointer += cit.spadjust;
            continue;
        }

        clocks = conflict(cit,ci,1);
        if (clocks)
        {   int j;

            ic = i;                     // where the conflict occurred
            clocks &= 0xFF;             // convert to delay count

            // Move forward the delay clocks
            if (clocks == 0)
                j = i + 1;
            else if (PRO)
                j = (((i + 3) / 3) * 3) + clocks * 3;   // start of next group of 3, plus delay
            else
            {   j = ((i + 2) & ~1) + clocks * 2;        // next U slot, plus delay

                // It's possible we skipped over some AGI generating
                // instructions due to movesp.
                int k;
                for (k = i + 1; k < j; k++)
                {
                    if (k >= TBLMAX)
                        goto Lnoinsert;
                    if (tbl[k] && pair_agi(tbl[k],ci))
                    {
                        k = ((k + 2) & ~1) + 1;
                    }
                }
                j = k;
            }

            if (j >= TBLMAX)                    // exceed table size?
                goto Lnoinsert;
            imin = j;                           // first possible slot c can go in
            break;
        }
    }


    // Scan forward looking for a hole to put it in
    for (i = imin; i < TBLMAX; i++)
    {
        if (tbl[i])
        {
            // In case, due to movesp, we skipped over some AGI instructions
            if (!PRO && pair_agi(tbl[i],ci))
            {
                i = ((i + 2) & ~1) + 1;
                if (i >= TBLMAX)
                    goto Lnoinsert;
            }
        }
        else
        {
            if (PRO)
            {   int i0 = (i / 3) * 3;           // index of decode unit 0
                Cinfo *ci0;

                assert(((TBLMAX / 3) * 3) == TBLMAX);
                switch (i - i0)
                {
                    case 0:                     // i0 can handle any instruction
                        goto Linsert;
                    case 1:
                        ci0 = tbl[i0];
                        if (ci.uops > 1)
                        {
                            // a multi-uop ci needs decode unit 0: move the
                            // single-uop occupant of unit 0 to unit 1 and
                            // take its place (done at L1 below)
                            if (i0 >= imin && ci0.uops == 1)
                                goto L1;
                            i++;
                            break;
                        }
                        if (triple_test(ci0,ci,tbl[i0 + 2]))
                            goto Linsert;
                        break;
                    case 2:
                        ci0 = tbl[i0];
                        if (ci.uops > 1)
                        {
                            if (i0 >= imin && ci0.uops == 1)
                            {
                                if (i >= tblmax)
                                {   if (i + 1 >= TBLMAX)
                                        goto Lnoinsert;
                                    tblmax = i + 1;
                                }
                                // shift units 0 and 1 up by one to free unit 0
                                tbl[i0 + 2] = tbl[i0 + 1];
                                tbl[i0 + 1] = ci0;
                                i = i0;
                                goto Linsert;
                            }
                            break;
                        }
                        if (triple_test(ci0,tbl[i0 + 1],ci))
                            goto Linsert;
                        break;
                    default:
                        assert(0);
                }
            }
            else
            {
                assert((TBLMAX & 1) == 0);
                if (i & 1)                      // if V pipe
                {
                    if (pair_test(tbl[i - 1],ci))
                    {
                        goto Linsert;
                    }
                    else if (i > imin && pair_test(ci,tbl[i - 1]))
                    {
                        // ci pairs only when it is the first of the two:
                        // move the occupant of slot i-1 into slot i and
                        // give ci slot i-1
                        // (also the target of the Pentium Pro case 1 above)
                L1:
                        tbl[i] = tbl[i - 1];
                        if (i >= tblmax)
                            tblmax = i + 1;
                        i--;
                        //printf("\tswapping with x%02x\n",tbl[i + 1].c.Iop);
                        goto Linsert;
                    }
                }
                else                    // will always fit in U pipe
                {
                    assert(!tbl[i + 1]);        // because V pipe should be empty
                    goto Linsert;
                }
            }
        }
    }

Lnoinsert:
    //printf("\tnoinsert\n");
    c.IEV1.Vpointer = vpointer;  // reset to original value
    return 0;

Linsert:
    // Insert at location i
    assert(i < TBLMAX);
    assert(tblmax <= TBLMAX);
    tbl[i] = ci;
    //printf("\tinsert at location %d\n",i);

    // If it's a scheduled floating point code, we have to adjust
    // the FXCH values
    if (ci.fp_op)
    {
        ci.fxch_pre = 0;
        ci.fxch_post = 0;                      // start over again

        // Running FPU stack depth over the slots below tblmax
        int fpu = fpustackused;
        for (int j = 0; j < tblmax; j++)
        {
            if (tbl[j])
            {
                fpu += tbl[j].fpuadjust;
                if (fpu >= 8)                   // if FPU stack overflow
                {   tbl[i] = null;
                    //printf("fpu stack overflow\n");
                    goto Lnoinsert;
                }
            }
        }

        // Called for its side effect on the FXCH counts (fpsched == 2);
        // the return value is ignored
        for (int j = tblmax; j > i; j--)
        {
            if (j < TBLMAX && tbl[j])
                conflict(tbl[j],ci,2);
        }
    }

    if (movesp)
    {   // Adjust [ESP] offsets

        // Take back the spadjust of every instruction between the conflict
        // slot and the slot ci ended up in
        //printf("\tic = %d, inserting at %d\n",ic,i);
        assert(cast(uint)tblmax <= TBLMAX);
        for (int j = ic + 1; j < i; j++)
        {
            Cinfo* cit = tbl[j];
            if (cit)
            {
                c.IEV1.Vpointer -= cit.spadjust;
                //printf("\t3, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
            }
        }
    }
    if (i >= tblmax)
        tblmax = i + 1;

    // Now do a hack. Look back at immediately preceding instructions,
    // and see if we can swap with a push.
    // (disabled by the constant 0 in the condition)
    if (0 && movesp)
    {
        while (1)
        {
            int j;
            for (j = 1; i > j; j++)
                if (tbl[i - j])
                    break;

            if (i >= j && tbl[i - j] &&
                   (tbl[i - j].c.Iop & ~7) == 0x50 &&       // if PUSH reg1
                   (tbl[i - j].c.Iop & 7) != reg2 &&  // if reg1 != reg2
                   cast(byte)c.IEV1.Vpointer >= REGSIZE)
            {
                //printf("\t-4 prec, i-j=%d, i=%d\n",i-j,i);
                assert(cast(uint)i < TBLMAX);
                assert(cast(uint)(i - j) < TBLMAX);
                tbl[i] = tbl[i - j];
                tbl[i - j] = ci;
                i -= j;
                c.IEV1.Vpointer -= REGSIZE;
            }
            else
                break;
        }
    }

    //printf("\tinsert\n");
    return 1;
}

/******************************
 * Insert c into staging area.
 *
 * A Cinfo is taken from cinfo[] and filled in with getinfo(). Staged
 * instructions that have an AGI or other conflict with c are moved into
 * the scheduling table first, so that they stay ahead of it.
 * Params:
 *      c = instruction to stage
 * Returns:
 *      false if could not be scheduled; have to start a new one
 */

@trusted
bool stage(code *c)
{
    //printf("stage: "); c.print();
    if (cinfomax == TBLMAX)             // if out of space
        return false;
    auto ci = &cinfo[cinfomax++];
    getinfo(ci,c);

    // Jump targets, volatile and VEX instructions are not staged:
    // everything staged so far goes into the table, followed by c
    if (c.Iflags & (CFtarg | CFtarg2 | CFvolatile | CFvex))
    {
        // Insert anything in stagelist
        foreach (ref cs;  stagelist[])
        {
            if (cs)
            {
                if (!insert(cs))
                    return false;
                cs = null;
            }
        }
        return insert(ci) != 0;
    }

    // Look through stagelist, and insert any AGI conflicting instructions
    bool agi = false;
    foreach (ref cs; stagelist[])
    {
        if (cs)
        {
            if (pair_agi(cs,ci))
            {
                if (!insert(cs))
                    goto Lnostage;
                cs = null;
                agi = true;                    // we put out an AGI
            }
        }
    }

    // Look through stagelist, and insert any other conflicting instructions
    foreach (i, ref cs; stagelist[])
    {
        if (!cs)
            continue;
        if (conflict(cs,ci,0) &&                // if conflict
            !(cs.flags & ci.flags & CIFL.push)) // and not both flagged CIFL.push
        {
            if (cs.spadjust)
            {
                // We need to insert all previous adjustments to ESP
                foreach (ref ca; stagelist[0 .. i])
                {
                    if (ca && ca.spadjust)
                    {
                        if (!insert(ca))
                            goto Lnostage;
                        ca = null;
                    }
                }
            }

            if (!insert(cs))
                goto Lnostage;
            cs = null;
        }
    }

    // If floating point opcode, don't stage it, send it right out
    // (unless an AGI conflicting instruction was just put out)
    if (!agi && ci.flags & CIFL.nostage)
    {
        if (!insert(ci))
            goto Lnostage;
        return true;
    }

    stagelist.push(ci);         // append to staging list
    return true;

Lnostage:
    return false;
}

}
2786 
2787 
2788 
/********************************************
 * Snip off tail of instruction sequence.
 * c keeps any NOPs, line number escapes and instructions sharing its
 * CFclassinit flag that directly follow it; the list is cut in front of
 * the first instruction that is a jump target or is none of those.
 * Returns:
 *      next instruction (the tail) or
 *      null for no more instructions
 */

private code * csnip(code *c)
{
    if (!c)
        return c;

    const uint classinit = c.Iflags & CFclassinit;
    code **link = &c.next;              // the link that will be cut
    for (c = *link; c; link = &c.next, c = *link)
    {
        if (c.Iflags & (CFtarg | CFtarg2))
            break;

        const bool hanger = c.Iop == NOP ||
                            c.Iop == (ESCAPE | ESClinnum) ||
                            c.Iflags & classinit;
        if (!hanger)
            break;
    }
    *link = null;
    return c;
}
2819 
2820 
/******************************
 * Schedule Pentium instructions,
 * based on Steve Russell's algorithm.
 *
 * Instructions that are NOPs, escapes other than ESCadjfpu, or flagged
 * CFclassinit (and that are not flagged CFtarg/CFtarg2) are copied straight
 * to the output. Everything else is fed to a Schedule table one instruction
 * at a time until the table refuses one, at which point the table is
 * reassembled onto the output and a fresh table is started.
 * Params:
 *      c = list of instructions to schedule
 *      scratch = not referenced by this function
 * Returns:
 *      head of the rebuilt instruction list
 */

@trusted
private code *schedule(code *c,regm_t scratch)
{
    code *head = null;
    code **tail = &head;                // where the next output instruction gets linked
    Schedule sch = void;

    sch.initialize(0);                  // initialize scheduling table
    while (c)
    {
        const bool passthrough =
            (c.Iop == NOP ||
             ((c.Iop & ESCAPEmask) == ESCAPE && c.Iop != (ESCAPE | ESCadjfpu)) ||
             c.Iflags & CFclassinit) &&
            !(c.Iflags & (CFtarg | CFtarg2));

        if (passthrough)
        {
            // Unlink this instruction, append it to the output, and move on
            *tail = c;
            code *following = code_next(c);
            c.next = null;
            tail = &c.next;
            c = following;
            continue;
        }

        //printf("init\n");
        sch.initialize(sch.fpustackused);       // initialize scheduling table

        // Keep staging until the list runs out or the table refuses an instruction
        while (c && sch.stage(c))
        {
            //printf("insert %p\n",c);
            c = csnip(c);
        }

        //printf("assem %d\n",sch.tblmax);
        tail = sch.assemble(tail);      // reassemble instruction stream
    }
    sch.dtor();

    return head;
}
2869 
2870 /**************************************************************************/
2871 
/********************************************
 * Replace any occurrence of r1 in EA with r2.
 *
 * Only memory operands using 32 bit addressing are rewritten; a
 * register-direct operand (mod == 3) and the disp32-only form are left alone.
 * Params:
 *      c = instruction whose modregrm/SIB bytes are edited in place
 *      r1 = register number to look for
 *      r2 = register number to substitute
 */

private void repEA(code *c,uint r1,uint r2)
{
    const uint modrm = c.Irm;
    const uint mod = modrm & 0xC0;
    const uint regbits = modrm & modregrm(0,7,0);
    const uint rm = modrm & 7;

    if (mod == 0xC0 && rm == r1)
        return;                         // register operand: intentionally not rewritten here
                                        //c.Irm = mod | reg | r2;
    if (!is32bitaddr(I32,c.Iflags))
        return;
    if ((modrm & modregrm(3,0,7)) == modregrm(0,0,5))
        return;                         // disp32, no register to replace

    if (rm == 4)
    {   // SIB byte addressing
        const uint sib = c.Isib;
        uint base = sib & 7;
        uint index = (sib >> 3) & 7;

        // With mod == 0, a base field of 5 does not name a register,
        // so never replace such a base nor create one
        const bool base5mod0 = mod == 0 && (r1 == 5 || r2 == 5);
        if (base == r1 && !base5mod0)
            base = r2;
        if (index == r1)
            index = r2;
        c.Isib = cast(ubyte)((sib & 0xC0) | (index << 3) | base);
        return;
    }

    if (rm != r1)
        return;

    if (r1 == BP && r2 == SP)
    {   // Replace [EBP] with [ESP], which requires a SIB byte
        c.Irm = cast(ubyte)(mod | regbits | 4);
        c.Isib = modregrm(0,4,SP);
    }
    else if (r2 == BP && mod == 0)
    {   // Switch to the mod == 1 form with a constant 0 displacement
        c.Irm = cast(ubyte)(modregrm(1,0,0) | regbits | r2);
        c.IFL1 = FLconst;
        c.IEV1.Vint = 0;
    }
    else
        c.Irm = cast(ubyte)(mod | regbits | r2);
}
2928 
2929 /******************************************
2930  * Instruction scheduler.
2931  * Input:
2932  *      c               list of instructions to schedule
2933  *      scratch         scratch registers we can use
2934  * Returns:
2935  *      revised list of scheduled instructions
2936  */
2937 
/******************************************
 * Swap c1 and c2.
 * c1 comes before c2.
 * Swap in place to not disturb addresses of jmp targets:
 * the contents of the two nodes are exchanged, while each node keeps
 * its own CFtarg/CFtarg2 flags and its own next pointer.
 * Params:
 *      c1 = the earlier instruction
 *      c2 = the later instruction
 */

private void code_swap(code *c1,code *c2)
{
    //printf("code_swap(%x, %x)\n",c1,c2);

    // Special case of:
    //  PUSH reg1
    //  MOV  reg2,x[ESP]
    // Moving the MOV ahead of the PUSH means its offset from ESP
    // shrinks by one register size.
    const bool pushThenEspLoad =
        (c1.Iop & ~7) == 0x50 &&
        c2.Iop == 0x8B &&
        (c2.Irm & modregrm(3,0,7)) == modregrm(1,0,4) &&
        c2.Isib == modregrm(0,4,SP) &&
        c2.IFL1 == FLconst &&
        (cast(byte)c2.IEV1.Vpointer) >= REGSIZE &&
        (c1.Iop & 7) != ((c2.Irm >> 3) & 7);
    if (pushThenEspLoad)
        c2.IEV1.Vpointer -= REGSIZE;

    // Exchange the full contents of the two nodes
    code saved = *c2;
    *c2 = *c1;
    *c1 = saved;

    // Retain original CFtarg: put each node's own target flags back
    enum targs = CFtarg | CFtarg2;
    c1.Iflags = (c1.Iflags & ~targs) | (c2.Iflags & targs);
    c2.Iflags = (c2.Iflags & ~targs) | (saved.Iflags & targs);

    // Likewise put each node's own link back so list order is unchanged
    c1.next = c2.next;
    c2.next = saved.next;
}
2972 
/******************************************
 * Peephole pass over adjacent instruction pairs (c, c1).
 *
 * Rewrites c1 (and sometimes c) in place when the pair matches one of the
 * patterns below. No instructions are added or removed from the list;
 * eliminated instructions are turned into NOPs.
 * Pairs where c1 is flagged CFtarg/CFtarg2 are left alone.
 * Params:
 *      cstart = first instruction of the list
 *      scratch = not referenced by this function
 * Returns:
 *      cstart
 */
private code *peephole(code *cstart,regm_t scratch)
{
    // Look for cases of:
    //  MOV r1,r2
    //  OP ?,r1
    // we can replace with:
    //  MOV r1,r2
    //  OP ?,r2
    // to improve pairing
    code *c1;
    uint r1,r2;
    uint mod,reg,rm;

    //printf("peephole\n");
    // Each `continue` below advances c to c1
    for (code *c = cstart; c; c = c1)
    {
        ubyte rmn;

        //c.print();
        c1 = cnext(c);
    Ln:         // re-entered from Lnop with the same c and a later c1
        if (!c1)
            break;
        if (c1.Iflags & (CFtarg | CFtarg2))
            continue;

        // Do:
        //      PUSH    reg
        if (I32 && (c.Iop & ~7) == 0x50)
        {
            uint regx = c.Iop & 7;

            //  MOV     regx,[ESP]      =>      NOP
            if (c1.Iop == 0x8B &&
                c1.Irm == modregrm(0,regx,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Iop = NOP;
                continue;
            }

            //  PUSH    [ESP]           =>      PUSH    regx
            if (c1.Iop == 0xFF &&
                c1.Irm == modregrm(0,6,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Iop = 0x50 + regx;
                continue;
            }

            //  CMP     [ESP],imm       =>      CMP     regx,imm
            if (c1.Iop == 0x83 &&
                c1.Irm == modregrm(0,7,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Irm = modregrm(3,7,regx);
                if (c1.IFL2 == FLconst && cast(byte)c1.IEV2.Vuns == 0)
                {   // to TEST regx,regx
                    c1.Iop = (c1.Iop & 1) | 0x84;
                    c1.Irm = modregrm(3,regx,regx);
                }
                continue;
            }

        }

        // Do:
        //      MOV     reg,[ESP]       =>      POP     reg
        //      ADD     ESP,4           =>      NOP
        // (only when the flags result of the ADD is not needed, i.e. CFpsw is clear)
        if (I32 && c.Iop == 0x8B && (c.Irm & 0xC7) == modregrm(0,0,4) &&
            c.Isib == modregrm(0,4,SP) &&
            c1.Iop == 0x83 && (c1.Irm & 0xC7) == modregrm(3,0,SP) &&
            !(c1.Iflags & CFpsw) && c1.IFL2 == FLconst && c1.IEV2.Vint == 4)
        {
            uint regx = (c.Irm >> 3) & 7;
            c.Iop = 0x58 + regx;
            c1.Iop = NOP;
            continue;
        }

        // Combine two ADD/SUB reg,imm8 instructions on the same register
        // into one, provided c1 is not marked CFpsw
        if (c.Iop == c1.Iop &&
            c.Iop == 0x83 &&
            (c.Irm & 0xC0) == 0xC0 &&
            (c.Irm & modregrm(3,0,7)) == (c1.Irm & modregrm(3,0,7)) &&
            !(c1.Iflags & CFpsw) &&
            c.IFL2 == FLconst && c1.IFL2 == FLconst
           )
        {   int i = cast(byte)c.IEV2.Vint;
            int i1 = cast(byte)c1.IEV2.Vint;
            // Key is (operation of c) << 3 | (operation of c1)
            switch ((c.Irm & modregrm(0,7,0)) | ((c1.Irm & modregrm(0,7,0)) >> 3))
            {
                case (0 << 3) | 0:              // ADD, ADD
                case (5 << 3) | 5:              // SUB, SUB
                    i += i1;
                    goto Laa;
                case (0 << 3) | 5:              // ADD, SUB
                case (5 << 3) | 0:              // SUB, ADD
                    i -= i1;
                    goto Laa;
                Laa:
                    // c keeps its own operation and receives the combined immediate
                    if (cast(byte)i != i)
                        c.Iop &= ~2;            // no longer fits a signed byte: 0x83 becomes 0x81
                    c.IEV2.Vint = i;
                    c1.Iop = NOP;
                    if (i == 0)
                        c.Iop = NOP;            // the two instructions cancel out
                    continue;

                default:
                    break;
            }
        }

        // From here on only the "MOV r1,r2 followed by a use of r1" case is handled
        if (c.Iop == 0x8B && (c.Irm & 0xC0) == 0xC0)    // MOV r1,r2
        {   r1 = (c.Irm >> 3) & 7;
            r2 = c.Irm & 7;
        }
        else if (c.Iop == 0x89 && (c.Irm & 0xC0) == 0xC0)   // MOV r1,r2
        {   r1 = c.Irm & 7;
            r2 = (c.Irm >> 3) & 7;
        }
        else
        {
            continue;
        }

        // Snapshot of c1's modregrm byte taken before repEA() may edit it
        rmn = c1.Irm;
        mod = rmn & 0xC0;
        reg = rmn & modregrm(0,7,0);
        rm =  rmn & 7;
        if (cod3_EA(c1))
            repEA(c1,r1,r2);
        switch (c1.Iop)
        {
            case 0x50:
            case 0x51:
            case 0x52:
            case 0x53:
            case 0x54:
            case 0x55:
            case 0x56:
            case 0x57:                          // PUSH reg
                if ((c1.Iop & 7) == r1)
                {   c1.Iop = 0x50 | r2;
                    //printf("schedule PUSH reg\n");
                }
                break;

            case 0x81:
            case 0x83:
                // Look for CMP EA,imm
                if (reg == modregrm(0,7,0))
                {
                    if (mod == 0xC0 && rm == r1)
                        c1.Irm = cast(ubyte)(mod | reg | r2);
                }
                break;

            case 0x84:                  // TEST reg,byte ptr EA
                if (r1 >= 4 || r2 >= 4) // if not a byte register
                    break;
                if ((rmn & 0xC0) == 0xC0)
                {
                    if ((rmn & 3) == r1)
                    {   c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,4)) | r2);
                        //printf("schedule 1\n");
                    }
                }
                if ((rmn & modregrm(0,3,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,4,7)) | modregrm(0,r2,0);
                    //printf("schedule 2\n");
                }
                break;
            case 0x85:                  // TEST reg,word ptr EA
                if ((rmn & 0xC0) == 0xC0)
                {
                    if ((rmn & 7) == r1)
                    {   c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,0)) | r2);
                        //printf("schedule 3\n");
                    }
                }
                if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0);
                    //printf("schedule 4\n");
                }
                break;

            case 0x89:                  // MOV EA,reg
                if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0);
                    //printf("schedule 5\n");
                    if (c1.Irm == modregrm(3,r2,r2))
                        goto Lnop;      // became MOV r2,r2
                }
                break;

            case 0x8B:                  // MOV reg,EA
                if ((rmn & 0xC0) == 0xC0 &&
                    (rmn & 7) == r1)            // if EA == r1
                {   c1.Irm = cast(ubyte)((rmn & modregrm(3,7,0)) | r2);
                    //printf("schedule 6\n");
                    if (c1.Irm == modregrm(3,r2,r2))
                        goto Lnop;      // became MOV r2,r2
                }
                break;

            case 0x3C:                  // CMP AL,imm8
                if (r1 == AX && r2 < 4)
                {   // switch to the general CMP EA,imm8 form so r2 can be named
                    c1.Iop = 0x80;
                    c1.Irm = modregrm(3,7,r2);
                    //printf("schedule 7, r2 = %d\n", r2);
                }
                break;

            case 0x3D:                  // CMP AX,imm16
                if (r1 == AX)
                {   // switch to the general CMP EA,imm form so r2 can be named
                    c1.Iop = 0x81;
                    c1.Irm = modregrm(3,7,r2);
                    // use opcode 0x83 when the constant fits in a signed byte
                    if (c1.IFL2 == FLconst &&
                        c1.IEV2.Vuns == cast(byte)c1.IEV2.Vuns)
                        c1.Iop = 0x83;
                    //printf("schedule 8\n");
                }
                break;

            default:
                break;
        }
        continue;
Lnop:
        // c1 turned into a self-move: kill it and retry the same c
        // against the instruction after it
        c1.Iop = NOP;
        c1 = cnext(c1);
        goto Ln;
    }
    return cstart;
}
3207 
3208 /*****************************************************************/
3209 
/**********************************************
 * Replace complex instructions with simple ones more conducive
 * to scheduling.
 *
 * CMP mem,imm (opcode 0x83) and PUSH mem (opcode 0xFF /6) are each split
 * into a load of the memory operand into a scratch register followed by
 * the register form of the operation. Instructions flagged CFtarg, CFtarg2
 * or CFopsize are not touched. Nothing is done unless at least two scratch
 * registers remain after removing those in fregsaved.
 * Params:
 *      c = list of instructions
 *      scratch = mask of scratch registers we can use
 * Returns:
 *      head of the (possibly lengthened) instruction list
 */

@trusted
code *simpleops(code *c,regm_t scratch)
{
    // Worry about using registers not saved yet by prolog
    scratch &= ~fregsaved;

    if (!(scratch & (scratch - 1)))     // if 0 or 1 registers
        return c;

    uint reg = findreg(scratch);

    code *cstart = c;
    for (code** pc = &cstart; *pc; pc = &(*pc).next)
    {
        c = *pc;
        if (c.Iflags & (CFtarg | CFtarg2 | CFopsize))
            continue;

        const regField = c.Irm & modregrm(0,7,0);
        const bool memOperand = (c.Irm & modregrm(3,0,0)) != modregrm(3,0,0);

        code *c2;       // instruction to be inserted right after c
        if (c.Iop == 0x83 && regField == modregrm(0,7,0) && memOperand)
        {   // Replace CMP mem,imm with:
            //  MOV reg,mem
            //  CMP reg,imm

            //printf("replacing CMP\n");
            c.Iop = 0x8B;
            c.Irm = (c.Irm & modregrm(3,0,7)) | modregrm(0,reg,0);

            c2 = code_calloc();
            if (reg == AX)
                c2.Iop = 0x3D;
            else
            {   c2.Iop = 0x83;
                c2.Irm = modregrm(3,7,reg);
            }
            c2.IFL2 = c.IFL2;
            c2.IEV2 = c.IEV2;

            // See if c2 should be replaced by a TEST
            targ_long imm = c2.IEV2.Vuns;
            if (!(c2.Iop & 1))
                imm &= 0xFF;
            else if (I32 ? c.Iflags & CFopsize : !(c.Iflags & CFopsize))
                imm = cast(short) imm;
            if (imm == 0)
            {
                c2.Iop = 0x85;                 // TEST reg,reg
                c2.Irm = modregrm(3,reg,reg);
            }
        }
        else if (c.Iop == 0xFF && regField == modregrm(0,6,0) && memOperand)
        {   // Replace PUSH mem with:
            //  MOV reg,mem
            //  PUSH reg

            // printf("replacing PUSH\n");
            c.Iop = 0x8B;
            c.Irm = (c.Irm & modregrm(3,0,7)) | modregrm(0,reg,0);

            c2 = gen1(null,0x50 + reg);
        }
        else
            continue;                   // nothing to simplify here

        //c.print();
        //c2.print();

        // Splice c2 in after c; the loop then steps onto c2 next
        c2.next = c.next;
        c.next = c2;

        // Switch to another reg
        const regm_t others = scratch & ~mask(reg);
        if (others)
            reg = findreg(others);
    }
    return cstart;
}
3297 
3298 }