1 /**
 * The core.internal.atomic module contains the low-level atomic features available in hardware.
 * Depending on the compiler in use, this module may simply be a routing layer for compiler intrinsics.
4 *
5 * Copyright: Copyright Manu Evans 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 * Authors:   Sean Kelly, Alex Rønne Petersen, Manu Evans
8 * Source:    $(DRUNTIMESRC core/internal/_atomic.d)
9 */
10 
11 module core.internal.atomic;
12 
13 import core.atomic : MemoryOrder, has128BitCAS;
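
/+
Informal usage sketch (not part of the documented API): user code normally goes
through `core.atomic`, which forwards to the functions defined below. For example:

    int x = 0;
    atomicStore(&x, 1);                     // seq-cst store
    int a = atomicLoad(&x);                 // seq-cst load; a == 1
    int b = atomicFetchAdd(&x, 41);         // b == 1, x == 42
    int expected = 42;
    bool ok = atomicCompareExchangeStrong(&x, &expected, 100); // ok == true, x == 100
+/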
14 
15 version (DigitalMars)
16 {
17     private
18     {
19         enum : int
20         {
21             AX, BX, CX, DX, DI, SI, R8, R9
22         }
23 
24         immutable string[4][8] registerNames = [
25             [ "AL", "AX", "EAX", "RAX" ],
26             [ "BL", "BX", "EBX", "RBX" ],
27             [ "CL", "CX", "ECX", "RCX" ],
28             [ "DL", "DX", "EDX", "RDX" ],
29             [ "DIL", "DI", "EDI", "RDI" ],
30             [ "SIL", "SI", "ESI", "RSI" ],
31             [ "R8B", "R8W", "R8D", "R8" ],
32             [ "R9B", "R9W", "R9D", "R9" ],
33         ];
34 
35         template RegIndex(T)
36         {
37             static if (T.sizeof == 1)
38                 enum RegIndex = 0;
39             else static if (T.sizeof == 2)
40                 enum RegIndex = 1;
41             else static if (T.sizeof == 4)
42                 enum RegIndex = 2;
43             else static if (T.sizeof == 8)
44                 enum RegIndex = 3;
45             else
46                 static assert(false, "Invalid type");
47         }
48 
49         enum SizedReg(int reg, T = size_t) = registerNames[reg][RegIndex!T];
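        // e.g. SizedReg!(AX, ubyte) is "AL", SizedReg!(AX, int) is "EAX", and
        // SizedReg!AX is "EAX" or "RAX" depending on the size of size_t.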
50     }
51 
52     inout(T) atomicLoad(MemoryOrder order = MemoryOrder.seq, T)(inout(T)* src) pure nothrow @nogc @trusted
53         if (CanCAS!T)
54     {
55         static assert(order != MemoryOrder.rel, "invalid MemoryOrder for atomicLoad()");
56 
57         static if (T.sizeof == size_t.sizeof * 2)
58         {
59             version (D_InlineAsm_X86)
60             {
61                 asm pure nothrow @nogc @trusted
62                 {
63                     push EDI;
64                     push EBX;
65                     mov EBX, 0;
66                     mov ECX, 0;
67                     mov EAX, 0;
68                     mov EDX, 0;
69                     mov EDI, src;
70                     lock; cmpxchg8b [EDI];
71                     pop EBX;
72                     pop EDI;
73                 }
74             }
75             else version (D_InlineAsm_X86_64)
76             {
77                 version (Windows)
78                 {
79                     static if (RegisterReturn!T)
80                     {
81                         enum SrcPtr = SizedReg!CX;
82                         enum RetPtr = null;
83                     }
84                     else
85                     {
86                         enum SrcPtr = SizedReg!DX;
87                         enum RetPtr = SizedReg!CX;
88                     }
89 
90                     mixin (simpleFormat(q{
91                         asm pure nothrow @nogc @trusted
92                         {
93                             naked;
94                             push RBX;
95                             mov R8, %0;
96     ?1                        mov R9, %1;
97                             mov RBX, 0;
98                             mov RCX, 0;
99                             mov RAX, 0;
100                             mov RDX, 0;
101                             lock; cmpxchg16b [R8];
102     ?1                        mov [R9], RAX;
103     ?1                        mov 8[R9], RDX;
104                             pop RBX;
105                             ret;
106                         }
107                     }, [SrcPtr, RetPtr]));
108                 }
109                 else
110                 {
111                     asm pure nothrow @nogc @trusted
112                     {
113                         naked;
114                         push RBX;
115                         mov RBX, 0;
116                         mov RCX, 0;
117                         mov RAX, 0;
118                         mov RDX, 0;
119                         lock; cmpxchg16b [RDI];
120                         pop RBX;
121                         ret;
122                     }
123                 }
124             }
125         }
126         else static if (needsLoadBarrier!order)
127         {
128             version (D_InlineAsm_X86)
129             {
130                 enum SrcReg = SizedReg!CX;
131                 enum ZeroReg = SizedReg!(DX, T);
132                 enum ResReg = SizedReg!(AX, T);
133 
134                 mixin (simpleFormat(q{
135                     asm pure nothrow @nogc @trusted
136                     {
137                         mov %1, 0;
138                         mov %2, 0;
139                         mov %0, src;
140                         lock; cmpxchg [%0], %1;
141                     }
142                 }, [SrcReg, ZeroReg, ResReg]));
143             }
144             else version (D_InlineAsm_X86_64)
145             {
146                 version (Windows)
147                     enum SrcReg = SizedReg!CX;
148                 else
149                     enum SrcReg = SizedReg!DI;
150                 enum ZeroReg = SizedReg!(DX, T);
151                 enum ResReg = SizedReg!(AX, T);
152 
153                 mixin (simpleFormat(q{
154                     asm pure nothrow @nogc @trusted
155                     {
156                         naked;
157                         mov %1, 0;
158                         mov %2, 0;
159                         lock; cmpxchg [%0], %1;
160                         ret;
161                     }
162                 }, [SrcReg, ZeroReg, ResReg]));
163             }
164         }
165         else
166             return *src;
167     }
168 
169     void atomicStore(MemoryOrder order = MemoryOrder.seq, T)(T* dest, T value) pure nothrow @nogc @trusted
170         if (CanCAS!T)
171     {
172         static assert(order != MemoryOrder.acq, "Invalid MemoryOrder for atomicStore()");
173 
174         static if (T.sizeof == size_t.sizeof * 2)
175         {
176             version (D_InlineAsm_X86)
177             {
178                 asm pure nothrow @nogc @trusted
179                 {
180                     push EDI;
181                     push EBX;
182                     lea EDI, value;
183                     mov EBX, [EDI];
184                     mov ECX, 4[EDI];
185                     mov EDI, dest;
186                     mov EAX, [EDI];
187                     mov EDX, 4[EDI];
188                 L1: lock; cmpxchg8b [EDI];
189                     jne L1;
190                     pop EBX;
191                     pop EDI;
192                 }
193             }
194             else version (D_InlineAsm_X86_64)
195             {
196                 version (Windows)
197                 {
198                     asm pure nothrow @nogc @trusted
199                     {
200                         naked;
201                         push RBX;
202                         mov R8, RDX;
203                         mov RAX, [RDX];
204                         mov RDX, 8[RDX];
205                         mov RBX, [RCX];
206                         mov RCX, 8[RCX];
207                     L1: lock; cmpxchg16b [R8];
208                         jne L1;
209                         pop RBX;
210                         ret;
211                     }
212                 }
213                 else
214                 {
215                     asm pure nothrow @nogc @trusted
216                     {
217                         naked;
218                         push RBX;
219                         mov RBX, RDI;
220                         mov RCX, RSI;
221                         mov RDI, RDX;
222                         mov RAX, [RDX];
223                         mov RDX, 8[RDX];
224                     L1: lock; cmpxchg16b [RDI];
225                         jne L1;
226                         pop RBX;
227                         ret;
228                     }
229                 }
230             }
231         }
232         else static if (needsStoreBarrier!order)
233             atomicExchange!(order, false)(dest, value);
234         else
235             *dest = value;
236     }
237 
238     T atomicFetchAdd(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
239         if (is(T : ulong))
240     {
241         version (D_InlineAsm_X86)
242         {
            static assert(T.sizeof <= 4, "64-bit atomicFetchAdd not supported on 32-bit target.");
244 
245             enum DestReg = SizedReg!DX;
246             enum ValReg = SizedReg!(AX, T);
247 
248             mixin (simpleFormat(q{
249                 asm pure nothrow @nogc @trusted
250                 {
251                     mov %1, value;
252                     mov %0, dest;
253                     lock; xadd[%0], %1;
254                 }
255             }, [DestReg, ValReg]));
256         }
257         else version (D_InlineAsm_X86_64)
258         {
259             version (Windows)
260             {
261                 enum DestReg = SizedReg!DX;
262                 enum ValReg = SizedReg!(CX, T);
263             }
264             else
265             {
266                 enum DestReg = SizedReg!SI;
267                 enum ValReg = SizedReg!(DI, T);
268             }
269             enum ResReg = result ? SizedReg!(AX, T) : null;
270 
271             mixin (simpleFormat(q{
272                 asm pure nothrow @nogc @trusted
273                 {
274                     naked;
275                     lock; xadd[%0], %1;
276     ?2                mov %2, %1;
277                     ret;
278                 }
279             }, [DestReg, ValReg, ResReg]));
280         }
281         else
282             static assert (false, "Unsupported architecture.");
283     }
284 
285     T atomicFetchSub(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
286         if (is(T : ulong))
287     {
        return atomicFetchAdd!(order, result)(dest, cast(T)-cast(IntOrLong!T)value);
289     }
290 
291     T atomicExchange(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
        if (CanCAS!T)
293     {
294         version (D_InlineAsm_X86)
295         {
            static assert(T.sizeof <= 4, "64-bit atomicExchange not supported on 32-bit target.");
297 
298             enum DestReg = SizedReg!CX;
299             enum ValReg = SizedReg!(AX, T);
300 
301             mixin (simpleFormat(q{
302                 asm pure nothrow @nogc @trusted
303                 {
304                     mov %1, value;
305                     mov %0, dest;
306                     xchg [%0], %1;
307                 }
308             }, [DestReg, ValReg]));
309         }
310         else version (D_InlineAsm_X86_64)
311         {
312             version (Windows)
313             {
314                 enum DestReg = SizedReg!DX;
315                 enum ValReg = SizedReg!(CX, T);
316             }
317             else
318             {
319                 enum DestReg = SizedReg!SI;
320                 enum ValReg = SizedReg!(DI, T);
321             }
322             enum ResReg = result ? SizedReg!(AX, T) : null;
323 
324             mixin (simpleFormat(q{
325                 asm pure nothrow @nogc @trusted
326                 {
327                     naked;
328                     xchg [%0], %1;
329     ?2                mov %2, %1;
330                     ret;
331                 }
332             }, [DestReg, ValReg, ResReg]));
333         }
334         else
335             static assert (false, "Unsupported architecture.");
336     }
337 
338     alias atomicCompareExchangeWeak = atomicCompareExchangeStrong;
339 
340     bool atomicCompareExchangeStrong(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
341         if (CanCAS!T)
342     {
343         version (D_InlineAsm_X86)
344         {
345             static if (T.sizeof <= 4)
346             {
347                 enum DestAddr = SizedReg!CX;
348                 enum CmpAddr = SizedReg!DI;
349                 enum Val = SizedReg!(DX, T);
350                 enum Cmp = SizedReg!(AX, T);
351 
352                 mixin (simpleFormat(q{
353                     asm pure nothrow @nogc @trusted
354                     {
355                         push %1;
356                         mov %2, value;
357                         mov %1, compare;
358                         mov %3, [%1];
359                         mov %0, dest;
360                         lock; cmpxchg [%0], %2;
361                         mov [%1], %3;
362                         setz AL;
363                         pop %1;
364                     }
365                 }, [DestAddr, CmpAddr, Val, Cmp]));
366             }
367             else static if (T.sizeof == 8)
368             {
369                 asm pure nothrow @nogc @trusted
370                 {
371                     push EDI;
372                     push EBX;
373                     lea EDI, value;
374                     mov EBX, [EDI];
375                     mov ECX, 4[EDI];
376                     mov EDI, compare;
377                     mov EAX, [EDI];
378                     mov EDX, 4[EDI];
379                     mov EDI, dest;
380                     lock; cmpxchg8b [EDI];
381                     mov EDI, compare;
382                     mov [EDI], EAX;
383                     mov 4[EDI], EDX;
384                     setz AL;
385                     pop EBX;
386                     pop EDI;
387                 }
388             }
389             else
                static assert(T.sizeof <= 8, "128-bit atomicCompareExchangeStrong not supported on 32-bit target.");
391         }
392         else version (D_InlineAsm_X86_64)
393         {
394             static if (T.sizeof <= 8)
395             {
396                 version (Windows)
397                 {
398                     enum DestAddr = SizedReg!R8;
399                     enum CmpAddr = SizedReg!DX;
400                     enum Val = SizedReg!(CX, T);
401                 }
402                 else
403                 {
404                     enum DestAddr = SizedReg!DX;
405                     enum CmpAddr = SizedReg!SI;
406                     enum Val = SizedReg!(DI, T);
407                 }
408                 enum Res = SizedReg!(AX, T);
409 
410                 mixin (simpleFormat(q{
411                     asm pure nothrow @nogc @trusted
412                     {
413                         naked;
414                         mov %3, [%1];
415                         lock; cmpxchg [%0], %2;
416                         jne compare_fail;
417                         mov AL, 1;
418                         ret;
419                     compare_fail:
420                         mov [%1], %3;
421                         xor AL, AL;
422                         ret;
423                     }
424                 }, [DestAddr, CmpAddr, Val, Res]));
425             }
426             else
427             {
428                 version (Windows)
429                 {
430                     asm pure nothrow @nogc @trusted
431                     {
432                         naked;
433                         push RBX;
434                         mov R9, RDX;
435                         mov RAX, [RDX];
436                         mov RDX, 8[RDX];
437                         mov RBX, [RCX];
438                         mov RCX, 8[RCX];
439                         lock; cmpxchg16b [R8];
440                         pop RBX;
441                         jne compare_fail;
442                         mov AL, 1;
443                         ret;
444                     compare_fail:
445                         mov [R9], RAX;
446                         mov 8[R9], RDX;
447                         xor AL, AL;
448                         ret;
449                     }
450                 }
451                 else
452                 {
453                     asm pure nothrow @nogc @trusted
454                     {
455                         naked;
456                         push RBX;
457                         mov R8, RCX;
458                         mov R9, RDX;
459                         mov RAX, [RDX];
460                         mov RDX, 8[RDX];
461                         mov RBX, RDI;
462                         mov RCX, RSI;
463                         lock; cmpxchg16b [R8];
464                         pop RBX;
465                         jne compare_fail;
466                         mov AL, 1;
467                         ret;
468                     compare_fail:
469                         mov [R9], RAX;
470                         mov 8[R9], RDX;
471                         xor AL, AL;
472                         ret;
473                     }
474                 }
475             }
476         }
477         else
478             static assert (false, "Unsupported architecture.");
479     }
480 
481     alias atomicCompareExchangeWeakNoResult = atomicCompareExchangeStrongNoResult;
482 
483     bool atomicCompareExchangeStrongNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, const T compare, T value) pure nothrow @nogc @trusted
484         if (CanCAS!T)
485     {
486         version (D_InlineAsm_X86)
487         {
488             static if (T.sizeof <= 4)
489             {
490                 enum DestAddr = SizedReg!CX;
491                 enum Cmp = SizedReg!(AX, T);
492                 enum Val = SizedReg!(DX, T);
493 
494                 mixin (simpleFormat(q{
495                     asm pure nothrow @nogc @trusted
496                     {
497                         mov %2, value;
498                         mov %1, compare;
499                         mov %0, dest;
500                         lock; cmpxchg [%0], %2;
501                         setz AL;
502                     }
503                 }, [DestAddr, Cmp, Val]));
504             }
505             else static if (T.sizeof == 8)
506             {
507                 asm pure nothrow @nogc @trusted
508                 {
509                     push EDI;
510                     push EBX;
511                     lea EDI, value;
512                     mov EBX, [EDI];
513                     mov ECX, 4[EDI];
514                     lea EDI, compare;
515                     mov EAX, [EDI];
516                     mov EDX, 4[EDI];
517                     mov EDI, dest;
518                     lock; cmpxchg8b [EDI];
519                     setz AL;
520                     pop EBX;
521                     pop EDI;
522                 }
523             }
524             else
                static assert(T.sizeof <= 8, "128-bit atomicCompareExchangeStrongNoResult not supported on 32-bit target.");
526         }
527         else version (D_InlineAsm_X86_64)
528         {
529             static if (T.sizeof <= 8)
530             {
531                 version (Windows)
532                 {
533                     enum DestAddr = SizedReg!R8;
534                     enum Cmp = SizedReg!(DX, T);
535                     enum Val = SizedReg!(CX, T);
536                 }
537                 else
538                 {
539                     enum DestAddr = SizedReg!DX;
540                     enum Cmp = SizedReg!(SI, T);
541                     enum Val = SizedReg!(DI, T);
542                 }
543                 enum AXReg = SizedReg!(AX, T);
544 
545                 mixin (simpleFormat(q{
546                     asm pure nothrow @nogc @trusted
547                     {
548                         naked;
549                         mov %3, %1;
550                         lock; cmpxchg [%0], %2;
551                         setz AL;
552                         ret;
553                     }
554                 }, [DestAddr, Cmp, Val, AXReg]));
555             }
556             else
557             {
558                 version (Windows)
559                 {
560                     asm pure nothrow @nogc @trusted
561                     {
562                         naked;
563                         push RBX;
564                         mov RAX, [RDX];
565                         mov RDX, 8[RDX];
566                         mov RBX, [RCX];
567                         mov RCX, 8[RCX];
568                         lock; cmpxchg16b [R8];
569                         setz AL;
570                         pop RBX;
571                         ret;
572                     }
573                 }
574                 else
575                 {
576                     asm pure nothrow @nogc @trusted
577                     {
578                         naked;
579                         push RBX;
580                         mov RAX, RDX;
581                         mov RDX, RCX;
582                         mov RBX, RDI;
583                         mov RCX, RSI;
584                         lock; cmpxchg16b [R8];
585                         setz AL;
586                         pop RBX;
587                         ret;
588                     }
589                 }
590             }
591         }
592         else
593             static assert (false, "Unsupported architecture.");
594     }
595 
596     void atomicFence(MemoryOrder order = MemoryOrder.seq)() pure nothrow @nogc @trusted
597     {
        // TODO: `mfence` is only strictly required for seq_cst operations, but relaxing this
        //       depends on the compiler backend not reordering code across the fence,
        //       so we apply it conservatively to all non-raw orders.
601         static if (order != MemoryOrder.raw)
602         {
603             version (D_InlineAsm_X86)
604             {
605                 import core.cpuid;
606 
607                 // TODO: review this implementation; it seems way overly complicated
608                 asm pure nothrow @nogc @trusted
609                 {
610                     naked;
611 
612                     call sse2;
613                     test AL, AL;
                    je Lcpuid;
615 
616                     // Fast path: We have SSE2, so just use mfence.
617                     mfence;
618                     jmp Lend;
619 
620                 Lcpuid:
621 
622                     // Slow path: We use cpuid to serialize. This is
623                     // significantly slower than mfence, but is the
624                     // only serialization facility we have available
625                     // on older non-SSE2 chips.
626                     push EBX;
627 
628                     mov EAX, 0;
629                     cpuid;
630 
631                     pop EBX;
632 
633                 Lend:
634 
635                     ret;
636                 }
637             }
638             else version (D_InlineAsm_X86_64)
639             {
640                 asm pure nothrow @nogc @trusted
641                 {
642                     naked;
643                     mfence;
644                     ret;
645                 }
646             }
647             else
648                 static assert (false, "Unsupported architecture.");
649         }
650     }
651 
652     void pause() pure nothrow @nogc @trusted
653     {
654         version (D_InlineAsm_X86)
655         {
656             asm pure nothrow @nogc @trusted
657             {
658                 naked;
659                 rep; nop;
660                 ret;
661             }
662         }
663         else version (D_InlineAsm_X86_64)
664         {
665             asm pure nothrow @nogc @trusted
666             {
667                 naked;
668     //            pause; // TODO: DMD should add this opcode to its inline asm
669                 rep; nop;
670                 ret;
671             }
672         }
673         else
674         {
675             // ARM should `yield`
676             // other architectures? otherwise some sort of nop...
677         }
678     }
679 }
680 else version (GNU)
681 {
682     import gcc.builtins;
683     import gcc.config;
684 
685     inout(T) atomicLoad(MemoryOrder order = MemoryOrder.seq, T)(inout(T)* src) pure nothrow @nogc @trusted
686         if (CanCAS!T)
687     {
688         static assert(order != MemoryOrder.rel, "invalid MemoryOrder for atomicLoad()");
689 
690         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
691         {
692             static if (T.sizeof == ubyte.sizeof)
693             {
694                 ubyte value = __atomic_load_1(cast(shared)src, order);
695                 return *cast(typeof(return)*)&value;
696             }
697             else static if (T.sizeof == ushort.sizeof)
698             {
699                 ushort value = __atomic_load_2(cast(shared)src, order);
700                 return *cast(typeof(return)*)&value;
701             }
702             else static if (T.sizeof == uint.sizeof)
703             {
704                 uint value = __atomic_load_4(cast(shared)src, order);
705                 return *cast(typeof(return)*)&value;
706             }
707             else static if (T.sizeof == ulong.sizeof && GNU_Have_64Bit_Atomics)
708             {
709                 ulong value = __atomic_load_8(cast(shared)src, order);
710                 return *cast(typeof(return)*)&value;
711             }
712             else static if (GNU_Have_LibAtomic)
713             {
714                 T value;
715                 __atomic_load(T.sizeof, cast(shared)src, &value, order);
716                 return *cast(typeof(return)*)&value;
717             }
718             else
719                 static assert(0, "Invalid template type specified.");
720         }
721         else
722         {
723             getAtomicMutex.lock();
724             scope(exit) getAtomicMutex.unlock();
            return *cast(typeof(return)*)src;
726         }
727     }
728 
729     void atomicStore(MemoryOrder order = MemoryOrder.seq, T)(T* dest, T value) pure nothrow @nogc @trusted
730         if (CanCAS!T)
731     {
732         static assert(order != MemoryOrder.acq, "Invalid MemoryOrder for atomicStore()");
733 
734         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
735         {
736             static if (T.sizeof == ubyte.sizeof)
737                 __atomic_store_1(cast(shared)dest, *cast(ubyte*)&value, order);
738             else static if (T.sizeof == ushort.sizeof)
739                 __atomic_store_2(cast(shared)dest, *cast(ushort*)&value, order);
740             else static if (T.sizeof == uint.sizeof)
741                 __atomic_store_4(cast(shared)dest, *cast(uint*)&value, order);
742             else static if (T.sizeof == ulong.sizeof && GNU_Have_64Bit_Atomics)
743                 __atomic_store_8(cast(shared)dest, *cast(ulong*)&value, order);
744             else static if (GNU_Have_LibAtomic)
745                 __atomic_store(T.sizeof, cast(shared)dest, cast(void*)&value, order);
746             else
747                 static assert(0, "Invalid template type specified.");
748         }
749         else
750         {
751             getAtomicMutex.lock();
752             *dest = value;
753             getAtomicMutex.unlock();
754         }
755     }
756 
757     T atomicFetchAdd(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
758         if (is(T : ulong))
759     {
760         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
761         {
762             static if (T.sizeof == ubyte.sizeof)
763                 return __atomic_fetch_add_1(cast(shared)dest, value, order);
764             else static if (T.sizeof == ushort.sizeof)
765                 return __atomic_fetch_add_2(cast(shared)dest, value, order);
766             else static if (T.sizeof == uint.sizeof)
767                 return __atomic_fetch_add_4(cast(shared)dest, value, order);
768             else static if (T.sizeof == ulong.sizeof && GNU_Have_64Bit_Atomics)
769                 return __atomic_fetch_add_8(cast(shared)dest, value, order);
770             else static if (GNU_Have_LibAtomic)
771                 return __atomic_fetch_add(T.sizeof, cast(shared)dest, cast(void*)&value, order);
772             else
773                 static assert(0, "Invalid template type specified.");
774         }
775         else
776         {
777             getAtomicMutex.lock();
778             scope(exit) getAtomicMutex.unlock();
779             T tmp = *dest;
780             *dest += value;
781             return tmp;
782         }
783     }
784 
785     T atomicFetchSub(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
786         if (is(T : ulong))
787     {
788         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
789         {
790             static if (T.sizeof == ubyte.sizeof)
791                 return __atomic_fetch_sub_1(cast(shared)dest, value, order);
792             else static if (T.sizeof == ushort.sizeof)
793                 return __atomic_fetch_sub_2(cast(shared)dest, value, order);
794             else static if (T.sizeof == uint.sizeof)
795                 return __atomic_fetch_sub_4(cast(shared)dest, value, order);
796             else static if (T.sizeof == ulong.sizeof && GNU_Have_64Bit_Atomics)
797                 return __atomic_fetch_sub_8(cast(shared)dest, value, order);
798             else static if (GNU_Have_LibAtomic)
799                 return __atomic_fetch_sub(T.sizeof, cast(shared)dest, cast(void*)&value, order);
800             else
801                 static assert(0, "Invalid template type specified.");
802         }
803         else
804         {
805             getAtomicMutex.lock();
806             scope(exit) getAtomicMutex.unlock();
807             T tmp = *dest;
808             *dest -= value;
809             return tmp;
810         }
811     }
812 
813     T atomicExchange(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
814         if (is(T : ulong) || is(T == class) || is(T == interface) || is(T U : U*))
815     {
816         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
817         {
818             static if (T.sizeof == byte.sizeof)
819             {
820                 ubyte res = __atomic_exchange_1(cast(shared)dest, *cast(ubyte*)&value, order);
821                 return *cast(typeof(return)*)&res;
822             }
823             else static if (T.sizeof == short.sizeof)
824             {
825                 ushort res = __atomic_exchange_2(cast(shared)dest, *cast(ushort*)&value, order);
826                 return *cast(typeof(return)*)&res;
827             }
828             else static if (T.sizeof == int.sizeof)
829             {
830                 uint res = __atomic_exchange_4(cast(shared)dest, *cast(uint*)&value, order);
831                 return *cast(typeof(return)*)&res;
832             }
833             else static if (T.sizeof == long.sizeof && GNU_Have_64Bit_Atomics)
834             {
835                 ulong res = __atomic_exchange_8(cast(shared)dest, *cast(ulong*)&value, order);
836                 return *cast(typeof(return)*)&res;
837             }
838             else static if (GNU_Have_LibAtomic)
839             {
840                 T res = void;
841                 __atomic_exchange(T.sizeof, cast(shared)dest, cast(void*)&value, &res, order);
842                 return res;
843             }
844             else
845                 static assert(0, "Invalid template type specified.");
846         }
847         else
848         {
849             getAtomicMutex.lock();
850             scope(exit) getAtomicMutex.unlock();
851 
852             T res = *dest;
853             *dest = value;
854             return res;
855         }
856     }
857 
858     bool atomicCompareExchangeWeak(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
859         if (CanCAS!T)
860     {
861         return atomicCompareExchangeImpl!(succ, fail, true)(dest, compare, value);
862     }
863 
864     bool atomicCompareExchangeStrong(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
865         if (CanCAS!T)
866     {
867         return atomicCompareExchangeImpl!(succ, fail, false)(dest, compare, value);
868     }
869 
870     bool atomicCompareExchangeStrongNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, const T compare, T value) pure nothrow @nogc @trusted
871         if (CanCAS!T)
872     {
873         return atomicCompareExchangeImpl!(succ, fail, false)(dest, cast(T*)&compare, value);
874     }
875 
876     bool atomicCompareExchangeWeakNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, const T compare, T value) pure nothrow @nogc @trusted
877         if (CanCAS!T)
878     {
879         return atomicCompareExchangeImpl!(succ, fail, true)(dest, cast(T*)&compare, value);
880     }
881 
882     private bool atomicCompareExchangeImpl(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, bool weak, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
883         if (CanCAS!T)
884     {
885         bool res = void;
886 
887         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
888         {
889             static if (T.sizeof == byte.sizeof)
890                 res = __atomic_compare_exchange_1(cast(shared)dest, compare, *cast(ubyte*)&value,
891                                                   weak, succ, fail);
892             else static if (T.sizeof == short.sizeof)
893                 res = __atomic_compare_exchange_2(cast(shared)dest, compare, *cast(ushort*)&value,
894                                                   weak, succ, fail);
895             else static if (T.sizeof == int.sizeof)
896                 res = __atomic_compare_exchange_4(cast(shared)dest, compare, *cast(uint*)&value,
897                                                   weak, succ, fail);
898             else static if (T.sizeof == long.sizeof && GNU_Have_64Bit_Atomics)
899                 res = __atomic_compare_exchange_8(cast(shared)dest, compare, *cast(ulong*)&value,
900                                                   weak, succ, fail);
901             else static if (GNU_Have_LibAtomic)
902                 res = __atomic_compare_exchange(T.sizeof, cast(shared)dest, compare, cast(void*)&value,
903                                                 succ, fail);
904             else
905                 static assert(0, "Invalid template type specified.");
906         }
907         else
908         {
909             static if (T.sizeof == byte.sizeof)
910                 alias U = byte;
911             else static if (T.sizeof == short.sizeof)
912                 alias U = short;
913             else static if (T.sizeof == int.sizeof)
914                 alias U = int;
915             else static if (T.sizeof == long.sizeof)
916                 alias U = long;
917             else
918                 static assert(0, "Invalid template type specified.");
919 
920             getAtomicMutex.lock();
921             scope(exit) getAtomicMutex.unlock();
922 
            if (*cast(U*)dest == *cast(U*)compare)
924             {
925                 *dest = value;
926                 res = true;
927             }
928             else
929             {
930                 *compare = *dest;
931                 res = false;
932             }
933         }
934 
935         return res;
936     }
937 
938     void atomicFence(MemoryOrder order = MemoryOrder.seq)() pure nothrow @nogc @trusted
939     {
940         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
941             __atomic_thread_fence(order);
942         else
943         {
944             getAtomicMutex.lock();
945             getAtomicMutex.unlock();
946         }
947     }
948 
949     void pause() pure nothrow @nogc @trusted
950     {
951         version (X86)
952         {
953             __builtin_ia32_pause();
954         }
955         else version (X86_64)
956         {
957             __builtin_ia32_pause();
958         }
959         else
960         {
961             // Other architectures? Some sort of nop or barrier.
962         }
963     }
964 
965     static if (!GNU_Have_Atomics && !GNU_Have_LibAtomic)
966     {
967         // Use system mutex for atomics, faking the purity of the functions so
968         // that they can be used in pure/nothrow/@safe code.
969         extern (C) private pure @trusted @nogc nothrow
970         {
971             static if (GNU_Thread_Model == ThreadModel.Posix)
972             {
973                 import core.sys.posix.pthread;
974                 alias atomicMutexHandle = pthread_mutex_t;
975 
976                 pragma(mangle, "pthread_mutex_init") int fakePureMutexInit(pthread_mutex_t*, pthread_mutexattr_t*);
977                 pragma(mangle, "pthread_mutex_lock") int fakePureMutexLock(pthread_mutex_t*);
978                 pragma(mangle, "pthread_mutex_unlock") int fakePureMutexUnlock(pthread_mutex_t*);
979             }
980             else static if (GNU_Thread_Model == ThreadModel.Win32)
981             {
982                 import core.sys.windows.winbase;
983                 alias atomicMutexHandle = CRITICAL_SECTION;
984 
985                 pragma(mangle, "InitializeCriticalSection") int fakePureMutexInit(CRITICAL_SECTION*);
986                 pragma(mangle, "EnterCriticalSection") void fakePureMutexLock(CRITICAL_SECTION*);
987                 pragma(mangle, "LeaveCriticalSection") int fakePureMutexUnlock(CRITICAL_SECTION*);
988             }
989             else
990             {
991                 alias atomicMutexHandle = int;
992             }
993         }
994 
995         // Implements lock/unlock operations.
996         private struct AtomicMutex
997         {
998             int lock() pure @trusted @nogc nothrow
999             {
1000                 static if (GNU_Thread_Model == ThreadModel.Posix)
1001                 {
1002                     if (!_inited)
1003                     {
1004                         fakePureMutexInit(&_handle, null);
1005                         _inited = true;
1006                     }
1007                     return fakePureMutexLock(&_handle);
1008                 }
1009                 else
1010                 {
1011                     static if (GNU_Thread_Model == ThreadModel.Win32)
1012                     {
1013                         if (!_inited)
1014                         {
1015                             fakePureMutexInit(&_handle);
1016                             _inited = true;
1017                         }
1018                         fakePureMutexLock(&_handle);
1019                     }
1020                     return 0;
1021                 }
1022             }
1023 
1024             int unlock() pure @trusted @nogc nothrow
1025             {
1026                 static if (GNU_Thread_Model == ThreadModel.Posix)
1027                     return fakePureMutexUnlock(&_handle);
1028                 else
1029                 {
1030                     static if (GNU_Thread_Model == ThreadModel.Win32)
1031                         fakePureMutexUnlock(&_handle);
1032                     return 0;
1033                 }
1034             }
1035 
1036         private:
1037             atomicMutexHandle _handle;
1038             bool _inited;
1039         }
1040 
1041         // Internal static mutex reference.
1042         private AtomicMutex* _getAtomicMutex() @trusted @nogc nothrow
1043         {
1044             __gshared static AtomicMutex mutex;
1045             return &mutex;
1046         }
1047 
1048         // Pure alias for _getAtomicMutex.
1049         pragma(mangle, _getAtomicMutex.mangleof)
1050         private AtomicMutex* getAtomicMutex() pure @trusted @nogc nothrow @property;
1051     }
1052 }
1053 
1054 private:
1055 
1056 version (Windows)
1057 {
1058     enum RegisterReturn(T) = is(T : U[], U) || is(T : R delegate(A), R, A...);
1059 }
1060 
1061 enum CanCAS(T) = is(T : ulong) ||
1062                  is(T == class) ||
1063                  is(T == interface) ||
1064                  is(T : U*, U) ||
1065                  is(T : U[], U) ||
1066                  is(T : R delegate(A), R, A...) ||
1067                  (is(T == struct) && __traits(isPOD, T) &&
1068                   (T.sizeof <= size_t.sizeof*2 ||       // no more than 2 words
1069                    (T.sizeof == 16 && has128BitCAS)) && // or supports 128-bit CAS
1070                   (T.sizeof & (T.sizeof - 1)) == 0      // is power of 2
1071                  );
1072 
1073 template IntOrLong(T)
1074 {
1075     static if (T.sizeof > 4)
1076         alias IntOrLong = long;
1077     else
1078         alias IntOrLong = int;
1079 }
1080 
1081 // NOTE: x86 loads implicitly have acquire semantics so a memory
1082 //       barrier is only necessary on releases.
1083 template needsLoadBarrier( MemoryOrder ms )
1084 {
1085     enum bool needsLoadBarrier = ms == MemoryOrder.seq;
1086 }
1087 
1088 
1089 // NOTE: x86 stores implicitly have release semantics so a memory
1090 //       barrier is only necessary on acquires.
1091 template needsStoreBarrier( MemoryOrder ms )
1092 {
1093     enum bool needsStoreBarrier = ms == MemoryOrder.seq;
1094 }
1095 
// Helper to build the asm blocks above at compile time:
//   %N is replaced with args[N];
//   a ?N marker is removed, and additionally drops the rest of its line when args[N] is null;
//   a literal '%' or '?' can be emitted by prefixing it with '%'.
1097 string simpleFormat(string format, scope string[] args)
1098 {
1099     string result;
1100     outer: while (format.length)
1101     {
1102         foreach (i; 0 .. format.length)
1103         {
1104             if (format[i] == '%' || format[i] == '?')
1105             {
1106                 bool isQ = format[i] == '?';
1107                 result ~= format[0 .. i++];
1108                 assert (i < format.length, "Invalid format string");
1109                 if (format[i] == '%' || format[i] == '?')
1110                 {
1111                     assert(!isQ, "Invalid format string");
1112                     result ~= format[i++];
1113                 }
1114                 else
1115                 {
1116                     int index = 0;
1117                     assert (format[i] >= '0' && format[i] <= '9', "Invalid format string");
1118                     while (i < format.length && format[i] >= '0' && format[i] <= '9')
1119                         index = index * 10 + (ubyte(format[i++]) - ubyte('0'));
1120                     if (!isQ)
1121                         result ~= args[index];
1122                     else if (!args[index])
1123                     {
1124                         size_t j = i;
1125                         for (; j < format.length;)
1126                         {
1127                             if (format[j++] == '\n')
1128                                 break;
1129                         }
1130                         i = j;
1131                     }
1132                 }
1133                 format = format[i .. $];
1134                 continue outer;
1135             }
1136         }
1137         result ~= format;
1138         break;
1139     }
1140     return result;
1141 }