[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)
https://github.com/RKSimon approved this pull request.

LGTM. I'd still like to ensure we have unaligned x86 test coverage.
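For reference, unaligned x86 coverage of the kind being requested might look like the following IR (a hypothetical sketch, not a test from this patch; the function name is illustrative). Because the alignment is below the 16-byte access size, it must not take the new plain load/store path:

; Hypothetical underaligned case: align 8 is less than the 16-byte access
; size, so this cannot use the plain [V]MOVAPS/[V]MOVDQA lowering.
define i128 @load_i128_underaligned(ptr %p) {
  %v = load atomic i128, ptr %p seq_cst, align 8
  ret i128 %v
}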
[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)
jyknight wrote:

@RKSimon: I'm not sure whether you intended your comment to be a blocker; I was about to press the merge button when you commented (and would still like to merge now).
[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)
jyknight wrote:

Underaligned atomic operations are expanded to an appropriate `__atomic_*` libcall via mostly target-independent code in AtomicExpandPass (https://github.com/llvm/llvm-project/blob/7ecfb66c77ad77dabbb705cbb1f3b17a3d1391a4/llvm/lib/CodeGen/AtomicExpandPass.cpp#L210), so they never hit any of this code. I'm not sure we have full test coverage for that case on X86, but I know we do for AArch64.
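Concretely, AtomicExpandPass rewrites such an underaligned load into a generic-size `__atomic_load` libcall. The expanded IR has roughly the following shape (a sketch under the assumption of an i128 load with alignment 8 on x86-64; the function name is illustrative, and 5 is the seq_cst memory-order constant):

; Rough shape of the AtomicExpandPass output for an underaligned i128 load:
; the value is loaded through a stack temporary via the generic libcall.
declare void @__atomic_load(i64, ptr, ptr, i32)

define i128 @load_i128_underaligned_expanded(ptr %p) {
  %tmp = alloca i128, align 16
  call void @__atomic_load(i64 16, ptr %p, ptr %tmp, i32 5)
  %v = load i128, ptr %tmp, align 16
  ret i128 %v
}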
[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)
@@ -30113,32 +30120,40 @@ TargetLoweringBase::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   Type *MemType = SI->getValueOperand()->getType();

-  bool NoImplicitFloatOps =
-      SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
-  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-      (Subtarget.hasSSE1() || Subtarget.hasX87()))
-    return AtomicExpansionKind::None;
+  if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+      !Subtarget.useSoftFloat()) {
+    if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+        (Subtarget.hasSSE1() || Subtarget.hasX87()))
+      return AtomicExpansionKind::None;
+
+    if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+        Subtarget.hasAVX())
+      return AtomicExpansionKind::None;
+  }

   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
                                  : AtomicExpansionKind::None;
 }

 // Note: this turns large loads into lock cmpxchg8b/16b.
-// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
 TargetLowering::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   Type *MemType = LI->getType();

-  // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
-  // can use movq to do the load. If we have X87 we can load into an 80-bit
-  // X87 register and store it to a stack temporary.
-  bool NoImplicitFloatOps =
-      LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
-  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-      (Subtarget.hasSSE1() || Subtarget.hasX87()))
-    return AtomicExpansionKind::None;
+  if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+      !Subtarget.useSoftFloat()) {
+    // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
+    // can use movq to do the load. If we have X87 we can load into an 80-bit
+    // X87 register and store it to a stack temporary.
+    if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&

arsenm wrote:

getPrimitiveSizeInBits doesn't work correctly for pointer typed atomics; you need to use the DataLayout. Is there appropriate pointer typed atomic load test coverage for this?
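A pointer-typed atomic load test of the kind being asked about could be as simple as the following (an illustrative sketch, not a test from the patch). `Type::getPrimitiveSizeInBits()` returns 0 for pointer types, so only a DataLayout query reports the 64 bits this load actually occupies on x86-64:

; Illustrative pointer-typed atomic load on x86-64:
; getPrimitiveSizeInBits() on ptr is 0, while
; DataLayout::getTypeSizeInBits() correctly reports 64.
define ptr @load_atomic_ptr(ptr %src) {
  %v = load atomic ptr, ptr %src seq_cst, align 8
  ret ptr %v
}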
[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)
@@ -30115,12 +30126,16 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we // can use movq to do the load. If we have X87 we can load into an 80-bit // X87 register and store it to a stack temporary. jyknight wrote: Done. https://github.com/llvm/llvm-project/pull/74275 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)
@@ -228,87 +228,86 @@ define void @widen_broadcast_unaligned(ptr %p0, i32 %v) {
 }

 define i128 @load_i128(ptr %ptr) {
-; CHECK-O0-LABEL: load_i128:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT:    pushq %rbx
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT:    .cfi_offset %rbx, -16
-; CHECK-O0-NEXT:    xorl %eax, %eax
-; CHECK-O0-NEXT:    movl %eax, %ebx
-; CHECK-O0-NEXT:    movq %rbx, %rax
-; CHECK-O0-NEXT:    movq %rbx, %rdx
-; CHECK-O0-NEXT:    movq %rbx, %rcx
-; CHECK-O0-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-O0-NEXT:    popq %rbx
-; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O0-NEXT:    retq
-;
-; CHECK-O3-LABEL: load_i128:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT:    pushq %rbx
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT:    .cfi_offset %rbx, -16
-; CHECK-O3-NEXT:    xorl %eax, %eax
-; CHECK-O3-NEXT:    xorl %edx, %edx
-; CHECK-O3-NEXT:    xorl %ecx, %ecx
-; CHECK-O3-NEXT:    xorl %ebx, %ebx
-; CHECK-O3-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-O3-NEXT:    popq %rbx
-; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-O3-NEXT:    retq
+; CHECK-O0-CUR-LABEL: load_i128:

preames wrote:

Your tests need to be updated; these check prefixes no longer exist.
[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)
https://github.com/jyknight updated https://github.com/llvm/llvm-project/pull/74275

>From 7baffd6d1f4254b1bd725ddc883a360d79267435 Mon Sep 17 00:00:00 2001
From: James Y Knight
Date: Sat, 2 Dec 2023 23:05:26 -0500
Subject: [PATCH 1/2] [X86] Use plain load/store instead of cmpxchg16b for
 atomics with AVX

In late 2021, both Intel and AMD finally documented that every
AVX-capable CPU has always been guaranteed to execute aligned 16-byte
loads/stores atomically, and further, guaranteed that all future CPUs
with AVX will do so as well. Therefore, we may use normal SSE 128-bit
load/store instructions to implement atomics, if AVX is enabled.

Also adjust handling of unordered atomic load/store in
LegalizeIntegerTypes.cpp; currently, it hardcodes a fallback to
ATOMIC_CMP_SWAP_WITH_SUCCESS, but we should instead fall back to
ATOMIC_LOAD/ATOMIC_STORE.

Per the AMD64 Architecture Programmer's Manual, 7.3.2 Access Atomicity:

"""
Processors that report [AVX] extend the atomicity for cacheable,
naturally-aligned single loads or stores from a quadword to a double
quadword.
"""

Per Intel's SDM:

"""
Processors that enumerate support for Intel(R) AVX guarantee that the
16-byte memory operations performed by the following instructions will
always be carried out atomically:
- MOVAPD, MOVAPS, and MOVDQA.
- VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
- VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with EVEX.128
  and k0 (masking disabled).
"""

This was also confirmed to be true for Zhaoxin CPUs with AVX, in
https://gcc.gnu.org/PR104688
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp   |  28 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp     |  94 +--
 llvm/test/CodeGen/X86/atomic-non-integer.ll |  24 +-
 llvm/test/CodeGen/X86/atomic-unordered.ll   |  83 +-
 llvm/test/CodeGen/X86/atomic128.ll          | 247 +++---
 llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll    |   8 +-
 6 files changed, 256 insertions(+), 228 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 54698edce7d6f8..5b496feee7a8f4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3831,17 +3831,14 @@ void DAGTypeLegalizer::ExpandIntRes_XROUND_XRINT(SDNode *N, SDValue &Lo,

 void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, SDValue &Lo,
                                          SDValue &Hi) {
   if (N->isAtomic()) {
-    // It's typical to have larger CAS than atomic load instructions.
     SDLoc dl(N);
     EVT VT = N->getMemoryVT();
-    SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
-    SDValue Zero = DAG.getConstant(0, dl, VT);
-    SDValue Swap = DAG.getAtomicCmpSwap(
-        ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl,
-        VT, VTs, N->getOperand(0),
-        N->getOperand(1), Zero, Zero, N->getMemOperand());
-    ReplaceValueWith(SDValue(N, 0), Swap.getValue(0));
-    ReplaceValueWith(SDValue(N, 1), Swap.getValue(2));
+    // We may support larger values in atomic_load than in a normal load
+    // (without splitting), so switch over if needed.
+    SDValue New = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, N->getOperand(0),
+                                N->getOperand(1), N->getMemOperand());
+    ReplaceValueWith(SDValue(N, 0), New.getValue(0));
+    ReplaceValueWith(SDValue(N, 1), New.getValue(1));
     return;
   }

@@ -5399,14 +5396,13 @@ SDValue DAGTypeLegalizer::ExpandIntOp_XINT_TO_FP(SDNode *N) {

 SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
   if (N->isAtomic()) {
-    // It's typical to have larger CAS than atomic store instructions.
+    // We may support larger values in atomic_store than in a normal store
+    // (without splitting), so switch over if needed.
     SDLoc dl(N);
-    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
-                                 N->getMemoryVT(),
-                                 N->getOperand(0), N->getOperand(2),
-                                 N->getOperand(1),
-                                 N->getMemOperand());
-    return Swap.getValue(1);
+    SDValue New =
+        DAG.getAtomic(ISD::ATOMIC_STORE, dl, N->getMemoryVT(), N->getOperand(0),
+                      N->getOperand(1), N->getOperand(2), N->getMemOperand());
+    return New.getValue(0);
   }
   if (ISD::isNormalStore(N))
     return ExpandOp_NormalStore(N, OpNo);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6167be7bdf84e9..1880cbc3a5bf35 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -515,6 +515,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   if (!Subtarget.is64Bit())
     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);

+  if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+    // All CPUs supporting AVX will atomically load/store
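To illustrate the intended codegen change (a sketch; the exact instruction chosen can vary with CPU features and optimization level), an aligned 16-byte atomic load such as the following should now lower to a single 128-bit vector load when AVX is enabled, rather than a lock cmpxchg16b loop:

; With -mtriple=x86_64 -mattr=+avx, this aligned 16-byte atomic load can
; be a plain vmovaps/vmovdqa; without AVX it still goes through the
; lock cmpxchg16b expansion (given cx16).
define i128 @load_i128_aligned(ptr %p) {
  %v = load atomic i128, ptr %p seq_cst, align 16
  ret i128 %v
}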