[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)

2024-02-06 Thread Simon Pilgrim via cfe-commits

https://github.com/RKSimon approved this pull request.

LGTM. I'd still like to ensure we have unaligned x86 test coverage.

https://github.com/llvm/llvm-project/pull/74275


[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)

2024-02-06 Thread James Y Knight via cfe-commits

jyknight wrote:

@RKSimon: I'm not sure whether you intended your comment to be a blocker; I was
about to press the merge button when you commented (and would still like to do so now).

https://github.com/llvm/llvm-project/pull/74275


[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)

2024-02-02 Thread James Y Knight via cfe-commits

jyknight wrote:

Underaligned atomic operations are expanded to an appropriate `__atomic_*`
libcall via mostly target-independent code in AtomicExpandPass
(https://github.com/llvm/llvm-project/blob/7ecfb66c77ad77dabbb705cbb1f3b17a3d1391a4/llvm/lib/CodeGen/AtomicExpandPass.cpp#L210)
and never hit any of this. I'm not sure we have full test coverage for that case
on X86, but I know we do for AArch64.
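
For illustration, a minimal C++ sketch of the kind of under-aligned access being
discussed (the struct and function names are hypothetical); depending on the
frontend, the libcall may be emitted by Clang directly or by AtomicExpandPass,
but either way it becomes an `__atomic_load` call and never reaches the X86
lowering touched in this PR:

struct __attribute__((packed)) Underaligned {
  char pad;
  __int128 value;  // only 1-byte aligned inside the packed struct
};

__int128 load_underaligned(Underaligned *p) {
  __int128 result;
  // Under-aligned 16-byte atomic load: expected to become a call to
  // __atomic_load rather than any inline 16-byte instruction sequence.
  __atomic_load(&p->value, &result, __ATOMIC_SEQ_CST);
  return result;
}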

https://github.com/llvm/llvm-project/pull/74275


[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)

2024-01-04 Thread Matt Arsenault via cfe-commits


@@ -30113,32 +30120,40 @@ TargetLoweringBase::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   Type *MemType = SI->getValueOperand()->getType();
 
-  bool NoImplicitFloatOps =
-  SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
-  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-  !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-  (Subtarget.hasSSE1() || Subtarget.hasX87()))
-return AtomicExpansionKind::None;
+  if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+  !Subtarget.useSoftFloat()) {
+if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+(Subtarget.hasSSE1() || Subtarget.hasX87()))
+  return AtomicExpansionKind::None;
+
+if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+Subtarget.hasAVX())
+  return AtomicExpansionKind::None;
+  }
 
   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
  : AtomicExpansionKind::None;
 }
 
 // Note: this turns large loads into lock cmpxchg8b/16b.
-// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
 TargetLowering::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   Type *MemType = LI->getType();
 
-  // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
-  // can use movq to do the load. If we have X87 we can load into an 80-bit
-  // X87 register and store it to a stack temporary.
-  bool NoImplicitFloatOps =
-  LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
-  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-  !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-  (Subtarget.hasSSE1() || Subtarget.hasX87()))
-return AtomicExpansionKind::None;
+  if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+  !Subtarget.useSoftFloat()) {
+// If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
+// can use movq to do the load. If we have X87 we can load into an 80-bit
+// X87 register and store it to a stack temporary.
+if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&

arsenm wrote:

`getPrimitiveSizeInBits` doesn't work correctly for pointer-typed atomics; you
need to use the DataLayout. Is there appropriate pointer-typed atomic load test
coverage for this?
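
A hypothetical sketch of that suggestion inside `shouldExpandAtomicLoadInIR`
(not the code in this PR): query the size through the module's DataLayout so a
pointer-typed atomic load is measured by its in-memory size, since
`Type::getPrimitiveSizeInBits` returns 0 for pointer types. `MemType`, `LI`,
and `Subtarget` are assumed to be the surrounding function's values:

  const DataLayout &DL = LI->getModule()->getDataLayout();
  // In-memory size; works for pointers as well as integers/FP.
  uint64_t SizeInBits = DL.getTypeSizeInBits(MemType).getFixedValue();
  if (SizeInBits == 128 && Subtarget.is64Bit() && Subtarget.hasAVX())
    return AtomicExpansionKind::None;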

https://github.com/llvm/llvm-project/pull/74275


[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)

2023-12-16 Thread James Y Knight via cfe-commits


@@ -30115,12 +30126,16 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
   // can use movq to do the load. If we have X87 we can load into an 80-bit
   // X87 register and store it to a stack temporary.

jyknight wrote:

Done.

https://github.com/llvm/llvm-project/pull/74275


[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)

2023-12-14 Thread Philip Reames via cfe-commits


@@ -228,87 +228,86 @@ define void @widen_broadcast_unaligned(ptr %p0, i32 %v) {
 }
 
 define i128 @load_i128(ptr %ptr) {
-; CHECK-O0-LABEL: load_i128:
-; CHECK-O0:   # %bb.0:
-; CHECK-O0-NEXT:pushq %rbx
-; CHECK-O0-NEXT:.cfi_def_cfa_offset 16
-; CHECK-O0-NEXT:.cfi_offset %rbx, -16
-; CHECK-O0-NEXT:xorl %eax, %eax
-; CHECK-O0-NEXT:movl %eax, %ebx
-; CHECK-O0-NEXT:movq %rbx, %rax
-; CHECK-O0-NEXT:movq %rbx, %rdx
-; CHECK-O0-NEXT:movq %rbx, %rcx
-; CHECK-O0-NEXT:lock cmpxchg16b (%rdi)
-; CHECK-O0-NEXT:popq %rbx
-; CHECK-O0-NEXT:.cfi_def_cfa_offset 8
-; CHECK-O0-NEXT:retq
-;
-; CHECK-O3-LABEL: load_i128:
-; CHECK-O3:   # %bb.0:
-; CHECK-O3-NEXT:pushq %rbx
-; CHECK-O3-NEXT:.cfi_def_cfa_offset 16
-; CHECK-O3-NEXT:.cfi_offset %rbx, -16
-; CHECK-O3-NEXT:xorl %eax, %eax
-; CHECK-O3-NEXT:xorl %edx, %edx
-; CHECK-O3-NEXT:xorl %ecx, %ecx
-; CHECK-O3-NEXT:xorl %ebx, %ebx
-; CHECK-O3-NEXT:lock cmpxchg16b (%rdi)
-; CHECK-O3-NEXT:popq %rbx
-; CHECK-O3-NEXT:.cfi_def_cfa_offset 8
-; CHECK-O3-NEXT:retq
+; CHECK-O0-CUR-LABEL: load_i128:

preames wrote:

Your tests need to be updated; these check prefixes no longer exist.

https://github.com/llvm/llvm-project/pull/74275


[llvm] [clang] [clang-tools-extra] [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (PR #74275)

2023-12-11 Thread James Y Knight via cfe-commits

https://github.com/jyknight updated 
https://github.com/llvm/llvm-project/pull/74275

From 7baffd6d1f4254b1bd725ddc883a360d79267435 Mon Sep 17 00:00:00 2001
From: James Y Knight 
Date: Sat, 2 Dec 2023 23:05:26 -0500
Subject: [PATCH 1/2] [X86] Use plain load/store instead of cmpxchg16b for
 atomics with AVX

In late 2021, both Intel and AMD finally documented that every
AVX-capable CPU has always been guaranteed to execute aligned 16-byte
loads/stores atomically, and further, guaranteed that all future CPUs
with AVX will do so as well.

Therefore, we may use normal SSE 128-bit load/store instructions to
implement atomics, if AVX is enabled.

Also adjust the handling of unordered atomic load/store in LegalizeIntegerTypes.cpp;
currently, it hardcodes a fallback to ATOMIC_CMP_SWAP_WITH_SUCCESS,
but we should instead fall back to ATOMIC_LOAD/ATOMIC_STORE.

Per the AMD64 Architecture Programmer's Manual, section 7.3.2 "Access Atomicity":
"""
Processors that report [AVX] extend the atomicity for cacheable,
naturally-aligned single loads or stores from a quadword to a double
quadword.
"""

Per Intel's SDM:
"""
Processors that enumerate support for Intel(R) AVX guarantee that the
16-byte memory operations performed by the following instructions will
always be carried out atomically:
- MOVAPD, MOVAPS, and MOVDQA.
- VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
- VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with
  EVEX.128 and k0 (masking disabled).
"""

This was also confirmed to be true for Zhaoxin CPUs with AVX, in
https://gcc.gnu.org/PR104688
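
As a hedged illustration of the intended effect (not part of the patch): built
for an AVX-capable x86-64 target that also allows 16-byte atomics inline
(e.g. -march=sandybridge or later), an aligned 16-byte atomic load such as the
following C++ sketch is expected to lower to a plain 128-bit vector load
(vmovaps/vmovdqa) instead of a lock cmpxchg16b loop:

__int128 load_i128(__int128 *p) {
  __int128 result;
  // Aligned 16-byte seq_cst atomic load; on x86, loads need no extra fence.
  __atomic_load(p, &result, __ATOMIC_SEQ_CST);
  return result;
}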
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp |  28 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp   |  94 +--
 llvm/test/CodeGen/X86/atomic-non-integer.ll   |  24 +-
 llvm/test/CodeGen/X86/atomic-unordered.ll |  83 +-
 llvm/test/CodeGen/X86/atomic128.ll| 247 +++---
 llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll  |   8 +-
 6 files changed, 256 insertions(+), 228 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 54698edce7d6f8..5b496feee7a8f4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3831,17 +3831,14 @@ void DAGTypeLegalizer::ExpandIntRes_XROUND_XRINT(SDNode *N, SDValue &Lo,
 void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
 SDValue &Lo, SDValue &Hi) {
   if (N->isAtomic()) {
-// It's typical to have larger CAS than atomic load instructions.
 SDLoc dl(N);
 EVT VT = N->getMemoryVT();
-SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
-SDValue Zero = DAG.getConstant(0, dl, VT);
-SDValue Swap = DAG.getAtomicCmpSwap(
-ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl,
-VT, VTs, N->getOperand(0),
-N->getOperand(1), Zero, Zero, N->getMemOperand());
-ReplaceValueWith(SDValue(N, 0), Swap.getValue(0));
-ReplaceValueWith(SDValue(N, 1), Swap.getValue(2));
+// We may support larger values in atomic_load than in a normal load
+// (without splitting), so switch over if needed.
+SDValue New = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, N->getOperand(0),
+N->getOperand(1), N->getMemOperand());
+ReplaceValueWith(SDValue(N, 0), New.getValue(0));
+ReplaceValueWith(SDValue(N, 1), New.getValue(1));
 return;
   }
 
@@ -5399,14 +5396,13 @@ SDValue DAGTypeLegalizer::ExpandIntOp_XINT_TO_FP(SDNode *N) {
 
 SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
   if (N->isAtomic()) {
-// It's typical to have larger CAS than atomic store instructions.
+// We may support larger values in atomic_store than in a normal store
+// (without splitting), so switch over if needed.
 SDLoc dl(N);
-SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
- N->getMemoryVT(),
- N->getOperand(0), N->getOperand(2),
- N->getOperand(1),
- N->getMemOperand());
-return Swap.getValue(1);
+SDValue New =
+DAG.getAtomic(ISD::ATOMIC_STORE, dl, N->getMemoryVT(), N->getOperand(0),
+  N->getOperand(1), N->getOperand(2), N->getMemOperand());
+return New.getValue(0);
   }
   if (ISD::isNormalStore(N))
 return ExpandOp_NormalStore(N, OpNo);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6167be7bdf84e9..1880cbc3a5bf35 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -515,6 +515,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   if (!Subtarget.is64Bit())
 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
 
+  if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+// All CPUs supporting AVX will atomically load/store