https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/125432
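At the core of the patch below is a byte-offset rule: an EXTRACT_ELEMENT that takes one half of a twice-as-wide atomic load is mapped back to a byte offset from the load's address, so index 0 refers to the base and index 1 refers to a point half the source's byte width past it. A minimal standalone sketch of that arithmetic, with illustrative names (splitByteOffset, srcBits) rather than the patch's actual variables:

  #include <cassert>
  #include <cstdint>

  // EXTRACT_ELEMENT on a split value: half 0 starts at byte 0 of the
  // source; half 1 starts at half the source's width in bytes.
  static int64_t splitByteOffset(unsigned srcBits, uint64_t halfIdx) {
    assert(srcBits % 16 == 0 && halfIdx < 2 && "two byte-sized halves");
    return halfIdx == 0 ? 0 : (srcBits / 8) / 2;
  }

  int main() {
    // A 64-bit atomic load split into two 32-bit halves: the upper
    // half (index 1) lives 4 bytes past the base address.
    return splitByteOffset(64, 1) == 4 ? 0 : 1;
  }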
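The matchSDNode template introduced in SelectionDAGAddressAnalysis.cpp below folds the atomic and load/store matchers into one function: the pre-inc/pre-dec handling is compiled only when T is LSBaseSDNode, because atomic nodes have no addressing modes. A toy demonstration of that if constexpr dispatch, using stand-in node types (LSNode, AtomicNode) rather than the LLVM classes:

  #include <type_traits>

  struct LSNode { int PreIncOffset = 8; }; // has an addressing mode
  struct AtomicNode {};                    // no addressing mode at all

  // The branch is discarded at compile time for AtomicNode, so the
  // template never touches PreIncOffset on types that lack it.
  template <typename T> int baseOffset(const T &N) {
    int Offset = 0;
    if constexpr (std::is_same_v<T, LSNode>)
      Offset += N.PreIncOffset;
    return Offset;
  }

  // Returns 0: the LSNode path added the pre-increment offset, while
  // the atomic path compiled down to a constant 0.
  int main() { return baseOffset(LSNode{}) - baseOffset(AtomicNode{}) - 8; }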
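areNonVolatileConsecutiveLoads is relaxed below to accept MemSDNode, but the question it answers is unchanged: does LD read 'Bytes' bytes starting exactly 'Dist' elements past where Base reads? A standalone sketch of that check, under the simplifying assumption that addresses have already been resolved to plain integers (MemRef, isConsecutive, and Addr are hypothetical names, not the LLVM API):

  #include <cstdint>

  struct MemRef {
    int64_t Addr;  // resolved address of the access
    bool Volatile; // volatile accesses are never merged
  };

  // LD is consecutive to Base when it reads Bytes bytes starting
  // Dist * Bytes bytes away from Base's address.
  static bool isConsecutive(const MemRef &LD, const MemRef &Base,
                            unsigned Bytes, int Dist) {
    if (LD.Volatile || Base.Volatile)
      return false;
    return LD.Addr == Base.Addr + int64_t(Dist) * int64_t(Bytes);
  }

  int main() {
    MemRef Base{0x1000, false}, Upper{0x1004, false};
    // The 4-byte access at 0x1004 sits one 4-byte slot after Base.
    return isConsecutive(Upper, Base, 4, 1) ? 0 : 1;
  }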
>From fc2debee17c4ded2edbe2f1803f3184cea78bfdc Mon Sep 17 00:00:00 2001
From: jofrn <jofer...@amd.com>
Date: Fri, 31 Jan 2025 13:12:56 -0500
Subject: [PATCH] [SelectionDAG][X86] Remove unused elements from atomic
 vector.

After splitting, all elements are created, even those that are never
used. The two components of the split are found by looking at the upper
and lower halves through EXTRACT_ELEMENT. This change extends
EltsFromConsecutiveLoads to understand AtomicSDNode so that the unused
elements can be removed.

commit-id:b83937a8
---
 llvm/include/llvm/CodeGen/SelectionDAG.h      |   4 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  20 ++-
 .../SelectionDAGAddressAnalysis.cpp           |  30 ++--
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   6 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  43 +++--
 llvm/test/CodeGen/X86/atomic-load-store.ll    | 167 ++----------------
 6 files changed, 83 insertions(+), 187 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index ba11ddbb5b731..d3cd81c146280 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1843,7 +1843,7 @@ class SelectionDAG {
   /// chain to the token factor. This ensures that the new memory node will have
   /// the same relative memory dependency position as the old load. Returns the
   /// new merged load chain.
-  SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp);
+  SDValue makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp);
 
   /// Topological-sort the AllNodes list and a
   /// assign a unique node id for each node in the DAG based on their
@@ -2281,7 +2281,7 @@ class SelectionDAG {
   /// merged. Check that both are nonvolatile and if LD is loading
   /// 'Bytes' bytes from a location that is 'Dist' units away from the
   /// location that the 'Base' load is loading from.
-  bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base,
+  bool areNonVolatileConsecutiveLoads(MemSDNode *LD, MemSDNode *Base,
                                       unsigned Bytes, int Dist) const;
 
   /// Infer alignment of a load / store address. Return std::nullopt if it
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2a68903c34cef..8e77a542ab029 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -12218,7 +12218,7 @@ SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain,
   return TokenFactor;
 }
 
-SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(MemSDNode *OldLoad,
                                                    SDValue NewMemOp) {
   assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
   SDValue OldChain = SDValue(OldLoad, 1);
@@ -12911,17 +12911,21 @@ std::pair<SDValue, SDValue> SelectionDAG::UnrollVectorOverflowOp(
       getBuildVector(NewOvVT, dl, OvScalars));
 }
 
-bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
-                                                  LoadSDNode *Base,
+bool SelectionDAG::areNonVolatileConsecutiveLoads(MemSDNode *LD,
+                                                  MemSDNode *Base,
                                                   unsigned Bytes,
                                                   int Dist) const {
   if (LD->isVolatile() || Base->isVolatile())
     return false;
-  // TODO: probably too restrictive for atomics, revisit
-  if (!LD->isSimple())
-    return false;
-  if (LD->isIndexed() || Base->isIndexed())
-    return false;
+  if (auto Ld = dyn_cast<LoadSDNode>(LD)) {
+    if (!Ld->isSimple())
+      return false;
+    if (Ld->isIndexed())
+      return false;
+  }
+  if (auto Ld = dyn_cast<LoadSDNode>(Base))
+    if (Ld->isIndexed())
+      return false;
   if (LD->getChain() != Base->getChain())
     return false;
   EVT VT = LD->getMemoryVT();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index f2ab88851b780..c29cb424c7a4c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -195,8 +195,8 @@ bool BaseIndexOffset::contains(const SelectionDAG &DAG, int64_t BitSize,
 }
 
 /// Parses tree in Ptr for base, index, offset addresses.
-static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
-                                   const SelectionDAG &DAG) {
+template <typename T>
+static BaseIndexOffset matchSDNode(const T *N, const SelectionDAG &DAG) {
   SDValue Ptr = N->getBasePtr();
 
   // (((B + I*M) + c)) + c ...
@@ -206,16 +206,18 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
   bool IsIndexSignExt = false;
 
   // pre-inc/pre-dec ops are components of EA.
-  if (N->getAddressingMode() == ISD::PRE_INC) {
-    if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
-      Offset += C->getSExtValue();
-    else // If unknown, give up now.
-      return BaseIndexOffset(SDValue(), SDValue(), 0, false);
-  } else if (N->getAddressingMode() == ISD::PRE_DEC) {
-    if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
-      Offset -= C->getSExtValue();
-    else // If unknown, give up now.
-      return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+  if constexpr (std::is_same_v<T, LSBaseSDNode>) {
+    if (N->getAddressingMode() == ISD::PRE_INC) {
+      if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+        Offset += C->getSExtValue();
+      else // If unknown, give up now.
+        return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+    } else if (N->getAddressingMode() == ISD::PRE_DEC) {
+      if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+        Offset -= C->getSExtValue();
+      else // If unknown, give up now.
+        return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+    }
   }
 
   // Consume constant adds & ors with appropriate masking.
@@ -300,8 +302,10 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
 
 BaseIndexOffset BaseIndexOffset::match(const SDNode *N,
                                        const SelectionDAG &DAG) {
+  if (const auto *AN = dyn_cast<AtomicSDNode>(N))
+    return matchSDNode(AN, DAG);
   if (const auto *LS0 = dyn_cast<LSBaseSDNode>(N))
-    return matchLSNode(LS0, DAG);
+    return matchSDNode(LS0, DAG);
   if (const auto *LN = dyn_cast<LifetimeSDNode>(N)) {
     if (LN->hasOffset())
       return BaseIndexOffset(LN->getOperand(1), SDValue(), LN->getOffset(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 744a0fa572b0c..894aadaef4f56 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5172,7 +5172,11 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
     L = DAG.getPtrExtOrTrunc(L, dl, VT);
 
   setValue(&I, L);
-  DAG.setRoot(OutChain);
+
+  if (VT.isVector())
+    DAG.setRoot(InChain);
+  else
+    DAG.setRoot(OutChain);
 }
 
 void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4e59a3fb16369..70480f86f0a30 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7191,15 +7191,19 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
 }
 
 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
-static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
-  if (ISD::isNON_EXTLoad(Elt.getNode())) {
-    auto *BaseLd = cast<LoadSDNode>(Elt);
-    if (!BaseLd->isSimple())
-      return false;
+static bool findEltLoadSrc(SDValue Elt, MemSDNode *&Ld, int64_t &ByteOffset) {
+  if (auto *BaseLd = dyn_cast<AtomicSDNode>(Elt)) {
     Ld = BaseLd;
     ByteOffset = 0;
     return true;
-  }
+  } else if (auto *BaseLd = dyn_cast<LoadSDNode>(Elt))
+    if (ISD::isNON_EXTLoad(Elt.getNode())) {
+      if (!BaseLd->isSimple())
+        return false;
+      Ld = BaseLd;
+      ByteOffset = 0;
+      return true;
+    }
 
   switch (Elt.getOpcode()) {
   case ISD::BITCAST:
@@ -7228,6 +7232,20 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
       }
     }
     break;
+  case ISD::EXTRACT_ELEMENT:
+    if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
+      SDValue Src = Elt.getOperand(0);
+      unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
+      unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
+      if (2 * DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
+          findEltLoadSrc(Src, Ld, ByteOffset)) {
+        uint64_t Idx = IdxC->getZExtValue();
+        if (Idx == 1) // Get the upper half.
+          ByteOffset += (SrcSizeInBits / 8) / 2;
+        return true;
+      }
+    }
+    break;
   }
 
   return false;
@@ -7252,7 +7270,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   APInt ZeroMask = APInt::getZero(NumElems);
   APInt UndefMask = APInt::getZero(NumElems);
 
-  SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
+  SmallVector<MemSDNode *, 8> Loads(NumElems, nullptr);
   SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
 
   // For each element in the initializer, see if we've found a load, zero or an
@@ -7302,7 +7320,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   EVT EltBaseVT = EltBase.getValueType();
   assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
          "Register/Memory size mismatch");
-  LoadSDNode *LDBase = Loads[FirstLoadedElt];
+  MemSDNode *LDBase = Loads[FirstLoadedElt];
   assert(LDBase && "Did not find base load for merging consecutive loads");
   unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
   unsigned BaseSizeInBytes = BaseSizeInBits / 8;
@@ -7316,8 +7334,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
 
   // Check to see if the element's load is consecutive to the base load
   // or offset from a previous (already checked) load.
-  auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
-    LoadSDNode *Ld = Loads[EltIdx];
+  auto CheckConsecutiveLoad = [&](MemSDNode *Base, int EltIdx) {
+    MemSDNode *Ld = Loads[EltIdx];
     int64_t ByteOffset = ByteOffsets[EltIdx];
     if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
       int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
@@ -7345,7 +7363,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
     }
   }
 
-  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
+  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, MemSDNode *LDBase) {
     auto MMOFlags = LDBase->getMemOperand()->getFlags();
     assert(LDBase->isSimple() &&
           "Cannot merge volatile or atomic loads.");
@@ -9402,8 +9420,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   {
     SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
     if (SDValue LD =
-            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
+            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) {
       return LD;
+    }
   }
 
   // If this is a splat of pairs of 32-bit elements, we can use a narrower
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 42b0955824293..08d0405345f57 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -205,63 +205,19 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
 }
 
 define <2 x half> @atomic_vec2_half(ptr %x) {
-; CHECK3-LABEL: atomic_vec2_half:
-; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movl (%rdi), %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    shrl $16, %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT:    retq
-;
-; CHECK0-LABEL: atomic_vec2_half:
-; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movl (%rdi), %eax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    movw %cx, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm1
-; CHECK0-NEXT:    movw %ax, %cx
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK0-NEXT:    retq
+; CHECK-LABEL: atomic_vec2_half:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    retq
   %ret = load atomic <2 x half>, ptr %x acquire, align 4
   ret <2 x half> %ret
 }
 
 define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
-; CHECK3-LABEL: atomic_vec2_bfloat:
-; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movl (%rdi), %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    shrl $16, %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT:    retq
-;
-; CHECK0-LABEL: atomic_vec2_bfloat:
-; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movl (%rdi), %eax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    ## kill: def $cx killed $cx killed $ecx
-; CHECK0-NEXT:    movw %ax, %dx
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %dx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK0-NEXT:    retq
+; CHECK-LABEL: atomic_vec2_bfloat:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    retq
   %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
   ret <2 x bfloat> %ret
 }
@@ -439,110 +395,19 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
 }
 
 define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
-; CHECK3-LABEL: atomic_vec4_half:
-; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movq (%rdi), %rax
-; CHECK3-NEXT:    movl %eax, %ecx
-; CHECK3-NEXT:    shrl $16, %ecx
-; CHECK3-NEXT:    pinsrw $0, %ecx, %xmm1
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    movq %rax, %rcx
-; CHECK3-NEXT:    shrq $32, %rcx
-; CHECK3-NEXT:    pinsrw $0, %ecx, %xmm2
-; CHECK3-NEXT:    shrq $48, %rax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm3
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK3-NEXT:    retq
-;
-; CHECK0-LABEL: atomic_vec4_half:
-; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movq (%rdi), %rax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    movw %cx, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm2
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm2
-; CHECK0-NEXT:    movw %ax, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm0
-; CHECK0-NEXT:    movq %rax, %rcx
-; CHECK0-NEXT:    shrq $32, %rcx
-; CHECK0-NEXT:    movw %cx, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm1
-; CHECK0-NEXT:    shrq $48, %rax
-; CHECK0-NEXT:    movw %ax, %cx
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm3
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm3
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK0-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK0-NEXT:    retq
+; CHECK-LABEL: atomic_vec4_half:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    retq
   %ret = load atomic <4 x half>, ptr %x acquire, align 8
   ret <4 x half> %ret
 }
 
 define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
-; CHECK3-LABEL: atomic_vec4_bfloat:
-; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movq (%rdi), %rax
-; CHECK3-NEXT:    movq %rax, %rcx
-; CHECK3-NEXT:    movq %rax, %rdx
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    ## kill: def $eax killed $eax killed $rax
-; CHECK3-NEXT:    shrl $16, %eax
-; CHECK3-NEXT:    shrq $32, %rcx
-; CHECK3-NEXT:    shrq $48, %rdx
-; CHECK3-NEXT:    pinsrw $0, %edx, %xmm1
-; CHECK3-NEXT:    pinsrw $0, %ecx, %xmm2
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK3-NEXT:    retq
-;
-; CHECK0-LABEL: atomic_vec4_bfloat:
-; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movq (%rdi), %rax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    ## kill: def $cx killed $cx killed $ecx
-; CHECK0-NEXT:    movw %ax, %dx
-; CHECK0-NEXT:    movq %rax, %rsi
-; CHECK0-NEXT:    shrq $32, %rsi
-; CHECK0-NEXT:    ## kill: def $si killed $si killed $rsi
-; CHECK0-NEXT:    shrq $48, %rax
-; CHECK0-NEXT:    movw %ax, %di
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %di, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %si, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %dx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm2
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm2
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK0-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK0-NEXT:    retq
+; CHECK-LABEL: atomic_vec4_bfloat:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    retq
   %ret = load atomic <4 x bfloat>, ptr %x acquire, align 8
   ret <4 x bfloat> %ret
 }

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits