https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/125432
>From bf8fc80f870022c2a42d01a500e2b16d648dd376 Mon Sep 17 00:00:00 2001
From: jofrn <jofer...@amd.com>
Date: Fri, 31 Jan 2025 13:12:56 -0500
Subject: [PATCH] [SelectionDAG][X86] Remove unused elements from atomic vector.

After splitting, all elements are created. The two components must be
found by looking at the upper and lower half of EXTRACT_ELEMENT. This
change extends EltsFromConsecutiveLoads to understand AtomicSDNode so
that unused elements can be removed.

commit-id:b83937a8
---
 llvm/include/llvm/CodeGen/SelectionDAG.h       |   4 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp  |  20 ++-
 .../SelectionDAGAddressAnalysis.cpp            |  30 ++--
 llvm/lib/Target/X86/X86ISelLowering.cpp        |  59 +++++--
 llvm/test/CodeGen/X86/atomic-load-store.ll     | 149 ++----------------
 5 files changed, 90 insertions(+), 172 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 87b6914f8a0ee..ab8bb517e6ae4 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1873,7 +1873,7 @@ class SelectionDAG {
   /// chain to the token factor. This ensures that the new memory node will have
   /// the same relative memory dependency position as the old load. Returns the
   /// new merged load chain.
-  SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp);
+  SDValue makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp);
 
   /// Topological-sort the AllNodes list and a
   /// assign a unique node id for each node in the DAG based on their
@@ -2311,7 +2311,7 @@ class SelectionDAG {
   /// merged. Check that both are nonvolatile and if LD is loading
   /// 'Bytes' bytes from a location that is 'Dist' units away from the
   /// location that the 'Base' load is loading from.
-  bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base,
+  bool areNonVolatileConsecutiveLoads(MemSDNode *LD, MemSDNode *Base,
                                       unsigned Bytes, int Dist) const;
 
   /// Infer alignment of a load / store address. Return std::nullopt if it
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index bbf1b0fd590ef..38b22078c8c44 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -12215,7 +12215,7 @@ SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain,
   return TokenFactor;
 }
 
-SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(MemSDNode *OldLoad,
                                                    SDValue NewMemOp) {
   assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
   SDValue OldChain = SDValue(OldLoad, 1);
@@ -12905,17 +12905,21 @@ std::pair<SDValue, SDValue> SelectionDAG::UnrollVectorOverflowOp(
                          getBuildVector(NewOvVT, dl, OvScalars));
 }
 
-bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
-                                                  LoadSDNode *Base,
+bool SelectionDAG::areNonVolatileConsecutiveLoads(MemSDNode *LD,
+                                                  MemSDNode *Base,
                                                   unsigned Bytes,
                                                   int Dist) const {
   if (LD->isVolatile() || Base->isVolatile())
     return false;
-  // TODO: probably too restrictive for atomics, revisit
-  if (!LD->isSimple())
-    return false;
-  if (LD->isIndexed() || Base->isIndexed())
-    return false;
+  if (auto Ld = dyn_cast<LoadSDNode>(LD)) {
+    if (!Ld->isSimple())
+      return false;
+    if (Ld->isIndexed())
+      return false;
+  }
+  if (auto Ld = dyn_cast<LoadSDNode>(Base))
+    if (Ld->isIndexed())
+      return false;
   if (LD->getChain() != Base->getChain())
     return false;
   EVT VT = LD->getMemoryVT();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index f2ab88851b780..c29cb424c7a4c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -195,8 +195,8 @@ bool BaseIndexOffset::contains(const SelectionDAG &DAG, int64_t BitSize,
 }
 
 /// Parses tree in Ptr for base, index, offset addresses.
-static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
-                                   const SelectionDAG &DAG) {
+template <typename T>
+static BaseIndexOffset matchSDNode(const T *N, const SelectionDAG &DAG) {
   SDValue Ptr = N->getBasePtr();
 
   // (((B + I*M) + c)) + c ...
@@ -206,16 +206,18 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
   bool IsIndexSignExt = false;
 
   // pre-inc/pre-dec ops are components of EA.
-  if (N->getAddressingMode() == ISD::PRE_INC) {
-    if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
-      Offset += C->getSExtValue();
-    else // If unknown, give up now.
-      return BaseIndexOffset(SDValue(), SDValue(), 0, false);
-  } else if (N->getAddressingMode() == ISD::PRE_DEC) {
-    if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
-      Offset -= C->getSExtValue();
-    else // If unknown, give up now.
-      return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+  if constexpr (std::is_same_v<T, LSBaseSDNode>) {
+    if (N->getAddressingMode() == ISD::PRE_INC) {
+      if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+        Offset += C->getSExtValue();
+      else // If unknown, give up now.
+        return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+    } else if (N->getAddressingMode() == ISD::PRE_DEC) {
+      if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+        Offset -= C->getSExtValue();
+      else // If unknown, give up now.
+        return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+    }
   }
 
   // Consume constant adds & ors with appropriate masking.
@@ -300,8 +302,10 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
 
 BaseIndexOffset BaseIndexOffset::match(const SDNode *N,
                                        const SelectionDAG &DAG) {
+  if (const auto *AN = dyn_cast<AtomicSDNode>(N))
+    return matchSDNode(AN, DAG);
   if (const auto *LS0 = dyn_cast<LSBaseSDNode>(N))
-    return matchLSNode(LS0, DAG);
+    return matchSDNode(LS0, DAG);
   if (const auto *LN = dyn_cast<LifetimeSDNode>(N)) {
     if (LN->hasOffset())
       return BaseIndexOffset(LN->getOperand(1), SDValue(), LN->getOffset(),
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 20d400c669693..3cfbf68be7ed6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7193,15 +7193,19 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
 }
 
 // Recurse to find a LoadSDNode source and the accumulated ByteOffest.
-static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
-  if (ISD::isNON_EXTLoad(Elt.getNode())) {
-    auto *BaseLd = cast<LoadSDNode>(Elt);
-    if (!BaseLd->isSimple())
-      return false;
+static bool findEltLoadSrc(SDValue Elt, MemSDNode *&Ld, int64_t &ByteOffset) {
+  if (auto *BaseLd = dyn_cast<AtomicSDNode>(Elt)) {
     Ld = BaseLd;
     ByteOffset = 0;
     return true;
-  }
+  } else if (auto *BaseLd = dyn_cast<LoadSDNode>(Elt))
+    if (ISD::isNON_EXTLoad(Elt.getNode())) {
+      if (!BaseLd->isSimple())
+        return false;
+      Ld = BaseLd;
+      ByteOffset = 0;
+      return true;
+    }
 
   switch (Elt.getOpcode()) {
   case ISD::BITCAST:
@@ -7254,7 +7258,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   APInt ZeroMask = APInt::getZero(NumElems);
   APInt UndefMask = APInt::getZero(NumElems);
 
-  SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
+  SmallVector<MemSDNode *, 8> Loads(NumElems, nullptr);
   SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
 
   // For each element in the initializer, see if we've found a load, zero or an
@@ -7304,7 +7308,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   EVT EltBaseVT = EltBase.getValueType();
   assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
          "Register/Memory size mismatch");
-  LoadSDNode *LDBase = Loads[FirstLoadedElt];
+  MemSDNode *LDBase = Loads[FirstLoadedElt];
   assert(LDBase && "Did not find base load for merging consecutive loads");
   unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
   unsigned BaseSizeInBytes = BaseSizeInBits / 8;
@@ -7318,8 +7322,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
 
   // Check to see if the element's load is consecutive to the base load
   // or offset from a previous (already checked) load.
-  auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
-    LoadSDNode *Ld = Loads[EltIdx];
+  auto CheckConsecutiveLoad = [&](MemSDNode *Base, int EltIdx) {
+    MemSDNode *Ld = Loads[EltIdx];
     int64_t ByteOffset = ByteOffsets[EltIdx];
     if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
       int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
@@ -7347,7 +7351,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
     }
   }
 
-  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
+  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, MemSDNode *LDBase) {
     auto MMOFlags = LDBase->getMemOperand()->getFlags();
     assert(LDBase->isSimple() &&
            "Cannot merge volatile or atomic loads.");
@@ -9452,8 +9456,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   {
     SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
     if (SDValue LD =
-            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
+            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) {
       return LD;
+    }
   }
 
   // If this is a splat of pairs of 32-bit elements, we can use a narrower
@@ -60388,6 +60393,35 @@ static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineVZEXT_LOAD(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  // Find the TokenFactor to locate the associated AtomicLoad.
+  SDNode *ALD = nullptr;
+  for (auto &TF : DAG.allnodes())
+    if (TF.getOpcode() == ISD::TokenFactor) {
+      SDValue L = TF.getOperand(0);
+      SDValue R = TF.getOperand(1);
+      if (L.getNode() == N)
+        ALD = R.getNode();
+      else if (R.getNode() == N)
+        ALD = L.getNode();
+    }
+
+  if (!ALD)
+    return SDValue();
+  if (!isa<AtomicSDNode>(ALD))
+    return SDValue();
+
+  // Replace the VZEXT_LOAD with the AtomicLoad.
+  SDLoc dl(N);
+  SDValue SV =
+      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                  N->getValueType(0).changeTypeToInteger(), SDValue(ALD, 0));
+  SDValue BC = DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), SV);
+  BC = DCI.CombineTo(N, BC, SDValue(ALD, 1));
+  return BC;
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -60584,6 +60618,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
   case ISD::FP_TO_SINT_SAT:
   case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
+  case X86ISD::VZEXT_LOAD: return combineVZEXT_LOAD(N, DAG, DCI);
   // clang-format on
   }
 
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 6e2e9d4b21891..f72970d12b6eb 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -208,29 +208,12 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
 define <2 x half> @atomic_vec2_half(ptr %x) {
 ; CHECK3-LABEL: atomic_vec2_half:
 ; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movl (%rdi), %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    shrl $16, %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK3-NEXT:    retq
 ;
 ; CHECK0-LABEL: atomic_vec2_half:
 ; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movl (%rdi), %eax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    movw %cx, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm1
-; CHECK0-NEXT:    movw %ax, %cx
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK0-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK0-NEXT:    retq
   %ret = load atomic <2 x half>, ptr %x acquire, align 4
   ret <2 x half> %ret
@@ -239,29 +222,12 @@ define <2 x half> @atomic_vec2_half(ptr %x) {
 define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
 ; CHECK3-LABEL: atomic_vec2_bfloat:
 ; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movl (%rdi), %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    shrl $16, %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK3-NEXT:    retq
 ;
 ; CHECK0-LABEL: atomic_vec2_bfloat:
 ; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movl (%rdi), %eax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    ## kill: def $cx killed $cx killed $ecx
-; CHECK0-NEXT:    movw %ax, %dx
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %dx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK0-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK0-NEXT:    retq
   %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
   ret <2 x bfloat> %ret
@@ -440,110 +406,19 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
 }
 
 define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
-; CHECK3-LABEL: atomic_vec4_half:
-; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movq (%rdi), %rax
-; CHECK3-NEXT:    movl %eax, %ecx
-; CHECK3-NEXT:    shrl $16, %ecx
-; CHECK3-NEXT:    pinsrw $0, %ecx, %xmm1
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    movq %rax, %rcx
-; CHECK3-NEXT:    shrq $32, %rcx
-; CHECK3-NEXT:    pinsrw $0, %ecx, %xmm2
-; CHECK3-NEXT:    shrq $48, %rax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm3
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK3-NEXT:    retq
-;
-; CHECK0-LABEL: atomic_vec4_half:
-; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movq (%rdi), %rax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    movw %cx, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm2
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm2
-; CHECK0-NEXT:    movw %ax, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm0
-; CHECK0-NEXT:    movq %rax, %rcx
-; CHECK0-NEXT:    shrq $32, %rcx
-; CHECK0-NEXT:    movw %cx, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm1
-; CHECK0-NEXT:    shrq $48, %rax
-; CHECK0-NEXT:    movw %ax, %cx
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm3
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm3
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK0-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK0-NEXT:    retq
+; CHECK-LABEL: atomic_vec4_half:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq (%rdi), %xmm0
+; CHECK-NEXT:    retq
   %ret = load atomic <4 x half>, ptr %x acquire, align 8
   ret <4 x half> %ret
 }
 
 define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
-; CHECK3-LABEL: atomic_vec4_bfloat:
-; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movq (%rdi), %rax
-; CHECK3-NEXT:    movq %rax, %rcx
-; CHECK3-NEXT:    movq %rax, %rdx
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    ## kill: def $eax killed $eax killed $rax
-; CHECK3-NEXT:    shrl $16, %eax
-; CHECK3-NEXT:    shrq $32, %rcx
-; CHECK3-NEXT:    shrq $48, %rdx
-; CHECK3-NEXT:    pinsrw $0, %edx, %xmm1
-; CHECK3-NEXT:    pinsrw $0, %ecx, %xmm2
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK3-NEXT:    retq
-;
-; CHECK0-LABEL: atomic_vec4_bfloat:
-; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movq (%rdi), %rax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    ## kill: def $cx killed $cx killed $ecx
-; CHECK0-NEXT:    movw %ax, %dx
-; CHECK0-NEXT:    movq %rax, %rsi
-; CHECK0-NEXT:    shrq $32, %rsi
-; CHECK0-NEXT:    ## kill: def $si killed $si killed $rsi
-; CHECK0-NEXT:    shrq $48, %rax
-; CHECK0-NEXT:    movw %ax, %di
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %di, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %si, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %dx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm2
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm2
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK0-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK0-NEXT:    retq
+; CHECK-LABEL: atomic_vec4_bfloat:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq (%rdi), %xmm0
+; CHECK-NEXT:    retq
   %ret = load atomic <4 x bfloat>, ptr %x acquire, align 8
   ret <4 x bfloat> %ret
 }

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
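For readers skimming the thread, the codegen effect described in the commit message is easiest to see in the smallest case from the updated test file (llvm/test/CodeGen/X86/atomic-load-store.ll). The snippet below is an illustrative standalone reproducer that mirrors atomic_vec2_half above; the expected instructions in the comments simply restate the patched CHECK3 lines and are not additional output beyond what the diff shows:

; Illustrative reproducer (mirrors atomic_vec2_half in the patched test).
; With this change, the whole <2 x half> atomic acquire load lowers to a
; single 32-bit SSE load,
;   movss (%rdi), %xmm0
; instead of the previous movl + shrl + pinsrw + punpcklwd sequence that
; rebuilt each element separately.
define <2 x half> @atomic_vec2_half(ptr %x) {
  %ret = load atomic <2 x half>, ptr %x acquire, align 4
  ret <2 x half> %ret
}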