https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/125432
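To illustrate the effect of the change, here is a minimal example drawn from the atomic_vec2_half test that this patch updates in llvm/test/CodeGen/X86/atomic-load-store.ll; with the patch applied, the atomic <2 x half> load lowers to a single 32-bit load (movss/movd) rather than a per-element pinsrw/punpcklwd sequence:

  ; Minimal sketch, copied from the updated test below (not an addition to the patch).
  define <2 x half> @atomic_vec2_half(ptr %x) {
    %ret = load atomic <2 x half>, ptr %x acquire, align 4
    ret <2 x half> %ret
  }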
>From 684a54284458cae0b700737126715384b9fddab1 Mon Sep 17 00:00:00 2001
From: jofrn <jofer...@amd.com>
Date: Fri, 31 Jan 2025 13:12:56 -0500
Subject: [PATCH] [SelectionDAG][X86] Remove unused elements from atomic
 vector.

After splitting, all elements are created. The two components must be
found by looking at the upper and lower half of EXTRACT_ELEMENT. This
change extends EltsFromConsecutiveLoads to understand AtomicSDNode so
that unused elements can be removed.

commit-id:b83937a8
---
 llvm/include/llvm/CodeGen/SelectionDAG.h      |   2 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |   2 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  65 ++++++--
 llvm/test/CodeGen/X86/atomic-load-store.ll    | 149 ++----------------
 4 files changed, 65 insertions(+), 153 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 87b6914f8a0ee..40550d96a5b3d 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1873,7 +1873,7 @@ class SelectionDAG {
   /// chain to the token factor. This ensures that the new memory node will have
   /// the same relative memory dependency position as the old load. Returns the
   /// new merged load chain.
-  SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp);
+  SDValue makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp);
 
   /// Topological-sort the AllNodes list and a
   /// assign a unique node id for each node in the DAG based on their
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index bbf1b0fd590ef..d6e5cd1078776 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -12215,7 +12215,7 @@ SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain,
   return TokenFactor;
 }
 
-SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(MemSDNode *OldLoad,
                                                    SDValue NewMemOp) {
   assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
   SDValue OldChain = SDValue(OldLoad, 1);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3ab548f64d04c..409a8c7e73c0e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7193,15 +7193,19 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
 }
 
 // Recurse to find a LoadSDNode source and the accumulated ByteOffest.
-static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
-  if (ISD::isNON_EXTLoad(Elt.getNode())) {
-    auto *BaseLd = cast<LoadSDNode>(Elt);
-    if (!BaseLd->isSimple())
-      return false;
+static bool findEltLoadSrc(SDValue Elt, MemSDNode *&Ld, int64_t &ByteOffset) {
+  if (auto *BaseLd = dyn_cast<AtomicSDNode>(Elt)) {
     Ld = BaseLd;
     ByteOffset = 0;
     return true;
-  }
+  } else if (auto *BaseLd = dyn_cast<LoadSDNode>(Elt))
+    if (ISD::isNON_EXTLoad(Elt.getNode())) {
+      if (!BaseLd->isSimple())
+        return false;
+      Ld = BaseLd;
+      ByteOffset = 0;
+      return true;
+    }
 
   switch (Elt.getOpcode()) {
   case ISD::BITCAST:
@@ -7254,7 +7258,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   APInt ZeroMask = APInt::getZero(NumElems);
   APInt UndefMask = APInt::getZero(NumElems);
 
-  SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
+  SmallVector<MemSDNode *, 8> Loads(NumElems, nullptr);
   SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
 
   // For each element in the initializer, see if we've found a load, zero or an
@@ -7304,7 +7308,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   EVT EltBaseVT = EltBase.getValueType();
   assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
          "Register/Memory size mismatch");
-  LoadSDNode *LDBase = Loads[FirstLoadedElt];
+  MemSDNode *LDBase = Loads[FirstLoadedElt];
   assert(LDBase && "Did not find base load for merging consecutive loads");
   unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
   unsigned BaseSizeInBytes = BaseSizeInBits / 8;
@@ -7318,16 +7322,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
 
   // Check to see if the element's load is consecutive to the base load
   // or offset from a previous (already checked) load.
-  auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
-    LoadSDNode *Ld = Loads[EltIdx];
+  auto CheckConsecutiveLoad = [&](MemSDNode *Base, int EltIdx) {
+    MemSDNode *Ld = Loads[EltIdx];
     int64_t ByteOffset = ByteOffsets[EltIdx];
     if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
       int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
       return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
               Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
     }
-    return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
-                                              EltIdx - FirstLoadedElt);
+    auto *L = dyn_cast<LoadSDNode>(Ld);
+    auto *B = dyn_cast<LoadSDNode>(Base);
+    return L && B && DAG.areNonVolatileConsecutiveLoads(L, B, BaseSizeInBytes,
+                                                        EltIdx - FirstLoadedElt);
   };
 
   // Consecutive loads can contain UNDEFS but not ZERO elements.
@@ -7347,7 +7353,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
     }
   }
 
-  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
+  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, MemSDNode *LDBase) {
     auto MMOFlags = LDBase->getMemOperand()->getFlags();
     assert(LDBase->isSimple() &&
            "Cannot merge volatile or atomic loads.");
@@ -9452,8 +9458,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   {
     SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
     if (SDValue LD =
-            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
+            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) {
       return LD;
+    }
   }
 
   // If this is a splat of pairs of 32-bit elements, we can use a narrower
@@ -60388,6 +60395,35 @@ static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineVZEXT_LOAD(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  // Find the TokenFactor to locate the associated AtomicLoad.
+  SDNode *ALD = nullptr;
+  for (auto &TF : N->uses())
+    if (TF.getUser()->getOpcode() == ISD::TokenFactor) {
+      SDValue L = TF.getUser()->getOperand(0);
+      SDValue R = TF.getUser()->getOperand(1);
+      if (L.getNode() == N)
+        ALD = R.getNode();
+      else if (R.getNode() == N)
+        ALD = L.getNode();
+    }
+
+  if (!ALD)
+    return SDValue();
+  if (!isa<AtomicSDNode>(ALD))
+    return SDValue();
+
+  // Replace the VZEXT_LOAD with the AtomicLoad.
+  SDLoc dl(N);
+  SDValue SV =
+      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                  N->getValueType(0).changeTypeToInteger(), SDValue(ALD, 0));
+  SDValue BC = DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), SV);
+  BC = DCI.CombineTo(N, BC, SDValue(ALD, 1));
+  return BC;
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -60584,6 +60620,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
   case ISD::FP_TO_SINT_SAT:
   case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
+  case X86ISD::VZEXT_LOAD: return combineVZEXT_LOAD(N, DAG, DCI);
     // clang-format on
   }
 
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 6e2e9d4b21891..f72970d12b6eb 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -208,29 +208,12 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
 define <2 x half> @atomic_vec2_half(ptr %x) {
 ; CHECK3-LABEL: atomic_vec2_half:
 ; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movl (%rdi), %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    shrl $16, %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK3-NEXT:    retq
 ;
 ; CHECK0-LABEL: atomic_vec2_half:
 ; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movl (%rdi), %eax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    movw %cx, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm1
-; CHECK0-NEXT:    movw %ax, %cx
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK0-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK0-NEXT:    retq
   %ret = load atomic <2 x half>, ptr %x acquire, align 4
   ret <2 x half> %ret
@@ -239,29 +222,12 @@ define <2 x half> @atomic_vec2_half(ptr %x) {
 define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
 ; CHECK3-LABEL: atomic_vec2_bfloat:
 ; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movl (%rdi), %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    shrl $16, %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK3-NEXT:    retq
 ;
 ; CHECK0-LABEL: atomic_vec2_bfloat:
 ; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movl (%rdi), %eax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    ## kill: def $cx killed $cx killed $ecx
-; CHECK0-NEXT:    movw %ax, %dx
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %dx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK0-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK0-NEXT:    retq
   %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
   ret <2 x bfloat> %ret
@@ -440,110 +406,19 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
 }
 
 define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
-; CHECK3-LABEL: atomic_vec4_half:
-; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movq (%rdi), %rax
-; CHECK3-NEXT:    movl %eax, %ecx
-; CHECK3-NEXT:    shrl $16, %ecx
-; CHECK3-NEXT:    pinsrw $0, %ecx, %xmm1
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    movq %rax, %rcx
-; CHECK3-NEXT:    shrq $32, %rcx
-; CHECK3-NEXT:    pinsrw $0, %ecx, %xmm2
-; CHECK3-NEXT:    shrq $48, %rax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm3
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK3-NEXT:    retq
-;
-; CHECK0-LABEL: atomic_vec4_half:
-; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movq (%rdi), %rax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    movw %cx, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm2
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm2
-; CHECK0-NEXT:    movw %ax, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm0
-; CHECK0-NEXT:    movq %rax, %rcx
-; CHECK0-NEXT:    shrq $32, %rcx
-; CHECK0-NEXT:    movw %cx, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm1
-; CHECK0-NEXT:    shrq $48, %rax
-; CHECK0-NEXT:    movw %ax, %cx
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm3
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm3
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK0-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK0-NEXT:    retq
+; CHECK-LABEL: atomic_vec4_half:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq (%rdi), %xmm0
+; CHECK-NEXT:    retq
   %ret = load atomic <4 x half>, ptr %x acquire, align 8
   ret <4 x half> %ret
 }
 
 define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
-; CHECK3-LABEL: atomic_vec4_bfloat:
-; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movq (%rdi), %rax
-; CHECK3-NEXT:    movq %rax, %rcx
-; CHECK3-NEXT:    movq %rax, %rdx
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    ## kill: def $eax killed $eax killed $rax
-; CHECK3-NEXT:    shrl $16, %eax
-; CHECK3-NEXT:    shrq $32, %rcx
-; CHECK3-NEXT:    shrq $48, %rdx
-; CHECK3-NEXT:    pinsrw $0, %edx, %xmm1
-; CHECK3-NEXT:    pinsrw $0, %ecx, %xmm2
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK3-NEXT:    retq
-;
-; CHECK0-LABEL: atomic_vec4_bfloat:
-; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movq (%rdi), %rax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    ## kill: def $cx killed $cx killed $ecx
-; CHECK0-NEXT:    movw %ax, %dx
-; CHECK0-NEXT:    movq %rax, %rsi
-; CHECK0-NEXT:    shrq $32, %rsi
-; CHECK0-NEXT:    ## kill: def $si killed $si killed $rsi
-; CHECK0-NEXT:    shrq $48, %rax
-; CHECK0-NEXT:    movw %ax, %di
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %di, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %si, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %dx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm2
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm2
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK0-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK0-NEXT:    retq
+; CHECK-LABEL: atomic_vec4_bfloat:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq (%rdi), %xmm0
+; CHECK-NEXT:    retq
   %ret = load atomic <4 x bfloat>, ptr %x acquire, align 8
   ret <4 x bfloat> %ret
 }

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits