llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-selectiondag Author: Nikita Popov (nikic) <details> <summary>Changes</summary> Cherry-pick of 6420099bcc62a09e002e500870216b2dd9d256a9. --- Patch is 28.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/181147.diff 16 Files Affected: - (modified) llvm/include/llvm/CodeGen/SelectionDAG.h (+5) - (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+10-1) - (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+7) - (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+12-9) - (modified) llvm/test/CodeGen/AArch64/neon-dotreduce.ll (+3-2) - (modified) llvm/test/CodeGen/X86/insertelement-zero.ll (+40) - (modified) llvm/test/CodeGen/X86/pr134602.ll (+1-1) - (modified) llvm/test/CodeGen/X86/pr173924.ll (+1-1) - (modified) llvm/test/CodeGen/X86/vector-fshl-256.ll (+6) - (modified) llvm/test/CodeGen/X86/vector-fshl-512.ll (+4) - (modified) llvm/test/CodeGen/X86/vector-fshl-rot-256.ll (+5) - (modified) llvm/test/CodeGen/X86/vector-fshl-rot-512.ll (+4) - (modified) llvm/test/CodeGen/X86/vector-fshr-512.ll (+24-22) - (modified) llvm/test/CodeGen/X86/vector-fshr-rot-512.ll (+20-18) - (modified) llvm/test/CodeGen/X86/vector-rotate-256.ll (+5) - (modified) llvm/test/CodeGen/X86/vector-rotate-512.ll (+4) ``````````diff diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 604319095e74f..e70a59d9fb0fd 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1723,6 +1723,11 @@ class SelectionDAG { /// Return a freeze using the SDLoc of the value operand. LLVM_ABI SDValue getFreeze(SDValue V); + /// Return a freeze of V if any of the demanded elts may be undef or poison. + /// If \p PoisonOnly is true, then only check for poison elements. + LLVM_ABI SDValue getFreeze(SDValue V, const APInt &DemandedElts, + bool PoisonOnly = false); + /// Return an AssertAlignSDNode. LLVM_ABI SDValue getAssertAlign(const SDLoc &DL, SDValue V, Align A); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b071a12587826..2f8fe09c3dc98 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -24060,8 +24060,17 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { // Build the mask and return the corresponding DAG node. auto BuildMaskAndNode = [&](SDValue TrueVal, SDValue FalseVal, unsigned MaskOpcode) { - for (unsigned I = 0; I != NumElts; ++I) + APInt InsertedEltMask = APInt::getZero(NumElts); + for (unsigned I = 0; I != NumElts; ++I) { Mask[I] = Ops[I] ? TrueVal : FalseVal; + if (Ops[I]) + InsertedEltMask.setBit(I); + } + // Make sure to freeze the source vector in case any of the elements + // overwritten by the insert may be poison. Otherwise those elements + // could end up being poison instead of 0/-1 after the AND/OR. + CurVec = + DAG.getFreeze(CurVec, InsertedEltMask, /*PoisonOnly=*/true); return DAG.getNode(MaskOpcode, DL, VT, CurVec, DAG.getBuildVector(VT, DL, Mask)); }; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 8e609bc443da5..5a926b8bf6211 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2502,6 +2502,13 @@ SDValue SelectionDAG::getFreeze(SDValue V) { return getNode(ISD::FREEZE, SDLoc(V), V.getValueType(), V); } +SDValue SelectionDAG::getFreeze(SDValue V, const APInt &DemandedElts, + bool PoisonOnly) { + if (isGuaranteedNotToBeUndefOrPoison(V, DemandedElts, PoisonOnly)) + return V; + return getFreeze(V); +} + /// getShiftAmountOperand - Return the specified value casted to /// the target's desired shift amount type. SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index ff5e046f0fcd5..ee489f9fc74f9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3794,19 +3794,22 @@ bool TargetLowering::SimplifyDemandedVectorElts( if (SimplifyDemandedVectorElts(Op1, DemandedElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; - // If we know that a demanded element was zero in Op1 we don't need to - // demand it in Op0 - its guaranteed to be zero. - APInt DemandedElts0 = DemandedElts & ~SrcZero; - if (SimplifyDemandedVectorElts(Op0, DemandedElts0, KnownUndef, KnownZero, + // FIXME: If we know that a demanded element was zero in Op1 we don't need + // to demand it in Op0 - its guaranteed to be zero. There is however a + // restriction, as we must not make any of the originally demanded elements + // more poisonous. We could reduce amount of elements demanded, but then we + // also need a to inform SimplifyDemandedVectorElts that some elements must + // not be made more poisonous. + if (SimplifyDemandedVectorElts(Op0, DemandedElts, KnownUndef, KnownZero, TLO, Depth + 1)) return true; - KnownUndef &= DemandedElts0; - KnownZero &= DemandedElts0; + KnownUndef &= DemandedElts; + KnownZero &= DemandedElts; - // If every element pair has a zero/undef then just fold to zero. - // fold (and x, undef) -> 0 / (and x, 0) -> 0 - // fold (mul x, undef) -> 0 / (mul x, 0) -> 0 + // If every element pair has a zero/undef/poison then just fold to zero. + // fold (and x, undef/poison) -> 0 / (and x, 0) -> 0 + // fold (mul x, undef/poison) -> 0 / (mul x, 0) -> 0 if (DemandedElts.isSubsetOf(SrcZero | KnownZero | SrcUndef | KnownUndef)) return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index dbbe00c89eecf..8854d8ab80798 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -445,8 +445,9 @@ entry: define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) { ; CHECK-SD-LABEL: test_udot_v5i8_nomla: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: ldr x8, [x0] ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 +; CHECK-SD-NEXT: fmov d0, x8 ; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0 ; CHECK-SD-NEXT: mov v1.s[0], v2.s[0] @@ -2681,8 +2682,8 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) { ; CHECK-SD-NEXT: ldp q2, q1, [x0] ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 ; CHECK-SD-NEXT: ushll2 v3.8h, v1.16b, #0 -; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-SD-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-SD-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-SD-NEXT: uaddl2 v5.4s, v4.8h, v1.8h diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll index b66ad07c466e1..e1c8cefa73d8a 100644 --- a/llvm/test/CodeGen/X86/insertelement-zero.ll +++ b/llvm/test/CodeGen/X86/insertelement-zero.ll @@ -539,3 +539,43 @@ define <4 x i32> @PR41512_loads(ptr %p1, ptr %p2) { %r = shufflevector <4 x i32> %ins1, <4 x i32> %ins2, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x i32> %r } + +; Reproducer for bugs in DAGCombiner and SimplifyDemandedVectorElts. +; +; Problem was that DAGCombiner replaced INSERT_VECTOR_ELT by AND, without +; considering that %i has poison elements. So instead of overwriting those +; poison elements by inserting zeroes, we got "AND poison, 0" which is poison +; and not guaranteed to be folded as zero. +; +; When solving the above by inserting a FREEZE another bug +; surfaced. SimplifyDemandedVectorElts was not demanding elements that were +; known to be AND:ed by zero. So the FREEZE ended up being removed and we +; still got "AND poison, 0". +; +; Expected result is that the add reduction computes the sum 0+0+0+0+0+77+0+77 = 154. +define i64 @fold_insertelement_to_and(i32 noundef %arg) { +; SSE-LABEL: fold_insertelement_to_and: +; SSE: # %bb.0: +; SSE-NEXT: movl $154, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: fold_insertelement_to_and: +; AVX1: # %bb.0: +; AVX1-NEXT: movl $154, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_insertelement_to_and: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,77] +; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: retq + %i = shufflevector <8 x i64> zeroinitializer, <8 x i64> splat (i64 77), <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 4, i32 8, i32 6, i32 10> + %i1 = insertelement <8 x i64> %i, i64 0, i64 0 + %i2 = insertelement <8 x i64> %i1, i64 0, i64 2 + %i3 = shufflevector <8 x i64> %i2, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 5, i32 6, i32 7> + %i4 = tail call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %i3) + ret i64 %i4 +} diff --git a/llvm/test/CodeGen/X86/pr134602.ll b/llvm/test/CodeGen/X86/pr134602.ll index 063b6f31fe630..50efcde462532 100644 --- a/llvm/test/CodeGen/X86/pr134602.ll +++ b/llvm/test/CodeGen/X86/pr134602.ll @@ -17,7 +17,7 @@ define i32 @PR134602(i16 %a0) { ; X64-NEXT: movzwl %di, %eax ; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,2,2,2,4,5,6,7] ; X64-NEXT: paddw %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: psrld $16, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr173924.ll b/llvm/test/CodeGen/X86/pr173924.ll index f5059da10da7c..d130014a8fa62 100644 --- a/llvm/test/CodeGen/X86/pr173924.ll +++ b/llvm/test/CodeGen/X86/pr173924.ll @@ -7,7 +7,7 @@ define i256 @PR173924(<8 x i256> %a0) { ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovdqu {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r8d diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 2fadf5f101626..919450857171b 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -1004,6 +1004,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1015,6 +1016,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1026,6 +1028,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1037,6 +1040,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1057,6 +1061,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1092,6 +1097,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 +; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll index 34ad667f01171..fed534a7b9440 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -552,6 +552,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3 @@ -570,6 +571,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3 @@ -584,6 +586,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 @@ -601,6 +604,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 5f7e4070b3783..1f6df959f6d00 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -773,6 +773,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0 @@ -784,6 +785,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 @@ -795,6 +797,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 @@ -806,6 +809,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 @@ -817,6 +821,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index 4c6680ac4a19a..da2d41ee19d5a 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -308,6 +308,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm6 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm6, %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw %xmm1, %ymm4, %ymm2 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 @@ -325,6 +326,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm6 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm6, %ymm3 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 @@ -338,6 +340,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 @@ -349,6 +352,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll index 25f8f94eb834c..4257bcc0e3f99 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -547,36 +547,38 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5 -; AVX512F-NEXT: vpsrlw %xmm4, %ymm5, %ymm5 -; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpaddw %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3 +; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512F-NEXT: vpaddw %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw %xmm4, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpsllw %xmm4, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/181147 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
