https://github.com/nikic created https://github.com/llvm/llvm-project/pull/181147
Cherry-pick of 6420099bcc62a09e002e500870216b2dd9d256a9. >From d0b6943132a40219d6253b0bb5dbf2e4e31cb4c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?= <[email protected]> Date: Thu, 12 Feb 2026 10:58:29 +0100 Subject: [PATCH] [SelectionDAG] Make sure demanded lanes for AND/MUL-by-zero are frozen (#180727) DAGCombiner can fold a chain of INSERT_VECTOR_ELT into a vector AND/OR operation. This patch adds protection to avoid that we end up making the vector more poisonous by freezing the source vector when the elements that should be set to 0/-1 may be poison in the source vector. The patch also fixes a bug in SimplifyDemandedVectorElts for MUL/MULHU/MULHS/AND that could result in making the vector more poisonous. Problem was that we skipped demanding elements from Op0 that were known to be zero in Op1. But that could result in elements being simplified into poison when simplifying Op0, and then the result would be poison and not zero after the MUL/MULHU/MULHS/AND. The solution is to defensively make sure that we demand all the elements originally demanded also when simplifying Op0. These bugs were found when analysing the miscompiles in https://github.com/llvm/llvm-project/issues/179448 Main culprit in #179448 seems to have been the bug in DAGCombiner. The bug in SimplifyDemandedVectorElts surfaced when fixing the DAGCombiner, as that fix typically introduces the (AND (FREEZE x), y) pattern that wasn't handled correctly in SimplifyDemandedVectorElts. Also fixes #180409. Also fixes #176682. 
(cherry picked from commit 6420099bcc62a09e002e500870216b2dd9d256a9) --- llvm/include/llvm/CodeGen/SelectionDAG.h | 5 ++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 ++++- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 7 +++ .../CodeGen/SelectionDAG/TargetLowering.cpp | 21 +++++---- llvm/test/CodeGen/AArch64/neon-dotreduce.ll | 5 +- llvm/test/CodeGen/X86/insertelement-zero.ll | 40 ++++++++++++++++ llvm/test/CodeGen/X86/pr134602.ll | 2 +- llvm/test/CodeGen/X86/pr173924.ll | 2 +- llvm/test/CodeGen/X86/vector-fshl-256.ll | 6 +++ llvm/test/CodeGen/X86/vector-fshl-512.ll | 4 ++ llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 5 ++ llvm/test/CodeGen/X86/vector-fshl-rot-512.ll | 4 ++ llvm/test/CodeGen/X86/vector-fshr-512.ll | 46 ++++++++++--------- llvm/test/CodeGen/X86/vector-fshr-rot-512.ll | 38 +++++++-------- llvm/test/CodeGen/X86/vector-rotate-256.ll | 5 ++ llvm/test/CodeGen/X86/vector-rotate-512.ll | 4 ++ 16 files changed, 151 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 604319095e74f..e70a59d9fb0fd 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1723,6 +1723,11 @@ class SelectionDAG { /// Return a freeze using the SDLoc of the value operand. LLVM_ABI SDValue getFreeze(SDValue V); + /// Return a freeze of V if any of the demanded elts may be undef or poison. + /// If \p PoisonOnly is true, then only check for poison elements. + LLVM_ABI SDValue getFreeze(SDValue V, const APInt &DemandedElts, + bool PoisonOnly = false); + /// Return an AssertAlignSDNode. 
LLVM_ABI SDValue getAssertAlign(const SDLoc &DL, SDValue V, Align A); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b071a12587826..2f8fe09c3dc98 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -24060,8 +24060,17 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { // Build the mask and return the corresponding DAG node. auto BuildMaskAndNode = [&](SDValue TrueVal, SDValue FalseVal, unsigned MaskOpcode) { - for (unsigned I = 0; I != NumElts; ++I) + APInt InsertedEltMask = APInt::getZero(NumElts); + for (unsigned I = 0; I != NumElts; ++I) { Mask[I] = Ops[I] ? TrueVal : FalseVal; + if (Ops[I]) + InsertedEltMask.setBit(I); + } + // Make sure to freeze the source vector in case any of the elements + // overwritten by the insert may be poison. Otherwise those elements + // could end up being poison instead of 0/-1 after the AND/OR. + CurVec = + DAG.getFreeze(CurVec, InsertedEltMask, /*PoisonOnly=*/true); return DAG.getNode(MaskOpcode, DL, VT, CurVec, DAG.getBuildVector(VT, DL, Mask)); }; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 8e609bc443da5..5a926b8bf6211 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2502,6 +2502,13 @@ SDValue SelectionDAG::getFreeze(SDValue V) { return getNode(ISD::FREEZE, SDLoc(V), V.getValueType(), V); } +SDValue SelectionDAG::getFreeze(SDValue V, const APInt &DemandedElts, + bool PoisonOnly) { + if (isGuaranteedNotToBeUndefOrPoison(V, DemandedElts, PoisonOnly)) + return V; + return getFreeze(V); +} + /// getShiftAmountOperand - Return the specified value casted to /// the target's desired shift amount type. 
SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index ff5e046f0fcd5..ee489f9fc74f9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3794,19 +3794,22 @@ bool TargetLowering::SimplifyDemandedVectorElts( if (SimplifyDemandedVectorElts(Op1, DemandedElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; - // If we know that a demanded element was zero in Op1 we don't need to - // demand it in Op0 - its guaranteed to be zero. - APInt DemandedElts0 = DemandedElts & ~SrcZero; - if (SimplifyDemandedVectorElts(Op0, DemandedElts0, KnownUndef, KnownZero, + // FIXME: If we know that a demanded element was zero in Op1 we don't need + // to demand it in Op0 - its guaranteed to be zero. There is however a + // restriction, as we must not make any of the originally demanded elements + // more poisonous. We could reduce amount of elements demanded, but then we + // also need a to inform SimplifyDemandedVectorElts that some elements must + // not be made more poisonous. + if (SimplifyDemandedVectorElts(Op0, DemandedElts, KnownUndef, KnownZero, TLO, Depth + 1)) return true; - KnownUndef &= DemandedElts0; - KnownZero &= DemandedElts0; + KnownUndef &= DemandedElts; + KnownZero &= DemandedElts; - // If every element pair has a zero/undef then just fold to zero. - // fold (and x, undef) -> 0 / (and x, 0) -> 0 - // fold (mul x, undef) -> 0 / (mul x, 0) -> 0 + // If every element pair has a zero/undef/poison then just fold to zero. 
+ // fold (and x, undef/poison) -> 0 / (and x, 0) -> 0 + // fold (mul x, undef/poison) -> 0 / (mul x, 0) -> 0 if (DemandedElts.isSubsetOf(SrcZero | KnownZero | SrcUndef | KnownUndef)) return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index dbbe00c89eecf..8854d8ab80798 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -445,8 +445,9 @@ entry: define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) { ; CHECK-SD-LABEL: test_udot_v5i8_nomla: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: ldr x8, [x0] ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 +; CHECK-SD-NEXT: fmov d0, x8 ; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0 ; CHECK-SD-NEXT: mov v1.s[0], v2.s[0] @@ -2681,8 +2682,8 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) { ; CHECK-SD-NEXT: ldp q2, q1, [x0] ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 ; CHECK-SD-NEXT: ushll2 v3.8h, v1.16b, #0 -; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-SD-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-SD-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0 ; CHECK-SD-NEXT: uaddl2 v5.4s, v4.8h, v1.8h diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll index b66ad07c466e1..e1c8cefa73d8a 100644 --- a/llvm/test/CodeGen/X86/insertelement-zero.ll +++ b/llvm/test/CodeGen/X86/insertelement-zero.ll @@ -539,3 +539,43 @@ define <4 x i32> @PR41512_loads(ptr %p1, ptr %p2) { %r = shufflevector <4 x i32> %ins1, <4 x i32> %ins2, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x i32> %r } + +; Reproducer for bugs in DAGCombiner and SimplifyDemandedVectorElts. +; +; Problem was that DAGCombiner replaced INSERT_VECTOR_ELT by AND, without +; considering that %i has poison elements. 
So instead of overwriting those +; poison elements by inserting zeroes, we got "AND poison, 0" which is poison +; and not guaranteed to be folded as zero. +; +; When solving the above by inserting a FREEZE another bug +; surfaced. SimplifyDemandedVectorElts was not demanding elements that were +; known to be AND:ed by zero. So the FREEZE ended up being removed and we +; still got "AND poison, 0". +; +; Expected result is that the add reduction computes the sum 0+0+0+0+0+77+0+77 = 154. +define i64 @fold_insertelement_to_and(i32 noundef %arg) { +; SSE-LABEL: fold_insertelement_to_and: +; SSE: # %bb.0: +; SSE-NEXT: movl $154, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: fold_insertelement_to_and: +; AVX1: # %bb.0: +; AVX1-NEXT: movl $154, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_insertelement_to_and: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,77] +; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: retq + %i = shufflevector <8 x i64> zeroinitializer, <8 x i64> splat (i64 77), <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 4, i32 8, i32 6, i32 10> + %i1 = insertelement <8 x i64> %i, i64 0, i64 0 + %i2 = insertelement <8 x i64> %i1, i64 0, i64 2 + %i3 = shufflevector <8 x i64> %i2, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 5, i32 6, i32 7> + %i4 = tail call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %i3) + ret i64 %i4 +} diff --git a/llvm/test/CodeGen/X86/pr134602.ll b/llvm/test/CodeGen/X86/pr134602.ll index 063b6f31fe630..50efcde462532 100644 --- a/llvm/test/CodeGen/X86/pr134602.ll +++ b/llvm/test/CodeGen/X86/pr134602.ll @@ -17,7 +17,7 @@ define i32 @PR134602(i16 %a0) { ; X64-NEXT: movzwl %di, %eax ; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm0[2,2,2,2,4,5,6,7] ; X64-NEXT: paddw %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: psrld $16, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr173924.ll b/llvm/test/CodeGen/X86/pr173924.ll index f5059da10da7c..d130014a8fa62 100644 --- a/llvm/test/CodeGen/X86/pr173924.ll +++ b/llvm/test/CodeGen/X86/pr173924.ll @@ -7,7 +7,7 @@ define i256 @PR173924(<8 x i256> %a0) { ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovdqu {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r8d diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 2fadf5f101626..919450857171b 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -1004,6 +1004,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1015,6 +1016,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1026,6 +1028,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VL-NEXT: vpand 
%xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1037,6 +1040,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1057,6 +1061,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1092,6 +1097,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 +; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll index 34ad667f01171..fed534a7b9440 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -552,6 +552,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3 @@ -570,6 +571,7 @@ define 
<32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3 @@ -584,6 +586,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 @@ -601,6 +604,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 5f7e4070b3783..1f6df959f6d00 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -773,6 +773,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0 @@ -784,6 +785,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: 
vpsrlw $1, %ymm0, %ymm4 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 @@ -795,6 +797,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 @@ -806,6 +809,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 @@ -817,6 +821,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index 4c6680ac4a19a..da2d41ee19d5a 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -308,6 +308,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm6 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm6, %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 ; 
AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw %xmm1, %ymm4, %ymm2 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 @@ -325,6 +326,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm6 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm6, %ymm3 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 @@ -338,6 +340,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 @@ -349,6 +352,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll index 25f8f94eb834c..4257bcc0e3f99 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -547,36 +547,38 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5 -; AVX512F-NEXT: vpsrlw %xmm4, %ymm5, %ymm5 -; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, 
%ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpaddw %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3 +; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512F-NEXT: vpaddw %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw %xmm4, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpsllw %xmm4, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5 -; AVX512VL-NEXT: vpsrlw %xmm4, %ymm5, %ymm5 -; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-NEXT: vpaddw %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3 +; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512VL-NEXT: vpaddw %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw %xmm4, %ymm5, %ymm5 ; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512VL-NEXT: 
vpbroadcastw %xmm2, %xmm2 +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll index 1d089e427bfad..c0dbbf0571c51 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -299,35 +299,37 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] -; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 -; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpaddw %ymm4, %ymm4, %ymm5 +; AVX512F-NEXT: vpsllw %xmm3, %ymm5, %ymm5 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm6 +; AVX512F-NEXT: vpsllw %xmm3, %ymm6, %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpaddw %ymm4, %ymm4, %ymm2 -; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpsrlw %xmm1, %ymm4, %ymm2 +; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] -; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 +; 
AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 -; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm5 -; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpaddw %ymm4, %ymm4, %ymm5 +; AVX512VL-NEXT: vpsllw %xmm3, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm6 +; AVX512VL-NEXT: vpsllw %xmm3, %ymm6, %ymm3 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpaddw %ymm4, %ymm4, %ymm2 -; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm4, %ymm2 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index 5ae3e2f5d7621..6a6997e607648 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -664,6 +664,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0 @@ -675,6 +676,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; 
AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 @@ -686,6 +688,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 @@ -697,6 +700,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 @@ -708,6 +712,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll index 2cde988ed7762..f2622d9572f51 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -321,6 +321,7 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm6 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm6, %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw %xmm1, %ymm4, %ymm2 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 @@ -338,6 +339,7 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> 
%b) nounwind ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm6 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm6, %ymm3 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 @@ -351,6 +353,7 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 @@ -362,6 +365,7 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
