https://github.com/nikic created 
https://github.com/llvm/llvm-project/pull/181147

Cherry-pick of 6420099bcc62a09e002e500870216b2dd9d256a9.

>From d0b6943132a40219d6253b0bb5dbf2e4e31cb4c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?= <[email protected]>
Date: Thu, 12 Feb 2026 10:58:29 +0100
Subject: [PATCH] [SelectionDAG] Make sure demanded lanes for AND/MUL-by-zero
 are frozen (#180727)

DAGCombiner can fold a chain of INSERT_VECTOR_ELT into a vector AND/OR
operation. This patch adds protection to avoid that we end up making the
vector more poisonous by freezing the source vector when the elements
that should be set to 0/-1 may be poison in the source vector.

The patch also fixes a bug in SimplifyDemandedVectorElts for
MUL/MULHU/MULHS/AND that could result in making the vector more
poisonous. Problem was that we skipped demanding elements from Op0 that
were known to be zero in Op1. But that could result in elements being
simplified into poison when simplifying Op0, and then the result would
be poison and not zero after the MUL/MULHU/MULHS/AND. The solution is to
defensively make sure that we demand all the elements originally
demanded also when simplifying Op0.

This bugs were found when analysing the miscompiles in
https://github.com/llvm/llvm-project/issues/179448

Main culprit in #179448 seems to have been the bug in DAGCombiner. The
bug in SimplifyDemandedVectorElts surfaced when fixing the DAGCombiner,
as that fix typically introduce the (AND (FREEZE x), y) pattern that
wasn't handled correctly in SimplifyDemandedVectorElts.

Also fixes #180409.
Also fixes #176682.

(cherry picked from commit 6420099bcc62a09e002e500870216b2dd9d256a9)
---
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  5 ++
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 ++++-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  7 +++
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 21 +++++----
 llvm/test/CodeGen/AArch64/neon-dotreduce.ll   |  5 +-
 llvm/test/CodeGen/X86/insertelement-zero.ll   | 40 ++++++++++++++++
 llvm/test/CodeGen/X86/pr134602.ll             |  2 +-
 llvm/test/CodeGen/X86/pr173924.ll             |  2 +-
 llvm/test/CodeGen/X86/vector-fshl-256.ll      |  6 +++
 llvm/test/CodeGen/X86/vector-fshl-512.ll      |  4 ++
 llvm/test/CodeGen/X86/vector-fshl-rot-256.ll  |  5 ++
 llvm/test/CodeGen/X86/vector-fshl-rot-512.ll  |  4 ++
 llvm/test/CodeGen/X86/vector-fshr-512.ll      | 46 ++++++++++---------
 llvm/test/CodeGen/X86/vector-fshr-rot-512.ll  | 38 +++++++--------
 llvm/test/CodeGen/X86/vector-rotate-256.ll    |  5 ++
 llvm/test/CodeGen/X86/vector-rotate-512.ll    |  4 ++
 16 files changed, 151 insertions(+), 54 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h 
b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 604319095e74f..e70a59d9fb0fd 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1723,6 +1723,11 @@ class SelectionDAG {
   /// Return a freeze using the SDLoc of the value operand.
   LLVM_ABI SDValue getFreeze(SDValue V);
 
+  /// Return a freeze of V if any of the demanded elts may be undef or poison.
+  /// If \p PoisonOnly is true, then only check for poison elements.
+  LLVM_ABI SDValue getFreeze(SDValue V, const APInt &DemandedElts,
+                             bool PoisonOnly = false);
+
   /// Return an AssertAlignSDNode.
   LLVM_ABI SDValue getAssertAlign(const SDLoc &DL, SDValue V, Align A);
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 
b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b071a12587826..2f8fe09c3dc98 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -24060,8 +24060,17 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) 
{
           // Build the mask and return the corresponding DAG node.
           auto BuildMaskAndNode = [&](SDValue TrueVal, SDValue FalseVal,
                                       unsigned MaskOpcode) {
-            for (unsigned I = 0; I != NumElts; ++I)
+            APInt InsertedEltMask = APInt::getZero(NumElts);
+            for (unsigned I = 0; I != NumElts; ++I) {
               Mask[I] = Ops[I] ? TrueVal : FalseVal;
+              if (Ops[I])
+                InsertedEltMask.setBit(I);
+            }
+            // Make sure to freeze the source vector in case any of the 
elements
+            // overwritten by the insert may be poison. Otherwise those 
elements
+            // could end up being poison instead of 0/-1 after the AND/OR.
+            CurVec =
+                DAG.getFreeze(CurVec, InsertedEltMask, /*PoisonOnly=*/true);
             return DAG.getNode(MaskOpcode, DL, VT, CurVec,
                                DAG.getBuildVector(VT, DL, Mask));
           };
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp 
b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 8e609bc443da5..5a926b8bf6211 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2502,6 +2502,13 @@ SDValue SelectionDAG::getFreeze(SDValue V) {
   return getNode(ISD::FREEZE, SDLoc(V), V.getValueType(), V);
 }
 
+SDValue SelectionDAG::getFreeze(SDValue V, const APInt &DemandedElts,
+                                bool PoisonOnly) {
+  if (isGuaranteedNotToBeUndefOrPoison(V, DemandedElts, PoisonOnly))
+    return V;
+  return getFreeze(V);
+}
+
 /// getShiftAmountOperand - Return the specified value casted to
 /// the target's desired shift amount type.
 SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp 
b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index ff5e046f0fcd5..ee489f9fc74f9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3794,19 +3794,22 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     if (SimplifyDemandedVectorElts(Op1, DemandedElts, SrcUndef, SrcZero, TLO,
                                    Depth + 1))
       return true;
-    // If we know that a demanded element was zero in Op1 we don't need to
-    // demand it in Op0 - its guaranteed to be zero.
-    APInt DemandedElts0 = DemandedElts & ~SrcZero;
-    if (SimplifyDemandedVectorElts(Op0, DemandedElts0, KnownUndef, KnownZero,
+    // FIXME: If we know that a demanded element was zero in Op1 we don't need
+    // to demand it in Op0 - its guaranteed to be zero. There is however a
+    // restriction, as we must not make any of the originally demanded elements
+    // more poisonous. We could reduce amount of elements demanded, but then we
+    // also need a to inform SimplifyDemandedVectorElts that some elements must
+    // not be made more poisonous.
+    if (SimplifyDemandedVectorElts(Op0, DemandedElts, KnownUndef, KnownZero,
                                    TLO, Depth + 1))
       return true;
 
-    KnownUndef &= DemandedElts0;
-    KnownZero &= DemandedElts0;
+    KnownUndef &= DemandedElts;
+    KnownZero &= DemandedElts;
 
-    // If every element pair has a zero/undef then just fold to zero.
-    // fold (and x, undef) -> 0  /  (and x, 0) -> 0
-    // fold (mul x, undef) -> 0  /  (mul x, 0) -> 0
+    // If every element pair has a zero/undef/poison then just fold to zero.
+    // fold (and x, undef/poison) -> 0  /  (and x, 0) -> 0
+    // fold (mul x, undef/poison) -> 0  /  (mul x, 0) -> 0
     if (DemandedElts.isSubsetOf(SrcZero | KnownZero | SrcUndef | KnownUndef))
       return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
 
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll 
b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index dbbe00c89eecf..8854d8ab80798 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -445,8 +445,9 @@ entry:
 define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-SD-LABEL: test_udot_v5i8_nomla:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    ldr x8, [x0]
 ; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT:    fmov d0, x8
 ; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0
 ; CHECK-SD-NEXT:    mov v1.s[0], v2.s[0]
@@ -2681,8 +2682,8 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly 
%a1) {
 ; CHECK-SD-NEXT:    ldp q2, q1, [x0]
 ; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-SD-NEXT:    ushll2 v3.8h, v1.16b, #0
-; CHECK-SD-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-SD-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-SD-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-SD-NEXT:    ushll2 v2.8h, v2.16b, #0
 ; CHECK-SD-NEXT:    ushll v3.4s, v3.4h, #0
 ; CHECK-SD-NEXT:    uaddl2 v5.4s, v4.8h, v1.8h
diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll 
b/llvm/test/CodeGen/X86/insertelement-zero.ll
index b66ad07c466e1..e1c8cefa73d8a 100644
--- a/llvm/test/CodeGen/X86/insertelement-zero.ll
+++ b/llvm/test/CodeGen/X86/insertelement-zero.ll
@@ -539,3 +539,43 @@ define <4 x i32> @PR41512_loads(ptr %p1, ptr %p2) {
   %r = shufflevector <4 x i32> %ins1, <4 x i32> %ins2, <4 x i32> <i32 0, i32 
1, i32 4, i32 5>
   ret <4 x i32> %r
 }
+
+; Reproducer for bugs in DAGCombiner and SimplifyDemandedVectorElts.
+;
+; Problem was that DAGCombiner replaced INSERT_VECTOR_ELT by AND, without
+; considering that %i has poison elements.  So instead of overwriting those
+; poison elements by inserting zeroes, we got "AND poison, 0" which is poison
+; and not guaranteed to be folded as zero.
+;
+; When solving the above by inserting a FREEZE another bug
+; surfaced. SimplifyDemandedVectorElts was not demanding elements that were
+; known to be AND:ed by zero. So the FREEZE ended up being removed and we
+; still got "AND poison, 0".
+;
+; Expected result is that the add reduction computes the sum 0+0+0+0+0+77+0+77 
= 154.
+define i64 @fold_insertelement_to_and(i32 noundef %arg) {
+; SSE-LABEL: fold_insertelement_to_and:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movl $154, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: fold_insertelement_to_and:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movl $154, %eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fold_insertelement_to_and:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [0,77]
+; AVX2-NEXT:    vpaddq %xmm0, %xmm0, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    retq
+  %i = shufflevector <8 x i64> zeroinitializer, <8 x i64> splat (i64 77), <8 x 
i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 4, i32 8, i32 6, i32 
10>
+  %i1 = insertelement <8 x i64> %i, i64 0, i64 0
+  %i2 = insertelement <8 x i64> %i1, i64 0, i64 2
+  %i3 = shufflevector <8 x i64> %i2, <8 x i64> poison, <8 x i32> <i32 0, i32 
0, i32 2, i32 2, i32 4, i32 5, i32 6, i32 7>
+  %i4 = tail call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %i3)
+  ret i64 %i4
+}
diff --git a/llvm/test/CodeGen/X86/pr134602.ll 
b/llvm/test/CodeGen/X86/pr134602.ll
index 063b6f31fe630..50efcde462532 100644
--- a/llvm/test/CodeGen/X86/pr134602.ll
+++ b/llvm/test/CodeGen/X86/pr134602.ll
@@ -17,7 +17,7 @@ define i32 @PR134602(i16 %a0) {
 ; X64-NEXT:    movzwl %di, %eax
 ; X64-NEXT:    movd %eax, %xmm0
 ; X64-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[2,2,2,2,4,5,6,7]
 ; X64-NEXT:    paddw %xmm0, %xmm1
 ; X64-NEXT:    movdqa %xmm1, %xmm0
 ; X64-NEXT:    psrld $16, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr173924.ll 
b/llvm/test/CodeGen/X86/pr173924.ll
index f5059da10da7c..d130014a8fa62 100644
--- a/llvm/test/CodeGen/X86/pr173924.ll
+++ b/llvm/test/CodeGen/X86/pr173924.ll
@@ -7,7 +7,7 @@ define i256 @PR173924(<8 x i256> %a0) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %edx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r8d
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll 
b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 2fadf5f101626..919450857171b 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -1004,6 +1004,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, 
<16 x i16> %y, <16 x i
 ; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
@@ -1015,6 +1016,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, 
<16 x i16> %y, <16 x i
 ; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
@@ -1026,6 +1028,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, 
<16 x i16> %y, <16 x i
 ; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
@@ -1037,6 +1040,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, 
<16 x i16> %y, <16 x i
 ; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
@@ -1057,6 +1061,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, 
<16 x i16> %y, <16 x i
 ; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512VLBW-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VLBW-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
@@ -1092,6 +1097,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, 
<16 x i16> %y, <16 x i
 ; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; XOPAVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; XOPAVX2-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; XOPAVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll 
b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index 34ad667f01171..fed534a7b9440 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -552,6 +552,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, 
<32 x i16> %y, <32 x i
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512F-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
@@ -570,6 +571,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, 
<32 x i16> %y, <32 x i
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512VL-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
@@ -584,6 +586,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, 
<32 x i16> %y, <32 x i
 ; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpsllw %xmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
@@ -601,6 +604,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, 
<32 x i16> %y, <32 x i
 ; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512VLBW-NEXT:    vpsrlw $1, %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpsllw %xmm2, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll 
b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 5f7e4070b3783..1f6df959f6d00 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -773,6 +773,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, 
<16 x i16> %amt) nounw
 ; AVX2-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm4
 ; AVX2-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
@@ -784,6 +785,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, 
<16 x i16> %amt) nounw
 ; AVX512F-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
@@ -795,6 +797,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, 
<16 x i16> %amt) nounw
 ; AVX512VL-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm4
 ; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
+; AVX512VL-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
@@ -806,6 +809,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, 
<16 x i16> %amt) nounw
 ; AVX512BW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512BW-NEXT:    vpsrlw $1, %ymm0, %ymm4
 ; AVX512BW-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
+; AVX512BW-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
@@ -817,6 +821,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, 
<16 x i16> %amt) nounw
 ; AVX512VLBW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512VLBW-NEXT:    vpsrlw $1, %ymm0, %ymm4
 ; AVX512VLBW-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll 
b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
index 4c6680ac4a19a..da2d41ee19d5a 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
@@ -308,6 +308,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, 
<32 x i16> %amt) nounw
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm6
 ; AVX512F-NEXT:    vpsrlw %xmm3, %ymm6, %ymm3
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
+; AVX512F-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpsllw %xmm1, %ymm4, %ymm2
 ; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
@@ -325,6 +326,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, 
<32 x i16> %amt) nounw
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm6
 ; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm6, %ymm3
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
+; AVX512VL-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpsllw %xmm1, %ymm4, %ymm2
 ; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
@@ -338,6 +340,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, 
<32 x i16> %amt) nounw
 ; AVX512BW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm4
 ; AVX512BW-NEXT:    vpsrlw %xmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
@@ -349,6 +352,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, 
<32 x i16> %amt) nounw
 ; AVX512VLBW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512VLBW-NEXT:    vpsrlw $1, %zmm0, %zmm4
 ; AVX512VLBW-NEXT:    vpsrlw %xmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll 
b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 25f8f94eb834c..4257bcc0e3f99 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -547,36 +547,38 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, 
<32 x i16> %y, <32 x i
 ; AVX512F-LABEL: splatvar_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
-; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
-; AVX512F-NEXT:    vpsrlw %xmm4, %ymm5, %ymm5
-; AVX512F-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT:    vpaddw %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
+; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
+; AVX512F-NEXT:    vpaddw %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsllw %xmm4, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
 ; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
-; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
-; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT:    vpaddw %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
+; AVX512VL-NEXT:    vpaddw %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsllw %xmm4, %ymm5, %ymm5
 ; AVX512VL-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
 ; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll 
b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
index 1d089e427bfad..c0dbbf0571c51 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -299,35 +299,37 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, 
<32 x i16> %amt) nounw
 ; AVX512F-LABEL: splatvar_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
-; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-NEXT:    vpsrlw %xmm3, %ymm4, %ymm5
-; AVX512F-NEXT:    vpsrlw %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT:    vpaddw %ymm4, %ymm4, %ymm5
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm5, %ymm5
+; AVX512F-NEXT:    vpaddw %ymm0, %ymm0, %ymm6
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm6, %ymm3
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
-; AVX512F-NEXT:    vpandn %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpaddw %ymm4, %ymm4, %ymm2
-; AVX512F-NEXT:    vpsllw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastw %xmm1, %xmm1
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpsrlw %xmm1, %ymm4, %ymm2
+; AVX512F-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
-; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm4, %ymm5
-; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpaddw %ymm4, %ymm4, %ymm5
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpaddw %ymm0, %ymm0, %ymm6
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm6, %ymm3
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
-; AVX512VL-NEXT:    vpandn %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpaddw %ymm4, %ymm4, %ymm2
-; AVX512VL-NEXT:    vpsllw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpbroadcastw %xmm1, %xmm1
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm4, %ymm2
+; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vporq %zmm0, %zmm3, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v32i16:
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll 
b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 5ae3e2f5d7621..6a6997e607648 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -664,6 +664,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, 
<16 x i16> %b) nounwind
 ; AVX2-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm4
 ; AVX2-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
@@ -675,6 +676,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, 
<16 x i16> %b) nounwind
 ; AVX512F-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
+; AVX512F-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
@@ -686,6 +688,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, 
<16 x i16> %b) nounwind
 ; AVX512VL-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm4
 ; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
+; AVX512VL-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
@@ -697,6 +700,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, 
<16 x i16> %b) nounwind
 ; AVX512BW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512BW-NEXT:    vpsrlw $1, %ymm0, %ymm4
 ; AVX512BW-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
+; AVX512BW-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
@@ -708,6 +712,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, 
<16 x i16> %b) nounwind
 ; AVX512VLBW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512VLBW-NEXT:    vpsrlw $1, %ymm0, %ymm4
 ; AVX512VLBW-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll 
b/llvm/test/CodeGen/X86/vector-rotate-512.ll
index 2cde988ed7762..f2622d9572f51 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-512.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll
@@ -321,6 +321,7 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, 
<32 x i16> %b) nounwind
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm6
 ; AVX512F-NEXT:    vpsrlw %xmm3, %ymm6, %ymm3
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
+; AVX512F-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpsllw %xmm1, %ymm4, %ymm2
 ; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
@@ -338,6 +339,7 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, 
<32 x i16> %b) nounwind
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm6
 ; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm6, %ymm3
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
+; AVX512VL-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpsllw %xmm1, %ymm4, %ymm2
 ; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
@@ -351,6 +353,7 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, 
<32 x i16> %b) nounwind
 ; AVX512BW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm4
 ; AVX512BW-NEXT:    vpsrlw %xmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
@@ -362,6 +365,7 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, 
<32 x i16> %b) nounwind
 ; AVX512VLBW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
 ; AVX512VLBW-NEXT:    vpsrlw $1, %zmm0, %zmm4
 ; AVX512VLBW-NEXT:    vpsrlw %xmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to