Author: Simon Pilgrim
Date: 2020-12-13T17:18:07Z
New Revision: d5c434d7dda25909cd7886e419baf3db3578953e
URL: https://github.com/llvm/llvm-project/commit/d5c434d7dda25909cd7886e419baf3db3578953e
DIFF: https://github.com/llvm/llvm-project/commit/d5c434d7dda25909cd7886e419baf3db3578953e.diff

LOG: [X86][SSE] combineX86ShufflesRecursively - add basic handling for combining shuffles of different widths (PR45974)

If a faux shuffle uses smaller shuffle inputs, try to recursively combine with those inputs directly instead of widening them immediately. Then widen all smaller inputs at the bottom of the recursion.

This will still mean we're generating nodes on the fly (PR45974) even if we don't combine to a new shuffle, but it does help AVX2+ targets combine across xmm/ymm/zmm types, mainly as variable shuffles.

Added: 

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/min-legal-vector-width.ll
    llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
    llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
    llvm/test/CodeGen/X86/vector-shuffle-v1.ll
    llvm/test/CodeGen/X86/x86-interleaved-access.ll

Removed: 


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 04987a2b9abe..b4a397080284 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36135,8 +36135,8 @@ static SDValue combineX86ShufflesRecursively(
   if (!VT.isVector() || !VT.isSimple())
     return SDValue(); // Bail if we hit a non-simple non-vector.

-  assert(VT.getSizeInBits() == RootSizeInBits &&
-         "Can only combine shuffles of the same vector register size.");
+  assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
+         "Can only combine shuffles upto size of the root op.");

   // Extract target shuffle mask and resolve sentinels and inputs.
   // TODO - determine Op's demanded elts from RootMask.
@@ -36149,17 +36149,32 @@ static SDValue combineX86ShufflesRecursively(
                               OpZero, DAG, Depth, false))
     return SDValue();

-  // Shuffle inputs must be the same size as the result, bail on any larger
-  // inputs and widen any smaller inputs.
-  if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) {
-        return Op.getValueSizeInBits() > RootSizeInBits;
+  // Shuffle inputs must not be larger than the shuffle result.
+  // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
+  if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
+        return OpInput.getValueSizeInBits() > VT.getSizeInBits();
       }))
     return SDValue();

-  for (SDValue &Op : OpInputs)
-    if (Op.getValueSizeInBits() < RootSizeInBits)
-      Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG,
-                          SDLoc(Op), RootSizeInBits);
+  // If the shuffle result was smaller than the root, we need to adjust the
+  // mask indices and pad the mask with undefs.
+  if (RootSizeInBits > VT.getSizeInBits()) {
+    unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
+    unsigned OpMaskSize = OpMask.size();
+    if (OpInputs.size() > 1) {
+      unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
+      for (int &M : OpMask) {
+        if (M < 0)
+          continue;
+        int EltIdx = M % OpMaskSize;
+        int OpIdx = M / OpMaskSize;
+        M = (PaddedMaskSize * OpIdx) + EltIdx;
+      }
+    }
+    OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
+    OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
+    OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
+  }

   SmallVector<int, 64> Mask;
   SmallVector<SDValue, 16> Ops;
@@ -36337,6 +36352,18 @@ static SDValue combineX86ShufflesRecursively(
     }
   }

+  // Widen any subvector shuffle inputs we've collected.
+  if (any_of(Ops, [RootSizeInBits](SDValue Op) {
+        return Op.getValueSizeInBits() < RootSizeInBits;
+      })) {
+    for (SDValue &Op : Ops)
+      if (Op.getValueSizeInBits() < RootSizeInBits)
+        Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
+                            RootSizeInBits);
+    // Reresolve - we might have repeated subvector sources.
+    resolveTargetShuffleInputsAndMask(Ops, Mask);
+  }
+
   // Attempt to constant fold all of the constant source ops.
   if (SDValue Cst = combineX86ShufflesConstants(
           Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index a39fbf878fd9..5456cd2e753a 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1682,25 +1682,45 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-leg
 }

 define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: splatvar_rotate_v32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastb %xmm1, %xmm1
-; CHECK-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpsllw %xmm2, %ymm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; CHECK-NEXT: vpsubb %xmm1, %xmm4, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpsllw %xmm2, %xmm4, %xmm2
-; CHECK-NEXT: vpbroadcastb %xmm2, %ymm2
-; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm5
-; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm2
-; CHECK-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
-; CHECK-NEXT: vpsrlw $8, %xmm0, %xmm0
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
-; CHECK-NEXT: vpternlogq $236, %ymm5, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-AVX512-LABEL: splatvar_rotate_v32i8:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
+; CHECK-AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-AVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm3
+; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; CHECK-AVX512-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; CHECK-AVX512-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; CHECK-AVX512-NEXT: vpsllw %xmm2, %xmm4, %xmm2
+; CHECK-AVX512-NEXT: vpbroadcastb %xmm2, %ymm2
+; CHECK-AVX512-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm5
+; CHECK-AVX512-NEXT: vpand %ymm2, %ymm3, %ymm2
+; CHECK-AVX512-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; CHECK-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpbroadcastb %xmm0, %ymm0
+; CHECK-AVX512-NEXT: vpternlogq $236, %ymm5, %ymm2, %ymm0
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-VBMI-LABEL: splatvar_rotate_v32i8:
+; CHECK-VBMI: # %bb.0:
+; CHECK-VBMI-NEXT: vpbroadcastb %xmm1, %xmm1
+; CHECK-VBMI-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-VBMI-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-VBMI-NEXT: vpsllw %xmm2, %ymm0, %ymm3
+; CHECK-VBMI-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; CHECK-VBMI-NEXT: vpsllw %xmm2, %xmm4, %xmm2
+; CHECK-VBMI-NEXT: vpbroadcastb %xmm2, %ymm2
+; CHECK-VBMI-NEXT: vpand %ymm2, %ymm3, %ymm2
+; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; CHECK-VBMI-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; CHECK-VBMI-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-VBMI-NEXT: vpsrlw %xmm1, %ymm0, %ymm3
+; CHECK-VBMI-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; CHECK-VBMI-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0
+; CHECK-VBMI-NEXT: retq
 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
 %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
 %shl = shl <32 x i8> %a, %splat
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 5eb4b1039bf9..19ea28085d75 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -59,18 +59,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
 ; XOPAVX1: # %bb.0:
@@ -115,18 +108,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
 ; XOPAVX1: # %bb.0:
@@ -171,18 +157,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
 ; XOPAVX1: # %bb.0:
@@ -218,17 +197,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 ; AVX2-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
 ; XOPAVX1: # %bb.0:
@@ -262,17 +235,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 ; AVX2-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
 ; XOPAVX1: # %bb.0:
@@ -306,17 +273,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 ; AVX2-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
 ; XOPAVX1: # %bb.0:
@@ -350,17 +311,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 ; AVX2-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
 ; XOPAVX1: # %bb.0:
@@ -1040,19 +995,12 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_0
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
 ; XOPAVX1: # %bb.0:
@@ -1094,19 +1042,12 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_0
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
 ; XOPAVX1: # %bb.0:
@@ -1148,19 +1089,12 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_0
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
 ; XOPAVX1: # %bb.0:
@@ -1192,18 +1126,12 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 ; AVX2-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0,0,0,0,4,0,0,0,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
 ; XOPAVX1: # %bb.0:
@@ -1233,18 +1161,12 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 ; AVX2-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
 ; XOPAVX1: # %bb.0:
@@ -1274,18 +1196,12 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 ; AVX2-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
 ; XOPAVX1: # %bb.0:
@@ -1315,18 +1231,12 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 ; AVX2-NEXT: retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
 ; XOPAVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 4c8c8f6312f1..7bbff8767922 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -272,11 +272,23 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
-; AVX2OR512VL-LABEL: shuffle_v8f32_08080808:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2OR512VL-NEXT: vbroadcastsd %xmm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8f32_08080808:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_08080808:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8f32_08080808:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [34359738368,34359738368,34359738368,34359738368]
+; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
 ret <8 x float> %shuffle
 }
@@ -1617,11 +1629,23 @@ define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
-; AVX2OR512VL-LABEL: shuffle_v8i32_08080808:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2OR512VL-NEXT: vbroadcastsd %xmm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8i32_08080808:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_08080808:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i32_08080808:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [34359738368,34359738368,34359738368,34359738368]
+; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
 ret <8 x i32> %shuffle
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
index 6d8306bf01bb..015f2e9bfe24 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -500,8 +500,8 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
 ; AVX512VL-NEXT: kmovw %edi, %k1
 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2]
+; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1
 ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -513,8 +513,8 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
 ; VL_BW_DQ: # %bb.0:
 ; VL_BW_DQ-NEXT: kmovd %edi, %k0
 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
-; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %ymm0
+; VL_BW_DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
+; VL_BW_DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
 ; VL_BW_DQ-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index a540d04626ae..42808b3910e4 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -810,85 +810,79 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
 ;
 ; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm3
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2
-; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
-; AVX512-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3
-; AVX512-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10
 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm0
-; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm12
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm0
+; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5
-; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm6
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vmovdqa (%rdi), %xmm12
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm13
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm4
+; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm6
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT: vmovdqa (%rdi), %xmm13
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm6
 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm7
 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm5
-; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm5
-; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm4
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4
-; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm5
+; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm2
+; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm3
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm5
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm5, %xmm11, %xmm6
-; AVX512-NEXT: vpshufb %xmm5, %xmm10, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm4
-; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm4
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm4
-; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4
-; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5
-; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm6
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm1
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm2
 ; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm3
-; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm4
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT: vpcmpeqb %zmm8, %zmm9, %k0
-; AVX512-NEXT: vpcmpeqb %zmm0, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqb %zmm0, %zmm15, %k1
 ; AVX512-NEXT: kxnord %k1, %k0, %k0
 ; AVX512-NEXT: vpmovm2b %k0, %zmm0
 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
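For readers following the mask arithmetic in the X86ISelLowering.cpp hunk above, here is a minimal standalone sketch of the same remap-and-pad step (plain C++ with made-up values; OpMask, OpMaskSize, NumSubVecs and the -1 undef sentinel stand in for the LLVM equivalents - this is not the committed code):

#include <cstdio>
#include <vector>

int main() {
  // A two-input v4 shuffle: indices 0-3 pick from input 0, 4-7 from input 1.
  // The root is twice as wide, so NumSubVecs == 2.
  const int OpMaskSize = 4;
  const int NumSubVecs = 2;
  const int PaddedMaskSize = NumSubVecs * OpMaskSize;
  std::vector<int> OpMask = {1, 6, -1, 4}; // -1 plays SM_SentinelUndef

  // Remap each defined index: once the inputs are widened, input N's
  // elements start at N * PaddedMaskSize instead of N * OpMaskSize.
  for (int &M : OpMask) {
    if (M < 0)
      continue;
    int EltIdx = M % OpMaskSize;
    int OpIdx = M / OpMaskSize;
    M = PaddedMaskSize * OpIdx + EltIdx;
  }

  // Pad the mask out to the root width with undef sentinels; the root mask
  // never demands the upper elements of the widened result.
  OpMask.insert(OpMask.end(), (NumSubVecs - 1) * OpMaskSize, -1);

  for (int M : OpMask)
    std::printf("%d ", M); // prints: 1 10 -1 8 -1 -1 -1 -1
  std::printf("\n");
}

This mirrors why the patch can defer widening to the bottom of the recursion: the adjusted mask already addresses each (still narrow) input as if it owned a full root-width register, so widenSubVector only has to concatenate undef bits on at the end.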