Author: Simon Pilgrim
Date: 2025-04-03T16:00:07+01:00
New Revision: 7f18a2fa9567050a245f3992963752a74cdff884

URL: https://github.com/llvm/llvm-project/commit/7f18a2fa9567050a245f3992963752a74cdff884
DIFF: https://github.com/llvm/llvm-project/commit/7f18a2fa9567050a245f3992963752a74cdff884.diff

LOG: Revert "[X86] SimplifyDemandedVectorEltsForTargetNode - reduce the size of VP…"

This reverts commit bf516098fb7c7d428cae03296b92766467f76c9e.

Added:

Modified:
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
    llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
    llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
    llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
    llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
    llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll

Removed:

################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d1be19539b642..546a2d22fa58e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43827,69 +43827,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     }
     break;
   }
-  case X86ISD::VPERMV: {
-    SmallVector<int, 16> Mask;
-    SmallVector<SDValue, 2> Ops;
-    if ((VT.is256BitVector() || Subtarget.hasVLX()) &&
-        getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
-      // For lane-crossing shuffles, only split in half in case we're still
-      // referencing higher elements.
-      unsigned HalfElts = NumElts / 2;
-      unsigned HalfSize = SizeInBits / 2;
-      Mask.resize(HalfElts);
-      if (all_of(Mask,
-                 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
-        MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
-        SDLoc DL(Op);
-        SDValue Ext;
-        SDValue M =
-            extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
-        SDValue V =
-            extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
-        // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
-        if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
-          Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
-        else
-          Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, HalfVT, V, M);
-        SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
-                                        Subtarget, TLO.DAG, DL, SizeInBits);
-        return TLO.CombineTo(Op, Insert);
-      }
-    }
-    break;
-  }
-  case X86ISD::VPERMV3: {
-    SmallVector<int, 16> Mask;
-    SmallVector<SDValue, 2> Ops;
-    if (Subtarget.hasVLX() &&
-        getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
-      // For lane-crossing shuffles, only split in half in case we're still
-      // referencing higher elements.
-      unsigned HalfElts = NumElts / 2;
-      unsigned HalfSize = SizeInBits / 2;
-      Mask.resize(HalfElts);
-      if (all_of(Mask, [&](int M) {
-            return isUndefOrInRange(M, 0, HalfElts) ||
-                   isUndefOrInRange(M, NumElts, NumElts + HalfElts);
-          })) {
-        // Adjust mask elements for 2nd operand to point to half width.
-        for (int &M : Mask)
-          M = M <= NumElts ?
M : (M - HalfElts); - MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT(); - MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger(); - SDLoc DL(Op); - SDValue Ext = TLO.DAG.getNode( - Opc, DL, HalfVT, - extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize), - getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true), - extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize)); - SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false, - Subtarget, TLO.DAG, DL, SizeInBits); - return TLO.CombineTo(Op, Insert); - } - } - break; - } case X86ISD::VPERM2X128: { // Simplify VPERM2F128/VPERM2I128 to extract_subvector. SDLoc DL(Op); diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index b075d48627b18..6f4e7abda8b00 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax ; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. 
; ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -4610,10 +4610,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15] -; AVX512F-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23] +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -4623,10 +4623,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15] -; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -4868,10 +4868,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] -; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -4881,10 +4881,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] -; AVX512DQ-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 61e122b1aba36..52f856befa130 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ 
b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. 
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll index a84466bc1ca1a..26af46263c0e2 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -1113,8 +1113,8 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind ; ; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8: ; AVX512VBMI-FAST: # %bb.0: -; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [64,65,66,67,68,69,24,28,32,36,40,44,48,52,56,79] -; AVX512VBMI-FAST-NEXT: vpmovdb %ymm0, %xmm2 +; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,79] +; AVX512VBMI-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VBMI-FAST-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 ; AVX512VBMI-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm0 ; AVX512VBMI-FAST-NEXT: vpextrw $6, %xmm0, %eax @@ -1124,14 +1124,14 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind ; ; AVX512VBMI-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8: ; AVX512VBMI-SLOW: # %bb.0: -; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,92,96,100,104,108,112,13,14,15] -; AVX512VBMI-SLOW-NEXT: vpmovdb %ymm0, %xmm2 -; AVX512VBMI-SLOW-NEXT: vpermt2b %zmm0, %zmm1, %zmm2 +; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,77,78,79] +; AVX512VBMI-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI-SLOW-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 ; AVX512VBMI-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 ; AVX512VBMI-SLOW-NEXT: vpextrw $6, %xmm0, %eax ; AVX512VBMI-SLOW-NEXT: vpextrw $4, %xmm0, %ecx ; AVX512VBMI-SLOW-NEXT: vpextrw $2, %xmm0, %edx -; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm2, %xmm0 +; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm1, %xmm0 ; AVX512VBMI-SLOW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; AVX512VBMI-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512VBMI-SLOW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 9b19ec15c6f55..739e6e2369e36 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -593,104 +593,100 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i16_stride5_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax -; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512BW-NEXT: 
vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 +; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-NEXT: vmovq %xmm3, (%rcx) ; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-NEXT: vmovq %xmm1, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride5_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax -; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride5_vf4: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax -; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 -; 
AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax -; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <20 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll index f41123c5c3cfd..05c111ae5049f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -123,8 +123,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa %xmm3, 64(%r9) @@ -140,8 +140,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm3, 64(%r9) @@ -157,8 +157,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa %xmm3, 64(%r9) @@ -174,8 +174,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 64(%r9) @@ -191,8 +191,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa %xmm3, 64(%r9) @@ -208,8 +208,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9) @@ -225,8 +225,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512DQ-BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, 64(%r9) @@ -242,8 +242,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index aac6a1bddd08a..c2f1723d8031e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -139,12 +139,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -158,12 +158,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -177,12 +177,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -196,12 +196,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -215,12 +215,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -234,12 +234,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -253,12 +253,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512DQ-BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, 
%zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -272,12 +272,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll index f5cd3e580d017..ec09c3117c77f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll @@ -187,8 +187,10 @@ define <8 x i32> @concat_vrotlv_v4i32(<4 x i32> %a0, <4 x i32> %a1, <8 x i32> %a define <8 x i16> @demandedelts_vpermvar_32i16_v8i16(<32 x i16> %x0) { ; CHECK-LABEL: demandedelts_vpermvar_32i16_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [7,0,6,1,5,2,4,3] -; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,0,6,1,5,2,4,3,7,0,6,1,5,2,4,3,7,0,6,1,5,2,4,3,7,0,6,1,5,2,4,3] +; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} %shuffle = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> <i16 7, i16 0, i16 6, i16 1, i16 5, i16 2, i16 4, i16 3, i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8>) diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index ea0e3b3a2b9aa..35f25d36cb2e9 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. 
; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax ; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index a3e2fb5321f32..a598e30845579 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. 
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits