https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/201271
>From 7bebe28727d9c6722239e5da3da4080837bc6bd1 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak <[email protected]> Date: Sat, 30 May 2026 02:19:01 +0000 Subject: [PATCH] [SelectionDAG] Fold extracts of subvector inserts Fold extract_subvector(insert_subvector(...)) when the extraction is outside the inserted subvector or the inserted subvector only amends the extracted sub-part. In particular, 1. vA extract_subvector (vB insert_subvector(vB X, vC Y, C1), C2) => vA extract_subvector(X, C2) when [C2, C2 + A) intersect [C1, C1 + C) is the empty set 2. ... => extract_subvector(Y, C2 - C1) if [C2, C2 + A) is a subset of [C1, C1 + C) - an existing simplification 3. ... => vA insert_subvector(vA extract_subvector(vB X, C2), vC Y, C1 - C2) if [C1, C1 + C) is a subset of [C2, C2 + A) - that is, if you're only updating the extracted sub-part. Adds a regression test for an infinite SelectionDAG cycle that is fixed by a stack of commits that ends with this one. AI note: an LLM generated the code and the test, I've read them Co-Authored-By: OpenAI Codex <[email protected]> --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 35 ++++- ...agcombine-freeze-extract-subvector-loop.ll | 45 ++++++ .../CodeGen/X86/dagcombine-extract-insert.ll | 47 ++++--- .../vector-interleaved-store-i16-stride-3.ll | 12 +- .../vector-interleaved-store-i16-stride-6.ll | 92 ++++++------- .../vector-interleaved-store-i64-stride-6.ll | 128 ++++++++++-------- .../vector-interleaved-store-i8-stride-6.ll | 12 +- .../CodeGen/X86/vector-replicaton-i1-mask.ll | 32 ++--- 8 files changed, 237 insertions(+), 166 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/dagcombine-freeze-extract-subvector-loop.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0dcaeb5b22c9a..2cf455c89a4f4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -27590,20 +27590,41 @@ SDValue 
DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { return DAG.getSplatVector(NVT, DL, V.getOperand(0)); // extract_subvector(insert_subvector(x,y,c1),c2) + // --> extract_subvector(x,c2) + // iff we're extracting wholly outside the inserted subvector. + // // --> extract_subvector(y,c2-c1) - // iff we're just extracting from the inserted subvector. + // iff we're extracting wholly from the inserted subvector. + // + // --> insert_subvector(extract_subvector(x,c2), y, c1-c2) + // iff the inserted subvector is wholly contained by the extraction. if (V.getOpcode() == ISD::INSERT_SUBVECTOR) { + SDValue Src = V.getOperand(0); SDValue InsSub = V.getOperand(1); EVT InsSubVT = InsSub.getValueType(); unsigned NumInsElts = InsSubVT.getVectorMinNumElements(); unsigned InsIdx = V.getConstantOperandVal(2); unsigned NumSubElts = NVT.getVectorMinNumElements(); - if (InsIdx <= ExtIdx && (ExtIdx + NumSubElts) <= (InsIdx + NumInsElts) && - TLI.isExtractSubvectorCheap(NVT, InsSubVT, ExtIdx - InsIdx) && - InsSubVT.isFixedLengthVector() && NVT.isFixedLengthVector() && - V.getValueType().isFixedLengthVector()) - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, InsSub, - DAG.getVectorIdxConstant(ExtIdx - InsIdx, DL)); + if (InsSubVT.isFixedLengthVector() && NVT.isFixedLengthVector() && + V.getValueType().isFixedLengthVector()) { + uint64_t ExtEnd = ExtIdx + NumSubElts; + uint64_t InsEnd = InsIdx + NumInsElts; + if (ExtEnd <= InsIdx || InsEnd <= ExtIdx) + return DAG.getExtractSubvector(DL, NVT, Src, ExtIdx); + + if (InsIdx <= ExtIdx && ExtEnd <= InsEnd && + TLI.isExtractSubvectorCheap(NVT, InsSubVT, ExtIdx - InsIdx)) + return DAG.getExtractSubvector(DL, NVT, InsSub, ExtIdx - InsIdx); + + if (ExtIdx <= InsIdx && InsEnd <= ExtEnd && + InsSubVT.getVectorElementType() == NVT.getVectorElementType() && + (InsIdx - ExtIdx) % NumInsElts == 0 && + hasOperation(ISD::INSERT_SUBVECTOR, NVT)) { + SDValue NewExtract = DAG.getExtractSubvector(DL, NVT, Src, ExtIdx); + return 
DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NVT, NewExtract, InsSub, + DAG.getVectorIdxConstant(InsIdx - ExtIdx, DL)); + } + } } // Try to move vector bitcast after extract_subv by scaling extraction index: diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-freeze-extract-subvector-loop.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-freeze-extract-subvector-loop.ll new file mode 100644 index 0000000000000..8e929b55bc1f1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-freeze-extract-subvector-loop.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck %s + +target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +; Regression test for an infinite DAGCombine loop involving freeze sinking +; through extract_subvector users of this shuffle/select chain. +; See https://github.com/ROCm/llvm-project/issues/2616 for the original report. 
+define amdgpu_kernel void @freeze_loop(<2 x i16> %0, i1 %1) { +; CHECK-LABEL: freeze_loop: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_bitcmp1_b32 s1, 0 +; CHECK-NEXT: s_cselect_b32 s0, s0, 0x10001 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: ds_write_b32 v0, v1 +; CHECK-NEXT: s_endpgm + %3 = shufflevector <2 x i16> %0, <2 x i16> zeroinitializer, <23 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> + %4 = select i1 %1, <23 x i16> %3, <23 x i16> zeroinitializer + %5 = shufflevector <23 x i16> %4, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 23, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22> + %6 = select i1 %1, <23 x i16> zeroinitializer, <23 x i16> %5 + %7 = shufflevector <23 x i16> %6, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 24, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22> + %8 = select i1 %1, <23 x i16> zeroinitializer, <23 x i16> %7 + %9 = shufflevector <23 x i16> %8, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 23, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22> + %10 = shufflevector <23 x i16> %4, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 23, i32 poison, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22> + %11 = select i1 %1, <23 x i16> zeroinitializer, <23 
x i16> %9 + %12 = shufflevector <23 x i16> %11, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 24, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22> + %13 = select i1 %1, <23 x i16> %10, <23 x i16> %12 + %14 = shufflevector <23 x i16> %13, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 23, i32 poison, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22> + %15 = select i1 %1, <23 x i16> %14, <23 x i16> %10 + %16 = shufflevector <23 x i16> %15, <23 x i16> %3, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 23, i32 poison, i32 18, i32 19, i32 20, i32 21, i32 22> + %17 = shufflevector <23 x i16> %15, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22> + %18 = select i1 %1, <23 x i16> %16, <23 x i16> %17 + %19 = shufflevector <23 x i16> %18, <23 x i16> %3, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 23, i32 poison, i32 20, i32 21, i32 22> + %20 = shufflevector <23 x i16> %13, <23 x i16> zeroinitializer, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22> + %21 = select i1 %1, <23 x i16> %19, <23 x i16> %20 + %22 = shufflevector <23 x i16> %21, <23 x i16> %3, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 23, i32 poison, i32 22> + %23 = select i1 %1, <23 x i16> 
%22, <23 x i16> splat (i16 1) + %24 = shufflevector <23 x i16> %23, <23 x i16> zeroinitializer, <2 x i32> <i32 20, i32 21> + store <2 x i16> %24, ptr addrspace(3) null, align 2 + ret void +} diff --git a/llvm/test/CodeGen/X86/dagcombine-extract-insert.ll b/llvm/test/CodeGen/X86/dagcombine-extract-insert.ll index c0595fce4117d..11c33a0490a67 100644 --- a/llvm/test/CodeGen/X86/dagcombine-extract-insert.ll +++ b/llvm/test/CodeGen/X86/dagcombine-extract-insert.ll @@ -7,30 +7,33 @@ define void @extract_insert_interleaved_store(ptr %in.vecptr0, ptr %in.vecptr1, ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax ; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; CHECK-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13] -; CHECK-NEXT: vpshufb %ymm4, %ymm3, %ymm5 -; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; CHECK-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; CHECK-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] -; CHECK-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] -; CHECK-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; CHECK-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] -; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; CHECK-NEXT: vmovdqa %xmm0, 32(%rax) -; CHECK-NEXT: vmovdqa %ymm3, (%rax) +; CHECK-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; CHECK-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm6 = [5,13,u,u,6,14,6,14,u,u,7,15,7,15,u,u] +; CHECK-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3] +; CHECK-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm5[0] +; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] +; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] +; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; CHECK-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[2,10],zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,ymm2[20,28],zero,zero,zero,zero +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,ymm0[21,29] +; CHECK-NEXT: vpor %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2 +; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; CHECK-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = 
[65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; CHECK-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vmovdqa %ymm0, (%rax) +; CHECK-NEXT: vmovdqa %xmm1, 32(%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index 7dbff047e4f87..6967b87a47b81 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -543,9 +543,8 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX512-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512-NEXT: vmovdqa %ymm4, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -564,9 +563,8 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -588,9 +586,8 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] ; 
AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -609,9 +606,8 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index bc7ed7552e77c..80b11e572050f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -489,28 +489,27 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [0,3,7,0] -; AVX512-NEXT: vpermi2d %xmm6, %xmm7, %xmm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX512-NEXT: vpblendd {{.*#+}} 
xmm6 = xmm6[0],xmm8[1,2],xmm6[3] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] -; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa %xmm6, 32(%rax) -; AVX512-NEXT: vmovdqa %ymm0, (%rax) +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] +; AVX512-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] +; AVX512-NEXT: vpshufb %ymm7, %ymm8, %ymm7 +; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,3,7,0] +; AVX512-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX512-NEXT: vmovdqa %xmm0, 32(%rax) +; AVX512-NEXT: vmovdqa %ymm6, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -543,9 +542,8 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,5,6,3] ; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rax) -; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) +; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -558,28 +556,27 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [0,3,7,0] -; AVX512DQ-NEXT: vpermi2d %xmm6, %xmm7, %xmm8 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm8[1,2],xmm6[3] -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] -; 
AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa %xmm6, 32(%rax) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax) +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm8, %ymm7 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,3,7,0] +; 
AVX512DQ-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rax) +; AVX512DQ-NEXT: vmovdqa %ymm6, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -612,9 +609,8 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,5,6,3] ; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index cba11be4d8456..3d5a6a36377f7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -135,16 +135,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-NEXT: vmovdqa (%r9), %xmm3 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, 
64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3] +; AVX512-NEXT: vpermi2q %zmm4, %zmm0, %zmm5 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 +; AVX512-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -154,16 +156,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 +; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -173,16 +177,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-NEXT: 
vmovdqa (%r9), %xmm3 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm0, %zmm5 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512DQ-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 +; AVX512DQ-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -192,16 +198,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm5 +; 
AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512DQ-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -211,16 +219,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-NEXT: vmovdqa (%r9), %xmm3 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm5 +; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512BW-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 +; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -230,16 +240,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; 
AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512BW-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 +; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -249,16 +261,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512DQ-BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512DQ-BW-NEXT: 
vpermi2q %ymm0, %ymm1, %ymm2 +; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -268,16 +282,18 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index d6c65fa82fa6c..f78700d8cd9a1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -945,9 +945,8 @@ define void @store_i8_stride6_vf8(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa %xmm1, 32(%rax) ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) +; AVX512BW-NEXT: vmovdqa %xmm1, 32(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -982,9 +981,8 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1} -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax) ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -1019,9 +1017,8 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 32(%rax) ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 32(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -1056,9 +1053,8 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 diff --git 
a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index 76dfd019c0883..aaeab617d56c8 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -3207,47 +3207,45 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: vpmovm2w %k1, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovm2w %k0, %zmm1 ; AVX512BW-ONLY-NEXT: movl $1, %eax -; AVX512BW-ONLY-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-NEXT: vmovdqu16 %zmm0, %zmm1 {%k2} -; AVX512BW-ONLY-NEXT: vpmovw2m %zmm1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} -; AVX512BW-ONLY-NEXT: vpmovm2w %k1, %zmm2 +; AVX512BW-ONLY-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-NEXT: vpmovw2m %zmm1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} ; AVX512BW-ONLY-NEXT: vpmovsxbw {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] -; AVX512BW-ONLY-NEXT: vpermw %zmm2, %zmm3, %zmm2 -; AVX512BW-ONLY-NEXT: vpmovw2m %zmm2, %k1 +; AVX512BW-ONLY-NEXT: vpermw %zmm0, %zmm3, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovw2m %zmm0, %k1 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k2} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $48, %k0, %k1 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k0, %k1 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) ; 
AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 192(%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 256(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; ; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf16: ; AVX512VBMI-ONLY: # %bb.0: -; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k2 -; AVX512VBMI-ONLY-NEXT: vpmovm2b %k2, %zmm0 +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k0 ; AVX512VBMI-ONLY-NEXT: vpmovm2w %k0, %zmm0 -; AVX512VBMI-ONLY-NEXT: vpmovm2w %k2, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovm2w %k1, %zmm1 ; AVX512VBMI-ONLY-NEXT: movl $1, %eax ; AVX512VBMI-ONLY-NEXT: kmovd %eax, %k1 ; AVX512VBMI-ONLY-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm0, %k1 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vpmovm2w %k2, %zmm1 ; AVX512VBMI-ONLY-NEXT: vpmovsxbw {{.*#+}} zmm2 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512VBMI-ONLY-NEXT: vpermw %zmm1, %zmm2, %zmm1 ; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm1, %k2 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
