Author: Simon Pilgrim
Date: 2021-01-13T17:19:40Z
New Revision: cbbfc8258615bc971a54c6287abe33c4215d2eac
URL: https://github.com/llvm/llvm-project/commit/cbbfc8258615bc971a54c6287abe33c4215d2eac
DIFF: https://github.com/llvm/llvm-project/commit/cbbfc8258615bc971a54c6287abe33c4215d2eac.diff

LOG: [X86][SSE] canonicalizeShuffleMaskWithHorizOp - simplify shuffle(HOP(HOP(X,Y),HOP(Z,W))) style chains.

See if we can remove the shuffle by resorting a HOP chain so that the HOP args are pre-shuffled.

This initial version just handles (the most common) v4i32/v4f32 hadd/hsub reduction patterns - future work can extend this to v8i16 types plus PACK chains (2f64 HADD/HSUB should already be handled in the half-lane combine code later on).

Added: 

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/horizontal-sum.ll

Removed: 

################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 821cfc5f0c27..d45eb5366bfe 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36115,6 +36115,38 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
   int NumEltsPerLane = NumElts / NumLanes;
   int NumHalfEltsPerLane = NumEltsPerLane / 2;
 
+  // See if we can remove the shuffle by resorting the HOP chain so that
+  // the HOP args are pre-shuffled.
+  // TODO: Generalize to any sized/depth chain.
+  // TODO: Add support for PACKSS/PACKUS.
+  if (isHoriz && NumEltsPerLane == 4 && VT0.is128BitVector() &&
+      shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget)) {
+    SmallVector<int> ScaledMask;
+    if (scaleShuffleElements(Mask, 4, ScaledMask)) {
+      // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
+      auto GetHOpSrc = [&](int M) {
+        if (M == SM_SentinelUndef)
+          return DAG.getUNDEF(VT0);
+        if (M == SM_SentinelZero)
+          return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
+        SDValue Src0 = BC[M / NumElts];
+        SDValue Src1 = Src0.getOperand((M % 4) >= 2);
+        if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
+          return Src1.getOperand(M % 2);
+        return SDValue();
+      };
+      SDValue M0 = GetHOpSrc(ScaledMask[0]);
+      SDValue M1 = GetHOpSrc(ScaledMask[1]);
+      SDValue M2 = GetHOpSrc(ScaledMask[2]);
+      SDValue M3 = GetHOpSrc(ScaledMask[3]);
+      if (M0 && M1 && M2 && M3) {
+        SDValue LHS = DAG.getNode(Opcode0, DL, VT0, M0, M1);
+        SDValue RHS = DAG.getNode(Opcode0, DL, VT0, M2, M3);
+        return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
+      }
+    }
+  }
+
   if (2 < Ops.size())
     return SDValue();

diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 47d44171d99a..315e795d7a37 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -38,13 +38,9 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
-; SSSE3-FAST-NEXT: haddps %xmm1, %xmm1
 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
-; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
-; SSSE3-FAST-NEXT: haddps %xmm3, %xmm3
-; SSSE3-FAST-NEXT: haddps %xmm2, %xmm3
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[2,0]
+; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
+; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0
 ; SSSE3-FAST-NEXT: retq
 ;
 ; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32:
@@ -66,18 +62,12 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX1-SLOW-NEXT: retq
 ;
-; AVX1-FAST-LABEL: pair_sum_v4f32_v4f32:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
-; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
-; AVX1-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX1-FAST-NEXT: retq
+; AVX-FAST-LABEL: pair_sum_v4f32_v4f32:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1
+; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
 ;
 ; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32:
 ; AVX2-SLOW: # %bb.0:
@@ -97,19 +87,6 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: pair_sum_v4f32_v4f32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
-; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
-; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
-; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX2-FAST-NEXT: retq
   %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
   %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
   %7 = fadd <2 x float> %5, %6
@@ -160,13 +137,9 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm1
+; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
 ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
-; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm2
-; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm3
-; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm3
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[2,0]
+; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm0
 ; SSSE3-FAST-NEXT: retq
 ;
 ; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32:
@@ -189,20 +162,12 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; AVX1-SLOW-NEXT: retq
 ;
-; AVX1-FAST-LABEL: pair_sum_v4i32_v4i32:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
-; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-FAST-NEXT: retq
+; AVX-FAST-LABEL: pair_sum_v4i32_v4i32:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
+; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
 ;
 ; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32:
 ; AVX2-SLOW: # %bb.0:
@@ -222,19 +187,6 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: pair_sum_v4i32_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
-; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
-; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
-; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX2-FAST-NEXT: retq
   %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
   %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
   %7 = add <2 x i32> %5, %6
@@ -295,10 +247,8 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
-; SSSE3-FAST-NEXT: haddps %xmm1, %xmm1
 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
 ; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
 ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
@@ -352,9 +302,8 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ;
 ; AVX1-FAST-LABEL: pair_sum_v8f32_v4f32:
 ; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
 ; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1
 ; AVX1-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm2
 ; AVX1-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm3
@@ -366,13 +315,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3]
 ; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[1]
 ; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
+; AVX1-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-FAST-NEXT: vhaddps %xmm6, %xmm6, %xmm2
-; AVX1-FAST-NEXT: vhaddps %xmm7, %xmm7, %xmm3
-; AVX1-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
-; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX1-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
 ; AVX1-FAST-NEXT: retq
@@ -407,9 +354,8 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ;
 ; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
 ; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
 ; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
 ; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
 ; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4
 ; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
@@ -418,13 +364,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
 ; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
 ; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
-; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
+; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT: vhaddps %xmm6, %xmm6, %xmm2
-; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm7, %xmm3
-; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
+; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
 ; AVX2-FAST-NEXT: retq
@@ -507,10 +451,8 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm1
 ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
 ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
 ; SSSE3-FAST-NEXT: movdqa %xmm5, %xmm1
 ; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm5
@@ -565,10 +507,8 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ;
 ; AVX1-FAST-LABEL: pair_sum_v8i32_v4i32:
 ; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1
 ; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,1,3]
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm8
 ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1
 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
 ; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm2
@@ -586,10 +526,8 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm8[0],xmm0[0]
 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-FAST-NEXT: vphaddd %xmm6, %xmm6, %xmm2
-; AVX1-FAST-NEXT: vphaddd %xmm7, %xmm7, %xmm3
-; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX1-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm2
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm2, %xmm2
 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
 ; AVX1-FAST-NEXT: retq
@@ -624,25 +562,22 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ;
 ; AVX2-FAST-LABEL: pair_sum_v8i32_v4i32:
 ; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1
 ; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1
-; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
+; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm1
+; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm4
+; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm5
 ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
-; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
-; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
-; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vphaddd %xmm6, %xmm6, %xmm1
-; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm7, %xmm2
-; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm4[0,3]
+; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[0]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[1,3]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3]
+; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm2
+; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm2, %xmm1
 ; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT: retq

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
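
For readers skimming the check-line changes: the new three-hadd sequences in the FAST runs (e.g. haddps %xmm1, %xmm0 / haddps %xmm3, %xmm2 / haddps %xmm2, %xmm0) compute the four per-vector sums directly, with the trailing shuffle folded away by pre-shuffling the HOP arguments as described in the LOG. A minimal scalar sketch of that v4f32 case, written as standalone C++ purely for illustration (hadd() here models HADDPS pairwise-add semantics; this is not LLVM/SelectionDAG code):

#include <array>
#include <cstdio>

using V4 = std::array<float, 4>;

// Scalar model of HADDPS: result = { a0+a1, a2+a3, b0+b1, b2+b3 }.
static V4 hadd(const V4 &a, const V4 &b) {
  return {a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]};
}

int main() {
  V4 x{1, 2, 3, 4}, y{5, 6, 7, 8}, z{9, 10, 11, 12}, w{13, 14, 15, 16};
  // Pre-shuffled chain HADD(HADD(x,y), HADD(z,w)): the result is
  // { sum(x), sum(y), sum(z), sum(w) } with no trailing shuffle needed.
  V4 r = hadd(hadd(x, y), hadd(z, w));
  std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // prints: 10 26 42 58
  return 0;
}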