https://github.com/llvmbot updated https://github.com/llvm/llvm-project/pull/130215
>From 50343e517992970f62726d601e53054e588df437 Mon Sep 17 00:00:00 2001 From: aankit-ca <quic_aan...@quicinc.com> Date: Thu, 6 Mar 2025 15:02:10 -0800 Subject: [PATCH] [HEXAGON] Fix hvx-isel for extract_subvector op (#129672) Fixes a crash with extract_subvectors in Hexagon backend seen when the source vector is a vector-pair and result vector is not hvx vector size. LLVM Issue: https://github.com/llvm/llvm-project/issues/128775 Fixes #128775 --------- Co-authored-by: aankit-quic <aan...@quicinc.com> (cherry picked from commit 29d3fc3f11d272a72ac255af9277c740f26c3dfc) --- .../Target/Hexagon/HexagonISelLoweringHVX.cpp | 10 +- .../test/CodeGen/Hexagon/autohvx/fp-to-int.ll | 406 +++++++++--------- .../test/CodeGen/Hexagon/autohvx/int-to-fp.ll | 120 +++--- .../CodeGen/Hexagon/isel/extract-subvec.ll | 34 ++ 4 files changed, 302 insertions(+), 268 deletions(-) create mode 100644 llvm/test/CodeGen/Hexagon/isel/extract-subvec.ll diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 816e063f8dbbe..1a19e81a68f08 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -1265,11 +1265,15 @@ HexagonTargetLowering::extractHvxSubvectorReg(SDValue OrigOp, SDValue VecV, // the subvector of interest. The subvector will never overlap two single // vectors. if (isHvxPairTy(VecTy)) { - if (Idx * ElemWidth >= 8*HwLen) + unsigned SubIdx = Hexagon::vsub_lo; + if (Idx * ElemWidth >= 8 * HwLen) { + SubIdx = Hexagon::vsub_hi; Idx -= VecTy.getVectorNumElements() / 2; + } - VecV = OrigOp; - if (typeSplit(VecTy).first == ResTy) + VecTy = typeSplit(VecTy).first; + VecV = DAG.getTargetExtractSubreg(SubIdx, dl, VecTy, VecV); + if (VecTy == ResTy) return VecV; } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll index ac51662242de8..196b37678be61 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll @@ -13,13 +13,13 @@ define void @f16s8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(##32768,#1) ; CHECK-NEXT: r4 = #14 -; CHECK-NEXT: v1 = vmem(r0+#0) +; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vsplat(r3) ; CHECK-NEXT: r6 = #5 ; CHECK-NEXT: v3.h = vasl(v0.h,r2) -; CHECK-NEXT: v0.cur = vmem(r0+#1) +; CHECK-NEXT: v1 = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.h = vsplat(r4) @@ -33,55 +33,55 @@ define void @f16s8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #16 -; CHECK-NEXT: v5.h = vasl(v1.h,r6) -; CHECK-NEXT: q1 = vcmp.gt(v7.h,v0.h) +; CHECK-NEXT: v5.h = vasl(v0.h,r6) +; CHECK-NEXT: q1 = vcmp.gt(v7.h,v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.h = vsplat(r3) -; CHECK-NEXT: v27.h = vasr(v3.h,r5) +; CHECK-NEXT: v28.h = vasr(v3.h,r5) ; CHECK-NEXT: v5 = vor(v5,v2) -; CHECK-NEXT: q0 = vcmp.gt(v7.h,v1.h) +; CHECK-NEXT: q0 = vcmp.gt(v7.h,v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.h = vsplat(r4) ; CHECK-NEXT: v8.h = vasr(v8.h,r5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v26.h = vasl(v0.h,r6) -; CHECK-NEXT: v0.h = vsub(v4.h,v27.h) +; CHECK-NEXT: v27.h = vasl(v1.h,r6) +; CHECK-NEXT: v1.h = vsub(v4.h,v28.h) ; CHECK-NEXT: v4.h = vsub(v4.h,v8.h) -; CHECK-NEXT: v28 = vmux(q0,v2,v9) +; CHECK-NEXT: v29 = vmux(q0,v2,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v1.h = vmin(v1.h,v6.h) +; CHECK-NEXT: v0 = vor(v27,v2) ; CHECK-NEXT: v4.h = 
vmin(v4.h,v6.h) -; CHECK-NEXT: v1 = vor(v26,v2) -; CHECK-NEXT: v0.h = vmin(v0.h,v6.h) ; CHECK-NEXT: v2 = vmux(q1,v2,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: q2 = vcmp.gt(v4.h,v7.h) -; CHECK-NEXT: q3 = vcmp.gt(v0.h,v7.h) +; CHECK-NEXT: q2 = vcmp.gt(v1.h,v7.h) +; CHECK-NEXT: q3 = vcmp.gt(v4.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.h = vlsr(v5.h,v4.h) +; CHECK-NEXT: v5.h = vlsr(v5.h,v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.h = vlsr(v1.h,v0.h) -; CHECK-NEXT: v29.h = vsub(v7.h,v5.h) +; CHECK-NEXT: v0.h = vlsr(v0.h,v4.h) +; CHECK-NEXT: v30.h = vsub(v7.h,v5.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30.h = vsub(v7.h,v1.h) -; CHECK-NEXT: v5 = vmux(q0,v29,v5) +; CHECK-NEXT: v31.h = vsub(v7.h,v0.h) +; CHECK-NEXT: v5 = vmux(q0,v30,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vmux(q1,v30,v1) -; CHECK-NEXT: v31 = vmux(q2,v5,v28) +; CHECK-NEXT: v0 = vmux(q1,v31,v0) +; CHECK-NEXT: v1 = vmux(q2,v5,v29) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vmux(q3,v1,v2) +; CHECK-NEXT: v0 = vmux(q3,v0,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0.b = vpack(v1.h,v31.h):sat +; CHECK-NEXT: v0.b = vpack(v0.h,v1.h):sat ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } @@ -491,127 +491,127 @@ define void @f32s8_0(ptr %a0, ptr %a1) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r4 = ##-2147483648 ; CHECK-NEXT: r3:2 = combine(#1,#8) -; CHECK-NEXT: v5 = vmem(r0+#0) +; CHECK-NEXT: r4 = ##-2147483648 +; CHECK-NEXT: v5 = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vsplat(r4) +; CHECK-NEXT: v0 = vsplat(r4) ; CHECK-NEXT: r7 = #30 ; CHECK-NEXT: r6 = #24 -; CHECK-NEXT: v2 = vmem(r0+#2) +; CHECK-NEXT: v4 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r7) ; CHECK-NEXT: r5 = #32 -; CHECK-NEXT: v8.w = vasl(v4.w,r3) -; CHECK-NEXT: v4.cur = vmem(r0+#1) +; CHECK-NEXT: v9.w = vasl(v5.w,r3) +; CHECK-NEXT: v1 = vmem(r0+#3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v5.w,r3) -; CHECK-NEXT: v12 = vxor(v12,v12) -; CHECK-NEXT: v8.w = vsub(v8.w,v1.w) -; CHECK-NEXT: v0 = vmem(r0+#3) +; CHECK-NEXT: v8.w = vasl(v4.w,r3) +; CHECK-NEXT: v14 = vxor(v14,v14) +; CHECK-NEXT: v9.w = vsub(v9.w,v0.w) +; CHECK-NEXT: v2 = vmem(r0+#2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) -; CHECK-NEXT: v11.w = vasl(v0.w,r3) -; CHECK-NEXT: v7.w = vsub(v7.w,v1.w) -; CHECK-NEXT: q0 = vcmp.gt(v12.w,v5.w) +; CHECK-NEXT: v11.w = vasl(v2.w,r3) +; CHECK-NEXT: v8.w = vsub(v8.w,v0.w) +; CHECK-NEXT: q1 = vcmp.gt(v14.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.w = vasl(v2.w,r3) -; CHECK-NEXT: q1 = vcmp.gt(v12.w,v4.w) -; CHECK-NEXT: v11.w = vsub(v11.w,v1.w) +; CHECK-NEXT: v12.w = vasl(v1.w,r3) +; CHECK-NEXT: q0 = vcmp.gt(v14.w,v4.w) +; CHECK-NEXT: v11.w = vsub(v11.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = ##2147483647 ; CHECK-NEXT: r7 = #64 -; CHECK-NEXT: v8.w = vasr(v8.w,r6) +; CHECK-NEXT: v9.w = vasr(v9.w,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v22 = vsplat(r3) -; CHECK-NEXT: v7.w = vasr(v7.w,r6) -; CHECK-NEXT: v19.w = vsub(v9.w,v1.w) -; CHECK-NEXT: v8.w = vsub(v10.w,v8.w) +; CHECK-NEXT: v18 = vsplat(r3) +; CHECK-NEXT: v7.w = vasl(v5.w,r2) +; CHECK-NEXT: v19.w = vsub(v12.w,v0.w) +; CHECK-NEXT: v9.w = vsub(v10.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20.w = vasl(v4.w,r2) -; CHECK-NEXT: v27 = vmux(q1,v1,v22) -; CHECK-NEXT: v25 = vmux(q0,v1,v22) -; CHECK-NEXT: v7.w = vsub(v10.w,v7.w) +; CHECK-NEXT: v8.w = 
vasr(v8.w,r6) +; CHECK-NEXT: v25 = vmux(q1,v0,v18) +; CHECK-NEXT: v23 = vmux(q0,v0,v18) +; CHECK-NEXT: v9.w = vmin(v9.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vasl(v5.w,r2) +; CHECK-NEXT: v6.w = vasl(v4.w,r2) +; CHECK-NEXT: v7 = vor(v7,v0) +; CHECK-NEXT: v8.w = vsub(v10.w,v8.w) +; CHECK-NEXT: q3 = vcmp.gt(v9.w,v14.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v11.w = vasr(v11.w,r6) ; CHECK-NEXT: v8.w = vmin(v8.w,v13.w) -; CHECK-NEXT: v9 = vor(v20,v1) -; CHECK-NEXT: v21.w = vmin(v7.w,v13.w) +; CHECK-NEXT: v6 = vor(v6,v0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vasr(v19.w,r6) -; CHECK-NEXT: q3 = vcmp.gt(v8.w,v12.w) -; CHECK-NEXT: v6 = vor(v6,v1) -; CHECK-NEXT: q2 = vcmp.gt(v21.w,v12.w) +; CHECK-NEXT: v11.w = vsub(v10.w,v11.w) +; CHECK-NEXT: q2 = vcmp.gt(v8.w,v14.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.w = vasr(v11.w,r6) +; CHECK-NEXT: v3.w = vasl(v1.w,r2) ; CHECK-NEXT: v5.w = vsub(v10.w,v5.w) +; CHECK-NEXT: v21.w = vmin(v11.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.w = vasl(v2.w,r2) -; CHECK-NEXT: v10.w = vsub(v10.w,v11.w) +; CHECK-NEXT: v20.w = vasl(v2.w,r2) +; CHECK-NEXT: v3 = vor(v3,v0) ; CHECK-NEXT: v5.w = vmin(v5.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.w = vasl(v0.w,r2) -; CHECK-NEXT: v3 = vor(v3,v1) -; CHECK-NEXT: v10.w = vmin(v10.w,v13.w) +; CHECK-NEXT: v7.w = vlsr(v7.w,v9.w) +; CHECK-NEXT: v12 = vor(v20,v0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vlsr(v9.w,v8.w) -; CHECK-NEXT: v4 = vor(v23,v1) +; CHECK-NEXT: v6.w = vlsr(v6.w,v8.w) +; CHECK-NEXT: v24.w = vsub(v14.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vlsr(v6.w,v21.w) -; CHECK-NEXT: v26.w = vsub(v12.w,v8.w) +; CHECK-NEXT: v26.w = vlsr(v12.w,v21.w) +; CHECK-NEXT: v22.w = vsub(v14.w,v6.w) +; CHECK-NEXT: v7 = vmux(q1,v24,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.w = vlsr(v3.w,v5.w) -; CHECK-NEXT: v24.w = vsub(v12.w,v6.w) -; CHECK-NEXT: v8 = vmux(q1,v26,v8) +; CHECK-NEXT: v6 = vmux(q0,v22,v6) +; CHECK-NEXT: q0 = vcmp.gt(v14.w,v2.w) +; CHECK-NEXT: v27.w = vsub(v14.w,v26.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vlsr(v4.w,v10.w) -; CHECK-NEXT: v6 = vmux(q0,v24,v6) -; CHECK-NEXT: q0 = vcmp.gt(v12.w,v2.w) -; CHECK-NEXT: v28.w = vsub(v12.w,v3.w) +; CHECK-NEXT: v2 = vmux(q3,v7,v25) +; CHECK-NEXT: v29.w = vsub(v14.w,v3.w) +; CHECK-NEXT: q3 = vcmp.gt(v14.w,v1.w) +; CHECK-NEXT: v6 = vmux(q2,v6,v23) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vmux(q3,v8,v27) -; CHECK-NEXT: v29.w = vsub(v12.w,v4.w) -; CHECK-NEXT: q3 = vcmp.gt(v12.w,v0.w) -; CHECK-NEXT: v6 = vmux(q2,v6,v25) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q0,v1,v22) -; CHECK-NEXT: v3 = vmux(q0,v28,v3) -; CHECK-NEXT: q2 = vcmp.gt(v5.w,v12.w) -; CHECK-NEXT: v4 = vmux(q3,v29,v4) +; CHECK-NEXT: v30 = vmux(q0,v0,v18) +; CHECK-NEXT: v28 = vmux(q0,v27,v26) +; CHECK-NEXT: q2 = vcmp.gt(v21.w,v14.w) +; CHECK-NEXT: v3 = vmux(q3,v29,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vpack(v2.w,v6.w):sat -; CHECK-NEXT: v1 = vmux(q3,v1,v22) -; CHECK-NEXT: q3 = vcmp.gt(v10.w,v12.w) -; CHECK-NEXT: v0 = vmux(q2,v3,v30) +; CHECK-NEXT: v0 = vmux(q3,v0,v18) +; CHECK-NEXT: q3 = vcmp.gt(v5.w,v14.w) +; CHECK-NEXT: v1 = vmux(q2,v28,v30) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vmux(q3,v4,v1) +; CHECK-NEXT: v0 = vmux(q3,v3,v0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.h = vpack(v1.w,v0.w):sat ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0.h = vpack(v1.w,v0.w):sat +; CHECK-NEXT: v0.h = vpack(v0.w,v1.w):sat ; 
CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31.b = vpack(v3.h,v2.h):sat @@ -638,13 +638,13 @@ define void @f32s8_1(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(##-2147483648,#8) ; CHECK-NEXT: r4 = #1 -; CHECK-NEXT: v1 = vmem(r0+#0) +; CHECK-NEXT: v1 = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3 = vsplat(r3) ; CHECK-NEXT: r5 = #30 ; CHECK-NEXT: v4.w = vasl(v0.w,r4) -; CHECK-NEXT: v0.cur = vmem(r0+#1) +; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vasl(v1.w,r4) @@ -653,64 +653,64 @@ define void @f32s8_1(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: r4 = #32 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6 = vsplat(r5) -; CHECK-NEXT: v7 = vsplat(r4) +; CHECK-NEXT: v7 = vsplat(r5) +; CHECK-NEXT: v8 = vsplat(r4) ; CHECK-NEXT: v2.w = vasl(v1.w,r2) ; CHECK-NEXT: v5.w = vsub(v5.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vasr(v4.w,r6) -; CHECK-NEXT: v26 = vxor(v26,v26) +; CHECK-NEXT: v27 = vxor(v27,v27) ; CHECK-NEXT: v2 = vor(v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = ##2147483647 ; CHECK-NEXT: v5.w = vasr(v5.w,r6) -; CHECK-NEXT: q0 = vcmp.gt(v26.w,v1.w) +; CHECK-NEXT: q0 = vcmp.gt(v27.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27 = vsplat(r3) -; CHECK-NEXT: v4.w = vsub(v6.w,v4.w) -; CHECK-NEXT: q2 = vcmp.gt(v26.w,v0.w) -; CHECK-NEXT: v5.w = vsub(v6.w,v5.w) +; CHECK-NEXT: v28 = vsplat(r3) +; CHECK-NEXT: v6.w = vasl(v0.w,r2) +; CHECK-NEXT: v4.w = vsub(v7.w,v4.w) +; CHECK-NEXT: q2 = vcmp.gt(v27.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vasl(v0.w,r2) -; CHECK-NEXT: v4.w = vmin(v4.w,v7.w) -; CHECK-NEXT: v30 = vmux(q0,v3,v27) -; CHECK-NEXT: v5.w = vmin(v5.w,v7.w) +; CHECK-NEXT: v5.w = vsub(v7.w,v5.w) +; CHECK-NEXT: v4.w = vmin(v4.w,v8.w) +; CHECK-NEXT: v31 = vmux(q0,v3,v28) +; CHECK-NEXT: v6 = vor(v6,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v25 = vor(v8,v3) -; CHECK-NEXT: v1 = vmux(q2,v3,v27) -; CHECK-NEXT: q3 = vcmp.gt(v4.w,v26.w) -; CHECK-NEXT: q1 = vcmp.gt(v5.w,v26.w) +; CHECK-NEXT: v5.w = vmin(v5.w,v8.w) +; CHECK-NEXT: q1 = vcmp.gt(v4.w,v27.w) +; CHECK-NEXT: v0 = vmux(q2,v3,v28) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r2 = #64 -; CHECK-NEXT: v2.w = vlsr(v2.w,v5.w) +; CHECK-NEXT: v6.w = vlsr(v6.w,v4.w) +; CHECK-NEXT: q3 = vcmp.gt(v5.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28.w = vlsr(v25.w,v4.w) -; CHECK-NEXT: v29.w = vsub(v26.w,v2.w) +; CHECK-NEXT: v2.w = vlsr(v2.w,v5.w) +; CHECK-NEXT: v29.w = vsub(v27.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vsub(v26.w,v28.w) -; CHECK-NEXT: v0 = vmux(q0,v29,v2) +; CHECK-NEXT: v30.w = vsub(v27.w,v2.w) +; CHECK-NEXT: v1 = vmux(q0,v29,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v31 = vmux(q2,v6,v28) -; CHECK-NEXT: v0 = vmux(q1,v0,v30) +; CHECK-NEXT: v2 = vmux(q2,v30,v2) +; CHECK-NEXT: v1 = vmux(q1,v1,v31) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: q3 = vsetq(r2) -; CHECK-NEXT: v1 = vmux(q3,v31,v1) +; CHECK-NEXT: v0 = vmux(q3,v2,v0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vpack(v1.w,v0.w):sat ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0.h = vpack(v1.w,v0.w):sat +; CHECK-NEXT: v0.h = vpack(v0.w,v1.w):sat ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.b = vpack(v2.h,v0.h):sat @@ -808,13 +808,13 @@ define void @f32s16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(##-2147483648,#1) ; CHECK-NEXT: r4 = #30 -; CHECK-NEXT: v1 = vmem(r0+#0) +; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vsplat(r3) 
; CHECK-NEXT: r6 = #8 ; CHECK-NEXT: v3.w = vasl(v0.w,r2) -; CHECK-NEXT: v0.cur = vmem(r0+#1) +; CHECK-NEXT: v1 = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4 = vsplat(r4) @@ -828,55 +828,55 @@ define void @f32s16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #32 -; CHECK-NEXT: v5.w = vasl(v1.w,r6) -; CHECK-NEXT: q1 = vcmp.gt(v7.w,v0.w) +; CHECK-NEXT: v5.w = vasl(v0.w,r6) +; CHECK-NEXT: q1 = vcmp.gt(v7.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6 = vsplat(r3) -; CHECK-NEXT: v27.w = vasr(v3.w,r5) +; CHECK-NEXT: v28.w = vasr(v3.w,r5) ; CHECK-NEXT: v5 = vor(v5,v2) -; CHECK-NEXT: q0 = vcmp.gt(v7.w,v1.w) +; CHECK-NEXT: q0 = vcmp.gt(v7.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9 = vsplat(r4) ; CHECK-NEXT: v8.w = vasr(v8.w,r5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v26.w = vasl(v0.w,r6) -; CHECK-NEXT: v0.w = vsub(v4.w,v27.w) +; CHECK-NEXT: v27.w = vasl(v1.w,r6) +; CHECK-NEXT: v1.w = vsub(v4.w,v28.w) ; CHECK-NEXT: v4.w = vsub(v4.w,v8.w) -; CHECK-NEXT: v28 = vmux(q0,v2,v9) +; CHECK-NEXT: v29 = vmux(q0,v2,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vmin(v1.w,v6.w) +; CHECK-NEXT: v0 = vor(v27,v2) ; CHECK-NEXT: v4.w = vmin(v4.w,v6.w) -; CHECK-NEXT: v1 = vor(v26,v2) -; CHECK-NEXT: v0.w = vmin(v0.w,v6.w) ; CHECK-NEXT: v2 = vmux(q1,v2,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: q2 = vcmp.gt(v4.w,v7.w) -; CHECK-NEXT: q3 = vcmp.gt(v0.w,v7.w) +; CHECK-NEXT: q2 = vcmp.gt(v1.w,v7.w) +; CHECK-NEXT: q3 = vcmp.gt(v4.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vlsr(v5.w,v4.w) +; CHECK-NEXT: v5.w = vlsr(v5.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.w = vlsr(v1.w,v0.w) -; CHECK-NEXT: v29.w = vsub(v7.w,v5.w) +; CHECK-NEXT: v0.w = vlsr(v0.w,v4.w) +; CHECK-NEXT: v30.w = vsub(v7.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30.w = vsub(v7.w,v1.w) -; CHECK-NEXT: v5 = vmux(q0,v29,v5) +; CHECK-NEXT: v31.w = vsub(v7.w,v0.w) +; CHECK-NEXT: v5 = vmux(q0,v30,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vmux(q1,v30,v1) -; CHECK-NEXT: v31 = vmux(q2,v5,v28) +; CHECK-NEXT: v0 = vmux(q1,v31,v0) +; CHECK-NEXT: v1 = vmux(q2,v5,v29) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vmux(q3,v1,v2) +; CHECK-NEXT: v0 = vmux(q3,v0,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0.h = vpack(v1.w,v31.w):sat +; CHECK-NEXT: v0.h = vpack(v0.w,v1.w):sat ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } @@ -1097,13 +1097,13 @@ define void @f16u8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(##32768,#1) ; CHECK-NEXT: r4 = #14 -; CHECK-NEXT: v0 = vmem(r0+#1) +; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.h = vsplat(r3) ; CHECK-NEXT: r7:6 = combine(#11,#16) ; CHECK-NEXT: v3.h = vasl(v0.h,r2) -; CHECK-NEXT: v1 = vmem(r0+#0) +; CHECK-NEXT: v1 = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.h = vsplat(r4) @@ -1113,7 +1113,7 @@ define void @f16u8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.h = vsplat(r6) -; CHECK-NEXT: v5.h = vasl(v1.h,r5) +; CHECK-NEXT: v5.h = vasl(v0.h,r5) ; CHECK-NEXT: v4.h = vsub(v4.h,v2.h) ; CHECK-NEXT: v28 = vxor(v28,v28) ; CHECK-NEXT: } @@ -1125,28 +1125,26 @@ define void @f16u8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v29.h = vsplat(r2) ; CHECK-NEXT: v4.h = vasr(v4.h,r7) -; CHECK-NEXT: q2 = vcmp.gt(v28.h,v1.h) +; CHECK-NEXT: q2 = vcmp.gt(v28.h,v0.h) ; CHECK-NEXT: v3.h = vsub(v6.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: 
v8.h = vasl(v0.h,r5) -; CHECK-NEXT: q3 = vcmp.gt(v28.h,v0.h) +; CHECK-NEXT: v8.h = vasl(v1.h,r5) +; CHECK-NEXT: q3 = vcmp.gt(v28.h,v1.h) ; CHECK-NEXT: v4.h = vsub(v6.h,v4.h) ; CHECK-NEXT: v3.h = vmin(v3.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.h = vmin(v4.h,v7.h) ; CHECK-NEXT: v2 = vor(v8,v2) -; CHECK-NEXT: q1 = vcmp.gt(v28.h,v3.h) +; CHECK-NEXT: q0 = vcmp.gt(v28.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: q0 = vcmp.gt(v28.h,v4.h) +; CHECK-NEXT: v5.h = vlsr(v5.h,v3.h) +; CHECK-NEXT: q1 = vcmp.gt(v28.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.h = vlsr(v5.h,v4.h) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vlsr(v2.h,v3.h) +; CHECK-NEXT: v2.h = vlsr(v2.h,v4.h) ; CHECK-NEXT: v30 = vmux(q0,v29,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -1552,7 +1550,7 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v5 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3 = vsplat(r4) +; CHECK-NEXT: v4 = vsplat(r4) ; CHECK-NEXT: r5 = #30 ; CHECK-NEXT: r6 = #24 ; CHECK-NEXT: v2 = vmem(r0+#1) @@ -1561,32 +1559,32 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v14 = vsplat(r5) ; CHECK-NEXT: r4 = #32 ; CHECK-NEXT: v8.w = vasl(v5.w,r2) -; CHECK-NEXT: v0 = vmem(r0+#3) +; CHECK-NEXT: v0 = vmem(r0+#2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.w = vasl(v2.w,r2) ; CHECK-NEXT: v13 = vxor(v13,v13) -; CHECK-NEXT: v8.w = vsub(v8.w,v3.w) -; CHECK-NEXT: v1 = vmem(r0+#2) +; CHECK-NEXT: v8.w = vsub(v8.w,v4.w) +; CHECK-NEXT: v1 = vmem(r0+#3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20 = vsplat(r4) -; CHECK-NEXT: v12.w = vasl(v0.w,r2) -; CHECK-NEXT: v9.w = vsub(v9.w,v3.w) +; CHECK-NEXT: v21 = vsplat(r4) +; CHECK-NEXT: v11.w = vasl(v0.w,r2) +; CHECK-NEXT: v9.w = vsub(v9.w,v4.w) ; CHECK-NEXT: q0 = vcmp.gt(v13.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.w = vasl(v1.w,r2) +; CHECK-NEXT: v12.w = vasl(v1.w,r2) ; CHECK-NEXT: q3 = vcmp.gt(v13.w,v2.w) -; CHECK-NEXT: v12.w = vsub(v12.w,v3.w) +; CHECK-NEXT: v11.w = vsub(v11.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r2 = ##2147483647 ; CHECK-NEXT: r7 = #64 -; CHECK-NEXT: v11.w = vsub(v11.w,v3.w) +; CHECK-NEXT: v12.w = vsub(v12.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v22 = vsplat(r2) +; CHECK-NEXT: v23 = vsplat(r2) ; CHECK-NEXT: v8.w = vasr(v8.w,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -1596,68 +1594,68 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v5.w,r3) ; CHECK-NEXT: v9.w = vsub(v14.w,v9.w) -; CHECK-NEXT: v8.w = vmin(v8.w,v20.w) +; CHECK-NEXT: v8.w = vmin(v8.w,v21.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.w = vasl(v2.w,r3) -; CHECK-NEXT: v6 = vor(v6,v3) -; CHECK-NEXT: v9.w = vmin(v9.w,v20.w) +; CHECK-NEXT: v6 = vor(v6,v4) +; CHECK-NEXT: v9.w = vmin(v9.w,v21.w) ; CHECK-NEXT: q1 = vcmp.gt(v13.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v19.w = vasr(v11.w,r6) -; CHECK-NEXT: v7 = vor(v7,v3) +; CHECK-NEXT: v20.w = vasr(v11.w,r6) +; CHECK-NEXT: v7 = vor(v7,v4) ; CHECK-NEXT: q2 = vcmp.gt(v13.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.w = vasr(v12.w,r6) -; CHECK-NEXT: v5.w = vsub(v14.w,v19.w) +; CHECK-NEXT: v5.w = vsub(v14.w,v20.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vasl(v1.w,r3) -; CHECK-NEXT: v21.w = vsub(v14.w,v12.w) -; CHECK-NEXT: v5.w = vmin(v5.w,v20.w) +; CHECK-NEXT: v3.w = vasl(v1.w,r3) +; CHECK-NEXT: v22.w = vsub(v14.w,v12.w) +; CHECK-NEXT: v5.w = vmin(v5.w,v21.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10.w = vasl(v0.w,r3) -; CHECK-NEXT: v4 = 
vor(v4,v3) +; CHECK-NEXT: v3 = vor(v3,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vlsr(v6.w,v8.w) -; CHECK-NEXT: v3 = vor(v10,v3) -; CHECK-NEXT: v10.w = vmin(v21.w,v20.w) +; CHECK-NEXT: v10 = vor(v10,v4) +; CHECK-NEXT: v4.w = vmin(v22.w,v21.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.w = vlsr(v7.w,v9.w) -; CHECK-NEXT: v24 = vmux(q1,v22,v6) +; CHECK-NEXT: v6 = vmux(q1,v23,v6) ; CHECK-NEXT: q1 = vcmp.gt(v13.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.w = vlsr(v4.w,v5.w) -; CHECK-NEXT: v25 = vmux(q2,v22,v7) -; CHECK-NEXT: q2 = vcmp.gt(v13.w,v10.w) -; CHECK-NEXT: v4 = vmux(q0,v13,v24) +; CHECK-NEXT: v24.w = vlsr(v10.w,v5.w) +; CHECK-NEXT: v7 = vmux(q2,v23,v7) +; CHECK-NEXT: q2 = vcmp.gt(v13.w,v4.w) +; CHECK-NEXT: v25 = vmux(q0,v13,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.w = vlsr(v3.w,v10.w) -; CHECK-NEXT: v26 = vmux(q3,v13,v25) -; CHECK-NEXT: v2 = vmux(q1,v22,v23) -; CHECK-NEXT: q1 = vcmp.gt(v13.w,v1.w) +; CHECK-NEXT: v3.w = vlsr(v3.w,v4.w) +; CHECK-NEXT: v26 = vmux(q3,v13,v7) +; CHECK-NEXT: v2 = vmux(q1,v23,v24) +; CHECK-NEXT: q1 = vcmp.gt(v13.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27 = vmux(q2,v22,v3) -; CHECK-NEXT: q3 = vcmp.gt(v13.w,v0.w) +; CHECK-NEXT: v27 = vmux(q2,v23,v3) +; CHECK-NEXT: q3 = vcmp.gt(v13.w,v1.w) ; CHECK-NEXT: v28 = vmux(q1,v13,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uh = vpack(v26.w,v4.w):sat -; CHECK-NEXT: v1 = vmux(q3,v13,v27) +; CHECK-NEXT: v29.uh = vpack(v26.w,v25.w):sat +; CHECK-NEXT: v0 = vmux(q3,v13,v27) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30.uh = vpack(v1.w,v28.w):sat +; CHECK-NEXT: v30.uh = vpack(v28.w,v0.w):sat ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0.uh = vpack(v1.w,v28.w):sat +; CHECK-NEXT: v0.uh = vpack(v0.w,v28.w):sat ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31.ub = vpack(v30.h,v29.h):sat @@ -1684,13 +1682,13 @@ define void @f32u8_1(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(##-2147483648,#1) ; CHECK-NEXT: r4 = #30 -; CHECK-NEXT: v0 = vmem(r0+#1) +; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vsplat(r3) ; CHECK-NEXT: r7:6 = combine(#24,#32) ; CHECK-NEXT: v3.w = vasl(v0.w,r2) -; CHECK-NEXT: v1 = vmem(r0+#0) +; CHECK-NEXT: v1 = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6 = vsplat(r4) @@ -1700,7 +1698,7 @@ define void @f32u8_1(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r6) -; CHECK-NEXT: v5.w = vasl(v1.w,r5) +; CHECK-NEXT: v5.w = vasl(v0.w,r5) ; CHECK-NEXT: v4.w = vsub(v4.w,v2.w) ; CHECK-NEXT: v27 = vxor(v27,v27) ; CHECK-NEXT: } @@ -1712,13 +1710,13 @@ define void @f32u8_1(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v28 = vsplat(r3) ; CHECK-NEXT: v4.w = vasr(v4.w,r7) -; CHECK-NEXT: q2 = vcmp.gt(v27.w,v1.w) +; CHECK-NEXT: q2 = vcmp.gt(v27.w,v0.w) ; CHECK-NEXT: v3.w = vsub(v6.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r2 = #64 -; CHECK-NEXT: v8.w = vasl(v0.w,r5) -; CHECK-NEXT: q3 = vcmp.gt(v27.w,v0.w) +; CHECK-NEXT: v8.w = vasl(v1.w,r5) +; CHECK-NEXT: q3 = vcmp.gt(v27.w,v1.w) ; CHECK-NEXT: v4.w = vsub(v6.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -1727,14 +1725,14 @@ define void @f32u8_1(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v2 = vor(v8,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: q1 = vcmp.gt(v27.w,v3.w) -; CHECK-NEXT: q0 = vcmp.gt(v27.w,v4.w) +; CHECK-NEXT: q0 = vcmp.gt(v27.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vlsr(v5.w,v4.w) +; CHECK-NEXT: v5.w = vlsr(v5.w,v3.w) +; CHECK-NEXT: q1 = 
vcmp.gt(v27.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w) +; CHECK-NEXT: v2.w = vlsr(v2.w,v4.w) ; CHECK-NEXT: v29 = vmux(q0,v28,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -1843,13 +1841,13 @@ define void @f32u16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(##-2147483648,#1) ; CHECK-NEXT: r4 = #30 -; CHECK-NEXT: v0 = vmem(r0+#1) +; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vsplat(r3) ; CHECK-NEXT: r7:6 = combine(#24,#32) ; CHECK-NEXT: v3.w = vasl(v0.w,r2) -; CHECK-NEXT: v1 = vmem(r0+#0) +; CHECK-NEXT: v1 = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6 = vsplat(r4) @@ -1859,7 +1857,7 @@ define void @f32u16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r6) -; CHECK-NEXT: v5.w = vasl(v1.w,r5) +; CHECK-NEXT: v5.w = vasl(v0.w,r5) ; CHECK-NEXT: v4.w = vsub(v4.w,v2.w) ; CHECK-NEXT: v28 = vxor(v28,v28) ; CHECK-NEXT: } @@ -1871,28 +1869,26 @@ define void @f32u16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v29 = vsplat(r2) ; CHECK-NEXT: v4.w = vasr(v4.w,r7) -; CHECK-NEXT: q2 = vcmp.gt(v28.w,v1.w) +; CHECK-NEXT: q2 = vcmp.gt(v28.w,v0.w) ; CHECK-NEXT: v3.w = vsub(v6.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vasl(v0.w,r5) -; CHECK-NEXT: q3 = vcmp.gt(v28.w,v0.w) +; CHECK-NEXT: v8.w = vasl(v1.w,r5) +; CHECK-NEXT: q3 = vcmp.gt(v28.w,v1.w) ; CHECK-NEXT: v4.w = vsub(v6.w,v4.w) ; CHECK-NEXT: v3.w = vmin(v3.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vmin(v4.w,v7.w) ; CHECK-NEXT: v2 = vor(v8,v2) -; CHECK-NEXT: q1 = vcmp.gt(v28.w,v3.w) +; CHECK-NEXT: q0 = vcmp.gt(v28.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: q0 = vcmp.gt(v28.w,v4.w) +; CHECK-NEXT: v5.w = vlsr(v5.w,v3.w) +; CHECK-NEXT: q1 = vcmp.gt(v28.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vlsr(v5.w,v4.w) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w) +; CHECK-NEXT: v2.w = vlsr(v2.w,v4.w) ; CHECK-NEXT: v30 = vmux(q0,v29,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { diff --git a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll index c0e38b9243033..c3308ec193995 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll @@ -1042,7 +1042,7 @@ define void @s32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#8,#1) ; CHECK-NEXT: r6 = #255 -; CHECK-NEXT: v6.w = vabs(v1.w) +; CHECK-NEXT: v4.w = vabs(v1.w) ; CHECK-NEXT: v1.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -1054,102 +1054,102 @@ define void @s32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v9 = vsplat(r4) ; CHECK-NEXT: v8 = vsplat(r6) -; CHECK-NEXT: v3.uw = vcl0(v6.uw) -; CHECK-NEXT: v20 = vxor(v20,v20) +; CHECK-NEXT: r4 = #159 +; CHECK-NEXT: v3.uw = vcl0(v4.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r4 = #159 -; CHECK-NEXT: v4.uw = vcl0(v5.uw) -; CHECK-NEXT: v3.w = vadd(v3.w,v2.w) +; CHECK-NEXT: v6.uw = vcl0(v5.uw) +; CHECK-NEXT: v7.w = vadd(v3.w,v2.w) +; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27 = vsplat(r4) +; CHECK-NEXT: v26 = vsplat(r4) ; CHECK-NEXT: r5 = ##-2147483648 -; CHECK-NEXT: v7.w = vadd(v4.w,v2.w) +; CHECK-NEXT: v6.w = vadd(v6.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) -; CHECK-NEXT: v6.w = vasl(v6.w,v3.w) -; CHECK-NEXT: q0 = vcmp.gt(v20.w,v1.w) +; CHECK-NEXT: v4.w = vasl(v4.w,v7.w) +; CHECK-NEXT: q0 = vcmp.gt(v3.w,v1.w) ; CHECK-NEXT: } ; 
CHECK-NEXT: { -; CHECK-NEXT: v5.w = vasl(v5.w,v7.w) -; CHECK-NEXT: v26 = vmux(q0,v13,v20) -; CHECK-NEXT: v10.w = vadd(v6.w,v8.w) -; CHECK-NEXT: v11 = vand(v6,v9) +; CHECK-NEXT: v5.w = vasl(v5.w,v6.w) +; CHECK-NEXT: v25 = vmux(q0,v13,v3) +; CHECK-NEXT: v10.w = vadd(v4.w,v8.w) +; CHECK-NEXT: v11 = vand(v4,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9 = vand(v5,v9) -; CHECK-NEXT: q3 = vcmp.eq(v11.w,v20.w) +; CHECK-NEXT: q3 = vcmp.eq(v11.w,v3.w) ; CHECK-NEXT: v8.w = vadd(v5.w,v8.w) -; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v10.uw) +; CHECK-NEXT: q1 = vcmp.gt(v4.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v10.uw,r3) -; CHECK-NEXT: q2 = vcmp.eq(v9.w,v20.w) -; CHECK-NEXT: v22 = vmux(q3,v20,v2) +; CHECK-NEXT: v20.uw = vlsr(v10.uw,r3) +; CHECK-NEXT: q2 = vcmp.eq(v9.w,v3.w) +; CHECK-NEXT: v21 = vmux(q3,v3,v2) ; CHECK-NEXT: q3 = vcmp.gt(v5.uw,v8.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.uw = vlsr(v8.uw,r3) -; CHECK-NEXT: v9.w = vadd(v21.w,v22.w) -; CHECK-NEXT: v24 = vmux(q2,v20,v2) -; CHECK-NEXT: v23 = vmux(q1,v2,v20) +; CHECK-NEXT: v9.w = vadd(v20.w,v21.w) +; CHECK-NEXT: v23 = vmux(q2,v3,v2) +; CHECK-NEXT: v22 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uw = vlsr(v6.uw,r3) -; CHECK-NEXT: v2 = vmux(q3,v2,v20) -; CHECK-NEXT: v25.w = vadd(v8.w,v24.w) -; CHECK-NEXT: v3.w = vsub(v23.w,v3.w) +; CHECK-NEXT: v12.uw = vlsr(v4.uw,r3) +; CHECK-NEXT: v2 = vmux(q3,v2,v3) +; CHECK-NEXT: v24.w = vadd(v8.w,v23.w) +; CHECK-NEXT: v7.w = vsub(v22.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r3) -; CHECK-NEXT: v2.w = vsub(v2.w,v7.w) -; CHECK-NEXT: q3 = vcmp.eq(v12.w,v21.w) -; CHECK-NEXT: v3.w = vadd(v3.w,v27.w) +; CHECK-NEXT: v2.w = vsub(v2.w,v6.w) +; CHECK-NEXT: q3 = vcmp.eq(v12.w,v20.w) +; CHECK-NEXT: v7.w = vadd(v7.w,v26.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #23 -; CHECK-NEXT: v6.uw = vlsr(v21.uw,r2) +; CHECK-NEXT: v4.uw = vlsr(v20.uw,r2) ; CHECK-NEXT: q2 = vcmp.eq(v5.w,v8.w) -; CHECK-NEXT: v2.w = vadd(v2.w,v27.w) +; CHECK-NEXT: v2.w = vadd(v2.w,v26.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28.uw = vlsr(v25.uw,r2) -; CHECK-NEXT: v6 = vmux(q3,v9,v6) -; CHECK-NEXT: q3 = vcmp.gt(v20.w,v0.w) +; CHECK-NEXT: v27.uw = vlsr(v24.uw,r2) +; CHECK-NEXT: v4 = vmux(q3,v9,v4) +; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uw = vlsr(v8.uw,r2) -; CHECK-NEXT: v30 = vmux(q3,v13,v20) -; CHECK-NEXT: v6 = vor(v26,v6) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v20.w) +; CHECK-NEXT: v28.uw = vlsr(v8.uw,r2) +; CHECK-NEXT: v30 = vmux(q3,v13,v3) +; CHECK-NEXT: v4 = vor(v25,v4) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.w = vasl(v3.w,r3) -; CHECK-NEXT: v5 = vmux(q2,v28,v29) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v20.w) +; CHECK-NEXT: v29.w = vasl(v7.w,r3) +; CHECK-NEXT: v5 = vmux(q2,v27,v28) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.w = vasl(v2.w,r3) ; CHECK-NEXT: v31 = vor(v30,v5) -; CHECK-NEXT: v3 = vor(v6,v3) +; CHECK-NEXT: v4 = vor(v4,v29) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vor(v31,v2) -; CHECK-NEXT: v3 = vmux(q2,v20,v3) +; CHECK-NEXT: v4 = vmux(q2,v3,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmux(q3,v20,v1) +; CHECK-NEXT: v0 = vmux(q3,v3,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v20.sf) +; CHECK-NEXT: v2.qf32 = vadd(v4.sf,v3.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.qf32 
= vadd(v0.sf,v20.sf) +; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v3.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.hf = v3:2.qf32 @@ -2369,20 +2369,20 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#8,#1) ; CHECK-NEXT: r6 = #255 -; CHECK-NEXT: v3.uw = vcl0(v0.uw) -; CHECK-NEXT: v0.cur = vmem(r0+#1) +; CHECK-NEXT: v3.uw = vcl0(v1.uw) +; CHECK-NEXT: v1.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2 = vsplat(r2) ; CHECK-NEXT: r4 = #512 -; CHECK-NEXT: v4.uw = vcl0(v1.uw) -; CHECK-NEXT: v1.cur = vmem(r0+#0) +; CHECK-NEXT: v4.uw = vcl0(v0.uw) +; CHECK-NEXT: v0.cur = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r4) ; CHECK-NEXT: v6 = vsplat(r6) -; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) ; CHECK-NEXT: v3.w = vadd(v3.w,v2.w) +; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #159 @@ -2390,10 +2390,10 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r4) -; CHECK-NEXT: v5.w = vasl(v1.w,v4.w) +; CHECK-NEXT: v5.w = vasl(v1.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vasl(v0.w,v3.w) +; CHECK-NEXT: v8.w = vasl(v0.w,v4.w) ; CHECK-NEXT: v11.w = vadd(v5.w,v6.w) ; CHECK-NEXT: v13 = vand(v5,v7) ; CHECK-NEXT: } @@ -2416,15 +2416,15 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v2 = vmux(q0,v9,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vsub(v29.w,v4.w) +; CHECK-NEXT: v3.w = vsub(v29.w,v3.w) ; CHECK-NEXT: v7.w = vadd(v27.w,v28.w) -; CHECK-NEXT: v3.w = vsub(v30.w,v3.w) +; CHECK-NEXT: v4.w = vsub(v30.w,v4.w) ; CHECK-NEXT: v2.w = vadd(v6.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v5.uw,r3) -; CHECK-NEXT: v4.w = vadd(v4.w,v10.w) ; CHECK-NEXT: v3.w = vadd(v3.w,v10.w) +; CHECK-NEXT: v4.w = vadd(v4.w,v10.w) ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -2448,16 +2448,16 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vasl(v4.w,r3) +; CHECK-NEXT: v3.w = vasl(v3.w,r3) ; CHECK-NEXT: v31 = vmux(q1,v2,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v3.w,r3) -; CHECK-NEXT: v4 = vor(v5,v4) +; CHECK-NEXT: v2.w = vasl(v4.w,r3) +; CHECK-NEXT: v3 = vor(v5,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vor(v31,v2) -; CHECK-NEXT: v3 = vmux(q2,v9,v4) +; CHECK-NEXT: v3 = vmux(q2,v9,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v9,v1) diff --git a/llvm/test/CodeGen/Hexagon/isel/extract-subvec.ll b/llvm/test/CodeGen/Hexagon/isel/extract-subvec.ll new file mode 100644 index 0000000000000..f7262eacabe8e --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/isel/extract-subvec.ll @@ -0,0 +1,34 @@ +; Check if extract_subvectors is handled properly in Hexagon backend when the +; the source vector is a vector-pair and result vector is not hvx vector size. +; https://github.com/llvm/llvm-project/issues/128775 +; +; Example of such a case: +; ... +; t2: v64i32,ch = CopyFromReg t0, Register:v64i32 %0 +; t17: v2i32 = extract_subvector t2, Constant:i32<4> +; ... 
+
+; RUN: llc -mtriple=hexagon -mattr="hvx-length128b" < %s | FileCheck %s
+
+; CHECK-LABEL: extract_subvec:
+; CHECK: r29 = and(r29,#-128)
+; CHECK: [[R1:r([0-9]+)]] = add(r29,#0)
+; CHECK: vmem([[R1]]+#0) = v0
+; CHECK-DAG: r[[R4:[0-9]+]] = memw([[R1]]+#0)
+; CHECK-DAG: r[[R5:[0-9]+]] = memw([[R1]]+#4)
+; CHECK-DAG: r[[R6:[0-9]+]] = memw([[R1]]+#8)
+; CHECK-DAG: r[[R7:[0-9]+]] = memw([[R1]]+#12)
+; CHECK-DAG: r[[R8:[0-9]+]] = memw([[R1]]+#16)
+; CHECK-DAG: r[[R9:[0-9]+]] = memw([[R1]]+#20)
+; CHECK-DAG: r[[R2:[0-9]+]] = memw([[R1]]+#24)
+; CHECK-DAG: r[[R3:[0-9]+]] = memw([[R1]]+#28)
+; CHECK-DAG: memd(r0+#0) = r[[R5]]:[[R4]]
+; CHECK-DAG: memd(r0+#8) = r[[R7]]:[[R6]]
+; CHECK-DAG: memd(r0+#16) = r[[R9]]:[[R8]]
+; CHECK-DAG: memw(r0+#24) = r[[R2]]
+define void @extract_subvec(<56 x i32> %val, ptr %buf) {
+entry:
+  %split = shufflevector <56 x i32> %val, <56 x i32> zeroinitializer, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
+  store <7 x i32> %split, ptr %buf, align 32
+  ret void
+}

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits