https://github.com/zhaoqi5 updated https://github.com/llvm/llvm-project/pull/151634
>From f759464ee797830c998d66d1076d98933336c5a1 Mon Sep 17 00:00:00 2001 From: Qi Zhao <zhaoq...@loongson.cn> Date: Fri, 1 Aug 2025 11:30:19 +0800 Subject: [PATCH 1/2] [LoongArch] Use xvperm.w for cross-lane access within a single vector --- .../LoongArch/LoongArchISelLowering.cpp | 44 +++++++++++++++++++ .../lasx/shuffle-as-permute-and-shuffle.ll | 18 ++------ 2 files changed, 48 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 597650c8229a7..6aa848ca7bd07 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -1832,6 +1832,48 @@ static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG); } +/// Lower VECTOR_SHUFFLE into XVPERM (if possible). +static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + SelectionDAG &DAG) { + // LoongArch LASX only has XVPERM_W. + if (Mask.size() != 8 || (VT != MVT::v8i32 && VT != MVT::v8f32)) + return SDValue(); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfSize = NumElts / 2; + bool FrontLo = true, FrontHi = true; + bool BackLo = true, BackHi = true; + + auto inRange = [](int val, int low, int high) { + return (val == -1) || (val >= low && val < high); + }; + + for (unsigned i = 0; i < HalfSize; ++i) { + int Fronti = Mask[i]; + int Backi = Mask[i + HalfSize]; + + FrontLo &= inRange(Fronti, 0, HalfSize); + FrontHi &= inRange(Fronti, HalfSize, NumElts); + BackLo &= inRange(Backi, 0, HalfSize); + BackHi &= inRange(Backi, HalfSize, NumElts); + } + + // If both the lower and upper 128-bit parts access only one half of the + // vector (either lower or upper), avoid using xvperm.w. The latency of + // xvperm.w(3) is higher than using xvshuf(1) and xvori(1). 
+ if ((FrontLo && (BackLo || BackHi)) || (FrontHi && (BackLo || BackHi))) + return SDValue(); + + SmallVector<SDValue, 8> Masks; + for (unsigned i = 0; i < NumElts; ++i) + Masks.push_back(Mask[i] == -1 ? DAG.getUNDEF(MVT::i64) + : DAG.getConstant(Mask[i], DL, MVT::i64)); + SDValue MaskVec = DAG.getBuildVector(MVT::v8i32, DL, Masks); + + return DAG.getNode(LoongArchISD::XVPERM, DL, VT, V1, MaskVec); +} + /// Lower VECTOR_SHUFFLE into XVPACKEV (if possible). static SDValue lowerVECTOR_SHUFFLE_XVPACKEV(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, @@ -2235,6 +2277,8 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, return Result; if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG))) return Result; + if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, NewMask, VT, V1, V2, DAG))) + return Result; if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT, V1, V2, DAG))) return Result; diff --git a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll index fed085843485a..5f76d9951df9c 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll @@ -61,13 +61,8 @@ define <8 x i32> @shuffle_v8i32(<8 x i32> %a) { ; CHECK-LABEL: shuffle_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) -; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI4_0) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_1) -; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI4_1) -; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78 -; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3 -; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0 -; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI4_0) +; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %shuffle = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 
5, i32 6, i32 7> ret <8 x i32> %shuffle @@ -117,13 +112,8 @@ define <8 x float> @shuffle_v8f32(<8 x float> %a) { ; CHECK-LABEL: shuffle_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0) -; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI8_0) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1) -; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI8_1) -; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78 -; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3 -; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0 -; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI8_0) +; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1 ; CHECK-NEXT: ret %shuffle = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7> ret <8 x float> %shuffle >From f934beb1436e5c519df78fdab0e3e94268f1e30f Mon Sep 17 00:00:00 2001 From: Qi Zhao <zhaoq...@loongson.cn> Date: Fri, 1 Aug 2025 12:04:02 +0800 Subject: [PATCH 2/2] opt code style --- llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 6aa848ca7bd07..1d8998ad90ddf 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -1862,7 +1862,7 @@ static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask, // If both the lower and upper 128-bit parts access only one half of the // vector (either lower or upper), avoid using xvperm.w. The latency of // xvperm.w(3) is higher than using xvshuf(1) and xvori(1). 
- if ((FrontLo && (BackLo || BackHi)) || (FrontHi && (BackLo || BackHi))) + if ((FrontLo || FrontHi) && (BackLo || BackHi)) return SDValue(); SmallVector<SDValue, 8> Masks; _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits