https://github.com/zhaoqi5 created https://github.com/llvm/llvm-project/pull/165670
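The patch body below has the full details; as a quick illustration of the masks the new lowering handles, here is a condensed sketch in LLVM IR, adapted from the shufflevector_halves_b and shufflevector_quarters_b tests in the diff. The function names are placeholders, and the RUN invocation is an assumption based on the lsx test directory (e.g. llc --mtriple=loongarch64 --mattr=+lsx):

; Half pattern <i, i+n/2, -1, ...>, here <0, 8, ...> on v16i8. With this patch
; it selects vreplvei.w + vpackev.b instead of a constant-pool load feeding vshuf.b.
define void @halves_sketch(ptr %res, ptr %a) nounwind {
entry:
  %v = load <16 x i8>, ptr %a
  %s = shufflevector <16 x i8> %v, <16 x i8> poison, <16 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  store <16 x i8> %s, ptr %res
  ret void
}

; Quarter pattern <i, i+4, i+8, i+12, -1, ...>, here <0, 4, 8, 12, ...>, the form
; ReplaceNodeResults() produces when legalizing ISD::TRUNCATE. It now selects
; vshuf4i.h/vshuf4i.b + vreplvei.h + vpackev.h.
define void @quarters_sketch(ptr %res, ptr %a) nounwind {
entry:
  %v = load <16 x i8>, ptr %a
  %s = shufflevector <16 x i8> %v, <16 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  store <16 x i8> %s, ptr %res
  ret void
}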
>From 3294ad152c20608cb8cdba054c6bfd4cb4e9d051 Mon Sep 17 00:00:00 2001
From: Qi Zhao <[email protected]>
Date: Thu, 30 Oct 2025 10:34:32 +0800
Subject: [PATCH] [LoongArch] Custom legalize vector_shuffle whose elements
 come from halves or quarters

---
 .../LoongArch/LoongArchISelLowering.cpp       | 102 ++++++++++++++++++
 .../lsx/shufflevector-halves-quarters.ll      |  66 ++++++------
 llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll  |  58 +++++-----
 3 files changed, 165 insertions(+), 61 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 80c96c6dc8eb6..8564fb1fe5560 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1738,6 +1738,105 @@ lowerVECTOR_SHUFFLE_IsReverse(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
                      DAG.getConstant(27, DL, Subtarget.getGRLenVT()));
 }
 
+/// Lower VECTOR_SHUFFLE whose result elements are all undef except for the
+/// first two or four elements, which come from the half or quarter parts of
+/// the source vector.
+///
+/// It is possible to do this optimization for a VECTOR_SHUFFLE whose mask
+/// looks like:
+///   <i, i+n/2, -1, ...>
+/// where n is the number of elements in the vector and i is in [0, n/2). Or:
+///   <i, i+4, i+8, i+12, -1, ...> (only v16i8; the first four may also be undef)
+/// where i is in [0, 4).
+///
+/// For example: <0, 4, -1, ...> or <0, 4, 8, 12, -1, ...>, which appear when
+/// legalizing ISD::TRUNCATE in ReplaceNodeResults().
+static SDValue
+lowerVECTOR_SHUFFLE_HalvesOrQuarters(const SDLoc &DL, ArrayRef<int> Mask,
+                                     MVT VT, SDValue V1, SelectionDAG &DAG,
+                                     const LoongArchSubtarget &Subtarget) {
+  if (VT != MVT::v16i8 && VT != MVT::v8i16)
+    return SDValue();
+
+  int HalfSize = Mask.size() / 2;
+  int QuarterSize = Mask.size() / 4;
+  MVT GRLenVT = Subtarget.getGRLenVT();
+
+  auto allUndefFrom = [&](unsigned idx) -> bool {
+    return llvm::all_of(Mask.drop_front(idx), [](int M) { return M == -1; });
+  };
+
+  auto buildShuffled = [&](MVT CastVT, ArrayRef<int> ShuffleMask) {
+    SDValue Cast = DAG.getBitcast(CastVT, V1);
+    SDValue Shuf = DAG.getVectorShuffle(CastVT, DL, Cast, Cast, ShuffleMask);
+    return DAG.getBitcast(VT, Shuf);
+  };
+
+  // Check pattern: <i, i+HalfSize, -1, ...>
+  int M0 = Mask[0], M1 = Mask[1];
+  if (M0 >= 0 && M0 < HalfSize && M1 == M0 + HalfSize && allUndefFrom(2)) {
+    SDValue SrcVec = V1;
+    // Shuffle vector for various masks to place needed elements at front.
+    if (M0 >= QuarterSize && M0 < QuarterSize + 2)
+      SrcVec = buildShuffled(MVT::v4i32, {1, 0, 3, 2});
+    else if (M0 >= 2 && M0 < 4) // Only v16i8 meets this.
+      SrcVec = buildShuffled(MVT::v8i16, {1, 0, 3, 2, 5, 4, 7, 6});
+    else if (M0 >= 6 && M0 < 8) // Only v16i8 meets this.
+      SrcVec = buildShuffled(MVT::v8i16, {3, 2, 1, 0, 7, 6, 5, 4});
+
+    // Broadcast the needed high part elements.
+    SDValue VecHi = DAG.getNode(LoongArchISD::VREPLVEI, DL, MVT::v4i32,
+                                DAG.getBitcast(MVT::v4i32, SrcVec),
+                                DAG.getConstant(2, DL, GRLenVT));
+
+    unsigned Opc = (M0 % 2) ? LoongArchISD::VPACKOD : LoongArchISD::VPACKEV;
+    return DAG.getNode(Opc, DL, VT, DAG.getBitcast(VT, VecHi), SrcVec);
+  }
+
+  // Only consider quarter cases for v16i8.
+  if (VT != MVT::v16i8)
+    return SDValue();
+
+  // Check pattern: <i, i+4, i+8, i+12, -1, ...>
+  // This still succeeds even if the first four elements contain undef.
+  bool FromQuarters = false;
+  int First = -1;
+  for (int i = 0; i < QuarterSize && !FromQuarters; ++i) {
+    FromQuarters = llvm::all_of(llvm::seq<int>(0, 4), [&](int j) {
+      return Mask[j] == -1 || Mask[j] == i + j * 4;
+    });
+    if (FromQuarters)
+      First = i;
+  }
+
+  if (FromQuarters && allUndefFrom(4)) {
+    SmallVector<int, 8> ShufMask =
+        (First < 2) ? SmallVector<int, 8>{0, 2, 1, 3, 4, 6, 5, 7}
+                    : SmallVector<int, 8>{1, 3, 0, 2, 5, 7, 4, 6};
+    SmallVector<int, 16> ExtractMask =
+        (First % 2) ? SmallVector<int, 16>{1, 3, 0, 2, 5, 7, 4, 6,
+                                           9, 11, 8, 10, 13, 15, 12, 14}
+                    : SmallVector<int, 16>{0, 2, 1, 3, 4, 6, 5, 7,
+                                           8, 10, 9, 11, 12, 14, 13, 15};
+
+    // Shuffle vector for various masks to place needed elements at front.
+    MVT ShufVT = MVT::v8i16;
+    SDValue SrcVec = buildShuffled(ShufVT, ShufMask);
+    SDValue Extract = DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, ExtractMask);
+
+    // Broadcast the needed high part elements.
+    SDValue VecHi = DAG.getNode(LoongArchISD::VREPLVEI, DL, ShufVT,
+                                DAG.getBitcast(ShufVT, Extract),
+                                DAG.getConstant(4, DL, GRLenVT));
+
+    unsigned Opc = (First % 2) ? LoongArchISD::VPACKOD : LoongArchISD::VPACKEV;
+    SDValue Result =
+        DAG.getNode(Opc, DL, ShufVT, VecHi, DAG.getBitcast(ShufVT, Extract));
+    return DAG.getBitcast(VT, Result);
+  }
+
+  return SDValue();
+}
+
 /// Lower VECTOR_SHUFFLE into VPACKEV (if possible).
 ///
 /// VPACKEV interleaves the even elements from each vector.
@@ -2044,6 +2143,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
   if ((Result = lowerVECTOR_SHUFFLE_IsReverse(DL, Mask, VT, V1, DAG,
                                               Subtarget)))
     return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_HalvesOrQuarters(DL, Mask, VT, V1, DAG,
+                                                     Subtarget)))
+    return Result;
 
   // TODO: This comment may be enabled in the future to better match the
   // pattern for instruction selection.
diff --git a/llvm/test/CodeGen/LoongArch/lsx/shufflevector-halves-quarters.ll b/llvm/test/CodeGen/LoongArch/lsx/shufflevector-halves-quarters.ll
index 2a0a107a2b76e..946a4e5524bc0 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/shufflevector-halves-quarters.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/shufflevector-halves-quarters.ll
@@ -6,9 +6,8 @@ define void @shufflevector_halves_b(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_halves_b:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT:    vpackev.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -23,9 +22,9 @@ define void @shufflevector_halves_b_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_halves_b_1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 177
+; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT:    vpackod.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -40,9 +39,9 @@ define void @shufflevector_halves_b_2(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_halves_b_2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 177
+; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT:    vpackod.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -57,9 +56,9 @@ define void @shufflevector_halves_b_3(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_halves_b_3:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 27
+; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT:    vpackev.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -74,10 +73,9 @@ define void @shufflevector_halves_h(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_halves_h:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT:    vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT:    vst $vr1, $a0, 0
+; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT:    vpackev.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
 %va = load <8 x i16>, ptr %a
@@ -91,10 +89,10 @@ define void @shufflevector_halves_h_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_halves_h_1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT:    vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT:    vst $vr1, $a0, 0
+; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 177
+; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT:    vpackod.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
 %va = load <8 x i16>, ptr %a
@@ -108,9 +106,10 @@ define void @shufflevector_quarters_b(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_quarters_b:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 216
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 216
+; CHECK-NEXT:    vreplvei.h $vr1, $vr0, 4
+; CHECK-NEXT:    vpackev.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -125,9 +124,10 @@ define void @shufflevector_quarters_b_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_quarters_b_1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 216
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 141
+; CHECK-NEXT:    vreplvei.h $vr1, $vr0, 4
+; CHECK-NEXT:    vpackod.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -142,9 +142,10 @@ define void @shufflevector_quarters_b_2(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_quarters_b_2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 141
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 216
+; CHECK-NEXT:    vreplvei.h $vr1, $vr0, 4
+; CHECK-NEXT:    vpackev.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -159,9 +160,10 @@ define void @shufflevector_quarters_b_3(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_quarters_b_3:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 141
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 141
+; CHECK-NEXT:    vreplvei.h $vr1, $vr0, 4
+; CHECK-NEXT:    vpackod.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
index 314350acd23d6..9b9016b4e5972 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
@@ -28,20 +28,18 @@ define void @load_trunc_2i64_to_2i16(ptr %ptr, ptr %dst) nounwind {
 ; LA32-LABEL: load_trunc_2i64_to_2i16:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    vld $vr0, $a0, 0
-; LA32-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; LA32-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI1_0)
-; LA32-NEXT:    vshuf.h $vr1, $vr0, $vr0
-; LA32-NEXT:    vpickve2gr.w $a0, $vr1, 0
+; LA32-NEXT:    vreplvei.w $vr1, $vr0, 2
+; LA32-NEXT:    vpackev.h $vr0, $vr1, $vr0
+; LA32-NEXT:    vpickve2gr.w $a0, $vr0, 0
 ; LA32-NEXT:    st.w $a0, $a1, 0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: load_trunc_2i64_to_2i16:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    vld $vr0, $a0, 0
-; LA64-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; LA64-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI1_0)
-; LA64-NEXT:    vshuf.h $vr1, $vr0, $vr0
-; LA64-NEXT:    vstelm.w $vr1, $a1, 0, 0
+; LA64-NEXT:    vreplvei.w $vr1, $vr0, 2
+; LA64-NEXT:    vpackev.h $vr0, $vr1, $vr0
+; LA64-NEXT:    vstelm.w $vr0, $a1, 0, 0
 ; LA64-NEXT:    ret
 %a = load <2 x i64>, ptr %ptr
 %trunc = trunc <2 x i64> %a to <2 x i16>
@@ -53,18 +51,16 @@ define void @load_trunc_2i64_to_2i8(ptr %ptr, ptr %dst) nounwind {
 ; LA32-LABEL: load_trunc_2i64_to_2i8:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    vld $vr0, $a0, 0
-; LA32-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; LA32-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI2_0)
-; LA32-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; LA32-NEXT:    vreplvei.w $vr1, $vr0, 2
+; LA32-NEXT:    vpackev.b $vr0, $vr1, $vr0
 ; LA32-NEXT:    vstelm.h $vr0, $a1, 0, 0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: load_trunc_2i64_to_2i8:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    vld $vr0, $a0, 0
-; LA64-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; LA64-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI2_0)
-; LA64-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; LA64-NEXT:    vreplvei.w $vr1, $vr0, 2
+; LA64-NEXT:    vpackev.b $vr0, $vr1, $vr0
 ; LA64-NEXT:    vstelm.h $vr0, $a1, 0, 0
 ; LA64-NEXT:    ret
 %a = load <2 x i64>, ptr %ptr
@@ -100,9 +96,10 @@ define void @load_trunc_4i32_to_4i8(ptr %ptr, ptr %dst) nounwind {
 ; LA32-LABEL: load_trunc_4i32_to_4i8:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    vld $vr0, $a0, 0
-; LA32-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; LA32-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI4_0)
-; LA32-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; LA32-NEXT:    vshuf4i.h $vr0, $vr0, 216
+; LA32-NEXT:    vshuf4i.b $vr0, $vr0, 216
+; LA32-NEXT:    vreplvei.h $vr1, $vr0, 4
+; LA32-NEXT:    vpackev.h $vr0, $vr1, $vr0
 ; LA32-NEXT:    vpickve2gr.w $a0, $vr0, 0
 ; LA32-NEXT:    st.w $a0, $a1, 0
 ; LA32-NEXT:    ret
 ;
@@ -110,9 +107,10 @@ define void @load_trunc_4i32_to_4i8(ptr %ptr, ptr %dst) nounwind {
 ; LA64-LABEL: load_trunc_4i32_to_4i8:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    vld $vr0, $a0, 0
-; LA64-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; LA64-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI4_0)
-; LA64-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; LA64-NEXT:    vshuf4i.h $vr0, $vr0, 216
+; LA64-NEXT:    vshuf4i.b $vr0, $vr0, 216
+; LA64-NEXT:    vreplvei.h $vr1, $vr0, 4
+; LA64-NEXT:    vpackev.h $vr0, $vr1, $vr0
 ; LA64-NEXT:    vstelm.w $vr0, $a1, 0, 0
 ; LA64-NEXT:    ret
 %a = load <4 x i32>, ptr %ptr
@@ -174,21 +172,23 @@ define void @load_trunc_2i32_to_2i8(ptr %ptr, ptr %dst) nounwind {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    ld.w $a2, $a0, 0
 ; LA32-NEXT:    ld.w $a0, $a0, 4
-; LA32-NEXT:    pcalau12i $a3, %pc_hi20(.LCPI7_0)
-; LA32-NEXT:    vld $vr0, $a3, %pc_lo12(.LCPI7_0)
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a0, 1
-; LA32-NEXT:    vshuf.b $vr0, $vr0, $vr1, $vr0
+; LA32-NEXT:    vinsgr2vr.w $vr0, $a2, 0
+; LA32-NEXT:    vinsgr2vr.w $vr0, $a0, 1
+; LA32-NEXT:    vshuf4i.h $vr0, $vr0, 216
+; LA32-NEXT:    vshuf4i.b $vr0, $vr0, 216
+; LA32-NEXT:    vreplvei.h $vr1, $vr0, 4
+; LA32-NEXT:    vpackev.h $vr0, $vr1, $vr0
 ; LA32-NEXT:    vstelm.h $vr0, $a1, 0, 0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: load_trunc_2i32_to_2i8:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    ld.d $a0, $a0, 0
-; LA64-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI7_0)
-; LA64-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI7_0)
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a0, 0
-; LA64-NEXT:    vshuf.b $vr0, $vr0, $vr1, $vr0
+; LA64-NEXT:    vinsgr2vr.d $vr0, $a0, 0
+; LA64-NEXT:    vshuf4i.h $vr0, $vr0, 216
+; LA64-NEXT:    vshuf4i.b $vr0, $vr0, 216
+; LA64-NEXT:    vreplvei.h $vr1, $vr0, 4
+; LA64-NEXT:    vpackev.h $vr0, $vr1, $vr0
 ; LA64-NEXT:    vstelm.h $vr0, $a1, 0, 0
 ; LA64-NEXT:    ret
 %a = load <2 x i32>, ptr %ptr

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
