On LoongArch, when the permutation indices select elements from two
different vectors and no index is repeated (modulo the vector length),
we can optimize the shuffle of V8SI/V8SF/V4DI/V4DF vectors using two
xvperm.w plus one xvbitsel.v instructions, or two xvpermi.d plus one
xvbitsel.v instructions.
gcc/ChangeLog:
* config/loongarch/loongarch.cc
(loongarch_expand_vec_perm_generic_bitsel):
Add new vector shuffle optimize function.
(loongarch_expand_vec_perm_const): Adjust.
gcc/testsuite/ChangeLog:
* gcc.target/loongarch/vec_perm-xvbitsel-2.c: New test.
* gcc.target/loongarch/vec_perm-xvbitsel-3.c: New test.
---
gcc/config/loongarch/loongarch.cc | 112 ++++++++++++++++++
.../loongarch/vec_perm-xvbitsel-2.c | 18 +++
.../loongarch/vec_perm-xvbitsel-3.c | 22 ++++
3 files changed, 152 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-2.c
create mode 100644 gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-3.c
diff --git a/gcc/config/loongarch/loongarch.cc
b/gcc/config/loongarch/loongarch.cc
index 76011eec1e5..d4beb1af253 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -9457,6 +9457,115 @@ loongarch_expand_vec_perm_bitsel (struct
expand_vec_perm_d *d)
return true;
}
+/* A general shuffle method for 256-bit V8SI/V8SF/V4DI/V4DF types when
+   the permute indices come from different vectors and are not repeated.  */
+static bool
+loongarch_expand_vec_perm_generic_bitsel (struct expand_vec_perm_d *d)
+{
+ if (!ISA_HAS_LASX)
+ return false;
+
+ auto_bitmap used;
+ machine_mode mode = d->vmode;
+ int nelt = d->nelt, val, i;
+
+  /* Due to instruction set restrictions, only the following types support
+     this optimization method.  */
+ if (mode != E_V8SImode && mode != E_V8SFmode
+ && mode != E_V4DImode && mode != E_V4DFmode)
+ return false;
+
+  /* Ensure the values d->perm[i] % nelt are pairwise distinct.  */
+ for (i = 0; i < nelt; i += 1)
+ {
+ if (bitmap_bit_p (used, d->perm[i] % nelt))
+ return false;
+ else
+ bitmap_set_bit (used, d->perm[i] % nelt);
+ }
+
+ if (d->testing_p)
+ return true;
+
+ rtx reg_bitsel, tmp_bitsel, sel_bitsel, op0, op1;
+ rtx rmap_bitsel[MAX_VECT_LEN];
+ op0 = gen_reg_rtx (mode);
+ op1 = gen_reg_rtx (mode);
+ reg_bitsel = gen_reg_rtx (mode);
+
+ if (mode == E_V8SImode || mode == E_V8SFmode)
+ {
+ rtx rmap_xvperm[MAX_VECT_LEN];
+ rtx sel_xvperm, reg_xvperm;
+
+ for (i = 0; i < nelt; i += 1)
+ {
+	  /* For the xvperm insn we just copy the original permutation index.  */
+ rmap_xvperm[i] = GEN_INT (d->perm[i]);
+ val = d->perm[i] >= nelt ? -1 : 0;
+ /* For xvbitsel insn we should do some conversion, where -1 means
+ the destination element comes from operand1, and 0 means the
+ destination element comes from operand0. */
+ rmap_bitsel[i] = GEN_INT (val);
+ }
+
+ reg_xvperm = gen_reg_rtx (E_V8SImode);
+
+ /* Prepare reg of selective index for xvperm. */
+ sel_xvperm = gen_rtx_CONST_VECTOR (E_V8SImode,
+ gen_rtvec_v (nelt, rmap_xvperm));
+ emit_move_insn (reg_xvperm, sel_xvperm);
+
+ /* Prepare reg of selective index for xvbitsel. */
+ sel_bitsel = gen_rtx_CONST_VECTOR (E_V8SImode,
+ gen_rtvec_v (nelt, rmap_bitsel));
+ if (mode == E_V8SFmode)
+ {
+ tmp_bitsel = simplify_gen_subreg (E_V8SImode, reg_bitsel, mode, 0);
+ emit_move_insn (tmp_bitsel, sel_bitsel);
+ }
+ else
+ emit_move_insn (reg_bitsel, sel_bitsel);
+
+ emit_insn (gen_lasx_xvperm (mode, op0, d->op0, reg_xvperm));
+ emit_insn (gen_lasx_xvperm (mode, op1, d->op1, reg_xvperm));
+ emit_insn (gen_simd_vbitsel (mode, d->target, op0, op1, reg_bitsel));
+ }
+ else
+ {
+ unsigned int imm = 0;
+ unsigned int val2;
+
+ for (i = nelt - 1; i >= 0; i -= 1)
+ {
+ val = d->perm[i] >= nelt ? -1 : 0;
+ rmap_bitsel[i] = GEN_INT (val);
+ val2 = d->perm[i] % nelt;
+ imm |= val2;
+ imm = (i != 0) ? imm << 2 : imm;
+ }
+
+ /* Prepare reg of selective index for xvbitsel. */
+ sel_bitsel = gen_rtx_CONST_VECTOR (E_V4DImode,
+ gen_rtvec_v (nelt, rmap_bitsel));
+ if (mode == E_V4DFmode)
+ {
+ tmp_bitsel = simplify_gen_subreg (E_V4DImode, reg_bitsel, mode, 0);
+ emit_move_insn (tmp_bitsel, sel_bitsel);
+ }
+ else
+ emit_move_insn (reg_bitsel, sel_bitsel);
+
+ gcc_assert (mode == E_V4DFmode || mode == E_V4DImode);
+
+ emit_insn (gen_lasx_xvpermi_d (mode, op0, d->op0, GEN_INT (imm)));
+ emit_insn (gen_lasx_xvpermi_d (mode, op1, d->op1, GEN_INT (imm)));
+ emit_insn (gen_simd_vbitsel (mode, d->target, op0, op1, reg_bitsel));
+ }
+
+ return true;
+}
+
/* Following are the assist function for const vector permutation support. */
static bool
loongarch_is_quad_duplicate (struct expand_vec_perm_d *d)
@@ -9977,6 +10086,9 @@ loongarch_expand_vec_perm_const (struct
expand_vec_perm_d *d)
goto expand_perm_const_end;
}
+ if (loongarch_expand_vec_perm_generic_bitsel (d))
+ return true;
+
expand_perm_const_end:
if (flag)
{
diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-2.c
b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-2.c
new file mode 100644
index 00000000000..3c38199126a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-2.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlasx" } */
+/* { dg-final { scan-assembler "xvpermi.d" } } */
+/* { dg-final { scan-assembler-not "xvrepli.w" } } */
+/* { dg-final { scan-assembler-not "xvand.v" } } */
+/* { dg-final { scan-assembler-not "xvseq.w" } } */
+
+void
+foo (double a[], double b[], double c[])
+{
+ for (int i = 0; i < 800; i += 4)
+ {
+ c[i + 0] = a[i + 0] + b[i + 0];
+ c[i + 1] = a[i + 2] - b[i + 2];
+ c[i + 2] = a[i + 3] - b[i + 3];
+ c[i + 3] = a[i + 1] + b[i + 1];
+ }
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-3.c
b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-3.c
new file mode 100644
index 00000000000..065c816a15d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-3.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlasx" } */
+/* { dg-final { scan-assembler "xvperm.w" } } */
+/* { dg-final { scan-assembler-not "xvrepli.w" } } */
+/* { dg-final { scan-assembler-not "xvand.v" } } */
+/* { dg-final { scan-assembler-not "xvseq.w" } } */
+
+void
+foo (float a[], float b[], float c[])
+{
+ for (int i = 0; i < 800; i += 8)
+ {
+ c[i + 0] = a[i + 0] + b[i + 0];
+ c[i + 1] = a[i + 1] + b[i + 1];
+ c[i + 2] = a[i + 4] - b[i + 4];
+ c[i + 3] = a[i + 5] - b[i + 5];
+ c[i + 4] = a[i + 2] - b[i + 2];
+ c[i + 5] = a[i + 3] - b[i + 3];
+ c[i + 6] = a[i + 6] + b[i + 6];
+ c[i + 7] = a[i + 7] + b[i + 7];
+ }
+}
--
2.38.1