In LoongArch, the xvshuf.{b/h/w/d} instructions can handle the case where
the low 128-bit half of the target vector is a shuffle of the concatenation
of the low 128-bit halves of the two input vectors, and the high 128-bit
half of the target vector is likewise a shuffle of the concatenation of
their high 128-bit halves.  Recognize this situation and use the xvshuf
instructions to optimize it.
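
As an illustration (not part of the patch), such a permutation can be
written with GCC's generic vector extension; the selector below is the
E_V8SImode example used in the new code, and the function name is just a
placeholder:

  typedef int v8si __attribute__ ((vector_size (32)));

  /* Result lanes 0-3 read only lanes {0-3, 8-11}, i.e. the low 128-bit
     halves of a and b; result lanes 4-7 read only lanes {4-7, 12-15},
     i.e. the high 128-bit halves.  With -mlasx this can now be emitted
     as a single xvshuf.w.  */
  v8si
  shuffle_halves (v8si a, v8si b)
  {
    return __builtin_shufflevector (a, b, 0, 9, 2, 11, 4, 13, 6, 15);
  }

(__builtin_shufflevector requires GCC 12 or later.)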
gcc/ChangeLog:
* config/loongarch/loongarch.cc (loongarch_if_match_xvshuffle):
New function.
(loongarch_expand_vec_perm_const): Add new condition to use the
xvshuf instructions.
gcc/testsuite/ChangeLog:
* gcc.target/loongarch/vector/lasx/vec_perm-verify-xvshuf.c: New test.
* gcc.target/loongarch/vector/lasx/vec_perm-xvshuf.c: New test.
---
gcc/config/loongarch/loongarch.cc | 69 ++++++++++++
.../vector/lasx/vec_perm-verify-xvshuf.c | 106 ++++++++++++++++++
.../loongarch/vector/lasx/vec_perm-xvshuf.c | 17 +++
3 files changed, 192 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lasx/vec_perm-verify-xvshuf.c
create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lasx/vec_perm-xvshuf.c
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index c1a53e3e7ca..e7cb798df8b 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -9575,6 +9575,34 @@ loongarch_is_elem_duplicate (struct expand_vec_perm_d *d)
return result;
}
+/* Return true if the low 128-bit half of the target vector is shuffled only
+   from the low 128-bit halves of op0 and op1, and the high 128-bit half only
+   from their high 128-bit halves, in which case the permutation can be
+   matched by the corresponding xvshuf.{h/w/d} instruction.  */
+static bool
+loongarch_if_match_xvshuffle (struct expand_vec_perm_d *d)
+{
+ for (int i = 0; i < d->nelt; i++)
+ {
+ unsigned char buf = d->perm[i];
+
+ if (i < d->nelt / 2)
+ {
+ if ((buf >= d->nelt / 2 && buf < d->nelt)
+ || buf >= (d->nelt + d->nelt / 2))
+ return false;
+ }
+ else
+ {
+ if ((buf >= d->nelt && buf < (d->nelt + d->nelt / 2))
+ || buf < d->nelt / 2)
+ return false;
+ }
+ }
+
+ return true;
+}
+
/* In LASX, some permutation insn does not have the behavior that gcc expects
when compiler wants to emit a vector permutation.
@@ -9827,6 +9855,47 @@ loongarch_expand_vec_perm_const (struct expand_vec_perm_d *d)
return true;
}
+ if (loongarch_if_match_xvshuffle (d))
+ {
+ if (d->testing_p)
+ return true;
+
+ /* Selector example: E_V8SImode, { 0, 9, 2, 11, 4, 13, 6, 15 }. */
+ /* If the low 128-bit half of the target takes op1's low 128-bit
+ elements {9, 11}, we subtract half of d->nelt so the indices fall
+ in range [4, 7], forming the 256-bit intermediate vector vec0.
+ Similarly, if the high 128-bit half of the target takes op0's
+ high 128-bit elements {4, 6}, we subtract half of d->nelt so the
+ indices fall in range [0, 3], forming the intermediate vector vec1.
+ Finally, if the high 128-bit half of the target takes op1's high
+ 128-bit elements {13, 15}, we take them modulo d->nelt so the
+ indices fall in range [4, 7], forming the intermediate vector vec1. */
+ for (i = 0; i < d->nelt; i += 1)
+ {
+ if (i < d->nelt / 2)
+ {
+ if (d->perm[i] >= d->nelt)
+ remapped[i] = d->perm[i] - d->nelt / 2;
+ else
+ remapped[i] = d->perm[i];
+ }
+ else
+ {
+ if (d->perm[i] < d->nelt)
+ remapped[i] = d->perm[i] - d->nelt / 2;
+ else
+ remapped[i] = d->perm[i] % d->nelt;
+ }
+ }
+
+ /* Selector after: { 0, 5, 2, 7, 0, 5, 2, 7 }. */
+ for (i = 0; i < d->nelt; i += 1)
+ rperm[i] = GEN_INT (remapped[i]);
+
+ flag = true;
+ goto expand_perm_const_end;
+ }
+
expand_perm_const_end:
if (flag)
{
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vec_perm-verify-xvshuf.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vec_perm-verify-xvshuf.c
new file mode 100644
index 00000000000..17e6e661f06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vec_perm-verify-xvshuf.c
@@ -0,0 +1,106 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mlasx -w -fno-strict-aliasing" } */
+
+#include "../simd_correctness_check.h"
+#include <lasxintrin.h>
+#define N 16
+
+typedef int TYPE;
+
+void
+foo (TYPE a[], TYPE b[], TYPE c[])
+{
+ for (int i = 0; i < N; i += 2)
+ {
+ c[i + 0] = a[i + 0] + b[i + 0];
+ c[i + 1] = a[i + 1] - b[i + 1];
+ }
+}
+
+__m256i
+change_to_256vec (TYPE c[], int offset)
+{
+ __m256i __m256i_op;
+ int type_bit_len = sizeof (TYPE) * 8;
+ long int tmp;
+
+ for (int i = offset; i < 256 / type_bit_len + offset; i += 2)
+ {
+ __m256i_op[(i - offset) / 2] = 0x0;
+ __m256i_op[(i - offset) / 2] |= c[i];
+ tmp = ((long int)c[i + 1] << type_bit_len);
+ __m256i_op[(i - offset) / 2] |= tmp;
+ }
+
+ return __m256i_op;
+}
+
+int
+main ()
+{
+ TYPE a[N], b[N], c[N];
+ for (int i = 0; i < N; ++i)
+ {
+ a[i] = i;
+ b[i] = N + i;
+ }
+
+ // c = {16,-16,20,-16,24,-16,28,-16,32,-16,36,-16,40,-16,44,-16};
+ foo (a, b, c);
+
+ __m256i ans1 = change_to_256vec (c, 0);
+ __m256i ans2 = change_to_256vec (c, N / 2);
+
+ __m256i __m256i_op0, __m256i_op1, __m256i_op2, __m256i_out, __m256i_result;
+ __m256i __m256i_op3, __m256i_op4, __m256i_op5, __m256i_out2, __m256i_result2;
+
+ *((unsigned long *)&__m256i_op0[3]) = 0x0000000700000002;
+ *((unsigned long *)&__m256i_op0[2]) = 0x0000000500000000;
+ *((unsigned long *)&__m256i_op0[1]) = 0x0000000700000002;
+ *((unsigned long *)&__m256i_op0[0]) = 0x0000000500000000;
+
+ *((unsigned long *)&__m256i_op1[3]) = 0x0000001e0000001c;
+ *((unsigned long *)&__m256i_op1[2]) = 0x0000001a00000018;
+ *((unsigned long *)&__m256i_op1[1]) = 0x0000001600000014;
+ *((unsigned long *)&__m256i_op1[0]) = 0x0000001200000010;
+
+ *((unsigned long *)&__m256i_op2[3]) = 0xfffffff0fffffff0;
+ *((unsigned long *)&__m256i_op2[2]) = 0xfffffff0fffffff0;
+ *((unsigned long *)&__m256i_op2[1]) = 0xfffffff0fffffff0;
+ *((unsigned long *)&__m256i_op2[0]) = 0xfffffff0fffffff0;
+
+ // __m256i_result = {16,-16,20,-16,24,-16,28,-16};
+ *((unsigned long *)&__m256i_result[3]) = 0xfffffff00000001c;
+ *((unsigned long *)&__m256i_result[2]) = 0xfffffff000000018;
+ *((unsigned long *)&__m256i_result[1]) = 0xfffffff000000014;
+ *((unsigned long *)&__m256i_result[0]) = 0xfffffff000000010;
+
+ __m256i_out = __lasx_xvshuf_w (__m256i_op0, __m256i_op2, __m256i_op1);
+ ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out);
+ ASSERTEQ_64 (__LINE__, ans1, __m256i_out);
+
+ *((unsigned long *)&__m256i_op3[3]) = 0x0000000700000002;
+ *((unsigned long *)&__m256i_op3[2]) = 0x0000000500000000;
+ *((unsigned long *)&__m256i_op3[1]) = 0x0000000700000002;
+ *((unsigned long *)&__m256i_op3[0]) = 0x0000000500000000;
+
+ *((unsigned long *)&__m256i_op4[3]) = 0x0000002e0000002c;
+ *((unsigned long *)&__m256i_op4[2]) = 0x0000002a00000028;
+ *((unsigned long *)&__m256i_op4[1]) = 0x0000002600000024;
+ *((unsigned long *)&__m256i_op4[0]) = 0x0000002200000020;
+
+ *((unsigned long *)&__m256i_op5[3]) = 0xfffffff0fffffff0;
+ *((unsigned long *)&__m256i_op5[2]) = 0xfffffff0fffffff0;
+ *((unsigned long *)&__m256i_op5[1]) = 0xfffffff0fffffff0;
+ *((unsigned long *)&__m256i_op5[0]) = 0xfffffff0fffffff0;
+
+ // __m256i_result2 = {32,-16,36,-16,40,-16,44,-16};
+ *((unsigned long *)&__m256i_result2[3]) = 0xfffffff00000002c;
+ *((unsigned long *)&__m256i_result2[2]) = 0xfffffff000000028;
+ *((unsigned long *)&__m256i_result2[1]) = 0xfffffff000000024;
+ *((unsigned long *)&__m256i_result2[0]) = 0xfffffff000000020;
+
+ __m256i_out2 = __lasx_xvshuf_w (__m256i_op3, __m256i_op5, __m256i_op4);
+ ASSERTEQ_64 (__LINE__, __m256i_result2, __m256i_out2);
+ ASSERTEQ_64 (__LINE__, ans2, __m256i_out2);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vec_perm-xvshuf.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vec_perm-xvshuf.c
new file mode 100644
index 00000000000..6b19c2c2fd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vec_perm-xvshuf.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlasx" } */
+/* { dg-final { scan-assembler "xvshuf.w" } } */
+/* { dg-final { scan-assembler-not "xvperm.w" } } */
+/* { dg-final { scan-assembler-not "xvbitsel.v" } } */
+
+void
+foo (int a[], int b[], int c[])
+{
+ for (int i = 0; i < 100; i += 4)
+ {
+ c[i + 0] = a[i + 0] + b[i + 0];
+ c[i + 1] = a[i + 1] - b[i + 1];
+ c[i + 2] = a[i + 2] - b[i + 2];
+ c[i + 3] = a[i + 3] + b[i + 3];
+ }
+}
--
2.38.1