When compiling the following code with SIMDe on AArch64:

        __m128i lo = _mm_srli_si128(a, 12);
        __m128i hi = _mm_slli_si128(b, 4);
        __m128i res = _mm_blend_epi16(hi, lo, 3);

current GCC produces:

        mov     v31.4s, 0
        ext     v30.16b, v0.16b, v31.16b, #12
        ext     v0.16b, v31.16b, v1.16b, #12
        ins     v0.s[0], v30.s[0]

instead of the more efficient:

        ext     v0.16b, v0.16b, v1.16b, #12

GCC builds three VEC_PERM_EXPRs for the intrinsic calls. The first two
implement the vector shifts and the last one implements the blend, but they
use different vector modes. Forward propagation fails to optimize this
case because the intervening VIEW_CONVERT_EXPRs block the folding.

This patch adds a match.pd pattern that recognizes the concat-and-extract
idiom and folds the VEC_PERM_EXPR chain, even when VIEW_CONVERT_EXPRs
split the chain.

Bootstrapped and tested on aarch64-linux-gnu and x86_64-linux-gnu.

gcc/ChangeLog:

        * match.pd: Fold VEC_PERM_EXPR chains implementing vector
        concat-and-extract.

gcc/testsuite/ChangeLog:

        * gcc.dg/fold-vecperm-1.c: New test.
---
 gcc/match.pd                          | 53 +++++++++++++++++++++++++++
 gcc/testsuite/gcc.dg/fold-vecperm-1.c | 23 ++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/fold-vecperm-1.c

diff --git a/gcc/match.pd b/gcc/match.pd
index a4248a521cf..7b86db54a78 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -11872,6 +11872,59 @@ and,
       (if (full_perm_p)
        (vec_perm (op@3 @0 @1) @3 @2))))))
 
+/* Fold
+     x = VEC_PERM_EXPR <a, ANY, sel0>;
+     y = VEC_PERM_EXPR <ANY, b, sel0>;
+     c = VEC_PERM_EXPR <x, y, sel1>;
+   into
+     c = VEC_PERM_EXPR <a, b, sel0>;
+   if sel0 and sel1 together extract a contiguous subvector of the conceptual
+   concatenation [ a | b ].  x and y may be behind VIEW_CONVERT_EXPRs.  */
+(simplify
+ (vec_perm (view_convert? (vec_perm @0 @4 VECTOR_CST@2))
+          (view_convert? (vec_perm @5 @1 VECTOR_CST@2))
+          VECTOR_CST@3)
+ (with
+  {
+    bool can_fold = false;
+    unsigned HOST_WIDE_INT nelts;
+    vec_perm_builder builder;
+    if (TYPE_VECTOR_SUBPARTS (type).is_constant (&nelts)
+       && tree_to_vec_perm_builder (&builder, @2))
+      {
+       /* Fold when sel0 is a run of consecutive indices and sel1 is
+          [ ia | ib ], ia taking the elements of x that came from 'a' and ib
+          those of y that came from 'b'.  NOTE(review): ia_size wraps if
+          elt_size * sel0_first_idx > TYPE_SIZE (type) -- TODO confirm.  */
+       vec_perm_indices sel0 (builder, 2, VECTOR_CST_NELTS (@2));
+       unsigned int sel0_first_idx = sel0[0].to_constant ();
+       unsigned int elt_size = vector_element_bits (TREE_TYPE (@0));
+       unsigned int ia_size = tree_to_uhwi (TYPE_SIZE (type))
+                              - elt_size * sel0_first_idx;
+       unsigned int ib_start;
+       if (sel0.series_p (0, 1, sel0_first_idx, 1)
+           && multiple_p (ia_size, vector_element_bits (type), &ib_start)
+           && tree_to_vec_perm_builder (&builder, @3))
+         {
+           /* The ib part must be consecutive indices starting at
+              'nelts + ib_start', i.e. at element ib_start of y.  */
+           vec_perm_indices sel1 (builder, 2, VECTOR_CST_NELTS (@3));
+           can_fold = sel1.series_p (ib_start, 1, nelts + ib_start, 1);
+
+           /* The ia part must be exactly [0 ... ib_start - 1].  */
+           if (can_fold)
+             for (unsigned int i = 0; i < ib_start; i++)
+               if (sel1[i].to_constant () != i)
+                 {
+                   can_fold = false;
+                   break;
+                 }
+         }
+      }
+  }
+  (if (can_fold)
+    (view_convert (vec_perm @0 @1 @2)))))
+
 #if GIMPLE
 /* Simplify (a >> 1) + (b >> 1) + ((a | b) & 1) to .AVG_CEIL (a, b).
    Similar for (a | b) - ((a ^ b) >> 1).  */
diff --git a/gcc/testsuite/gcc.dg/fold-vecperm-1.c b/gcc/testsuite/gcc.dg/fold-vecperm-1.c
new file mode 100644
index 00000000000..5d4456b98b1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-vecperm-1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+
+typedef union {  /* One 16-byte vector, viewable as v4si or as v8hi.  */
+  v4si s;
+  v8hi h;
+} int128;
+
+int128 concat (int128 a, int128 b) {
+  int128 x, y, res;
+  v4si zero = { 0, 0, 0, 0 };
+  v4si sel0 = { 3, 4, 5, 6 };  /* Elements 3..6 of [ first | second ].  */
+  v8hi sel1 = { 0, 1, 10, 11, 12, 13, 14, 15 };  /* x.h[0..1], y.h[2..7].  */
+  x.s = __builtin_shuffle (a.s, zero, sel0);  /* { a.s[3], 0, 0, 0 }  */
+  y.s = __builtin_shuffle (zero, b.s, sel0);  /* { 0, b.s[0..2] }  */
+  res.h = __builtin_shuffle (x.h, y.h, sel1);  /* a.s[3], b.s[0..2].  */
+  return res;
+}
+
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 1 "optimized" } } */
-- 
2.43.0

Reply via email to