https://gcc.gnu.org/g:a46dffee33a3a4bf52eb2d493aace3e77068318f

commit r16-4591-ga46dffee33a3a4bf52eb2d493aace3e77068318f
Author: Pengfei Li <[email protected]>
Date:   Wed Oct 22 11:17:07 2025 +0000

    match.pd: Fold VEC_PERM_EXPR chains implementing concat-and-extract
    
    When compiling the following code with SIMDe on AArch64:
    
            __m128i lo = _mm_srli_si128(a, 12);
            __m128i hi = _mm_slli_si128(b, 4);
            __m128i res = _mm_blend_epi16(hi, lo, 3);
    
    current GCC produces:
    
            mov     v31.4s, 0
            ext     v30.16b, v0.16b, v31.16b, #12
            ext     v0.16b, v31.16b, v1.16b, #12
            ins     v0.s[0], v30.s[0]
    
    instead of the more efficient:
    
            ext     v0.16b, v0.16b, v1.16b, #12
    
    GCC builds three VEC_PERM_EXPRs for the intrinsic calls. The first two
    implement the vector shifts and the final one implements the blend, but
    the blend uses a different vector mode from the shifts. Forward
    propagation fails to optimize this case because the VIEW_CONVERT_EXPRs
    in between block the folding.
    
    This patch adds a match.pd pattern to recognize the concat-and-extract
    idiom and folds the VEC_PERM_EXPR chain, even when VIEW_CONVERT_EXPRs
    split the chain.
    
    Bootstrapped and tested on aarch64-linux-gnu and x86_64-linux-gnu.
    
    gcc/ChangeLog:
    
            * match.pd: Fold VEC_PERM_EXPR chains implementing vector
            concat-and-extract.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.dg/fold-vecperm-1.c: New test.

Diff:
---
 gcc/match.pd                          | 53 +++++++++++++++++++++++++++++++++++
 gcc/testsuite/gcc.dg/fold-vecperm-1.c | 23 +++++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/gcc/match.pd b/gcc/match.pd
index 4c05fe2c1a6d..b37a43713360 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -11927,6 +11927,59 @@ and,
       (if (full_perm_p)
        (vec_perm (op@3 @0 @1) @3 @2))))))
 
+/* Fold
+     x = VEC_PERM_EXPR <a, ANY, sel0>;
+     y = VEC_PERM_EXPR <ANY, b, sel0>;
+     c = VEC_PERM_EXPR <x, y, sel1>;
+   into
+     c = VEC_PERM_EXPR <a, b, sel0>;
+   if sel0 combined with sel1 denotes extracting a contiguous subvector from
+   the conceptual concatenated [ a | b ].  */
+(simplify
+ (vec_perm (view_convert? (vec_perm @0 @4 VECTOR_CST@2))
+          (view_convert? (vec_perm @5 @1 VECTOR_CST@2))
+          VECTOR_CST@3)
+ (with
+  {
+    bool can_fold = false;
+    unsigned HOST_WIDE_INT nelts;
+    vec_perm_builder builder;
+    if (TYPE_VECTOR_SUBPARTS (type).is_constant (&nelts)
+       && tree_to_vec_perm_builder (&builder, @2))
+      {
+       /* Set can_fold to true when
+          - sel0 is a vector of consecutive indices.
+          - sel1 is composed of two parts of consecutive indices [ ia | ib ],
+            selecting the elements originally in 'a' and 'b', respectively.  */
+       vec_perm_indices sel0 (builder, 2, VECTOR_CST_NELTS (@2));
+       unsigned int sel0_first_idx = sel0[0].to_constant ();
+       unsigned int elt_size = vector_element_bits (TREE_TYPE (@0));
+       unsigned int ia_size = tree_to_uhwi (TYPE_SIZE (type))
+                              - elt_size * sel0_first_idx;
+       unsigned int ib_start;
+       if (sel0.series_p (0, 1, sel0_first_idx, 1)
+           && multiple_p (ia_size, vector_element_bits (type), &ib_start)
+           && tree_to_vec_perm_builder (&builder, @3))
+         {
+           /* Check if the ib part contains consecutive indices starting from
+              'nelts + ib_start'.  */
+           vec_perm_indices sel1 (builder, 2, VECTOR_CST_NELTS (@3));
+           can_fold = sel1.series_p (ib_start, 1, nelts + ib_start, 1);
+
+           /* Check if the ia part contains indices [0 ... ib_start - 1].  */
+           if (can_fold)
+             for (unsigned int i = 0; i < ib_start; i++)
+               if (sel1[i].to_constant () != i)
+                 {
+                   can_fold = false;
+                   break;
+                 }
+         }
+      }
+  }
+  (if (can_fold)
+    (view_convert (vec_perm @0 @1 @2)))))
+
 #if GIMPLE
 /* Simplify (a >> 1) + (b >> 1) + ((a | b) & 1) to .AVG_CEIL (a, b).
    Similar for (a | b) - ((a ^ b) >> 1).  */
diff --git a/gcc/testsuite/gcc.dg/fold-vecperm-1.c b/gcc/testsuite/gcc.dg/fold-vecperm-1.c
new file mode 100644
index 000000000000..5d4456b98b15
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-vecperm-1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+
+typedef union {
+  v4si s;
+  v8hi h;
+} int128;
+
+int128 concat (int128 a, int128 b) {
+  int128 x, y, res;
+  v4si zero = { 0, 0, 0, 0 };
+  v4si sel0 = { 3, 4, 5, 6 };
+  v8hi sel1 = { 0, 1, 10, 11, 12, 13, 14, 15 };
+  x.s = __builtin_shuffle (a.s, zero, sel0);
+  y.s = __builtin_shuffle (zero, b.s, sel0);
+  res.h = __builtin_shuffle (x.h, y.h, sel1);
+  return res;
+}
+
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 1 "optimized" } } */

Reply via email to