On Wed, 22 Oct 2025, Pengfei Li wrote:

> When compiling the following code with SIMDe on AArch64:
> 
>       __m128i lo = _mm_srli_si128(a, 12);
>       __m128i hi = _mm_slli_si128(b, 4);
>       __m128i res = _mm_blend_epi16(hi, lo, 3);
> 
> current GCC produces:
> 
>       mov     v31.4s, 0
>       ext     v30.16b, v0.16b, v31.16b, #12
>       ext     v0.16b, v31.16b, v1.16b, #12
>       ins     v0.s[0], v30.s[0]
> 
> instead of the more efficient:
> 
>       ext     v0.16b, v0.16b, v1.16b, #12
> 
> GCC builds three VEC_PERM_EXPRs for the intrinsic calls. The first two
> implement vector shifts and the final one implements the blend, but they
> use different vector modes. The forward propagation fails to optimize
> this case because VIEW_CONVERT_EXPRs in between block the folding.
> 
> This patch adds a match.pd pattern that recognizes the
> concat-and-extract idiom and folds the VEC_PERM_EXPR chain, even when
> VIEW_CONVERT_EXPRs split the chain.
> 
> Bootstrapped and tested on aarch64-linux-gnu and x86_64-linux-gnu.

OK.

Thanks,
Richard.

> gcc/ChangeLog:
> 
>       * match.pd: Fold VEC_PERM_EXPR chains implementing vector
>       concat-and-extract.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.dg/fold-vecperm-1.c: New test.
> ---
>  gcc/match.pd                          | 53 +++++++++++++++++++++++++++
>  gcc/testsuite/gcc.dg/fold-vecperm-1.c | 23 ++++++++++++
>  2 files changed, 76 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/fold-vecperm-1.c
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index a4248a521cf..7b86db54a78 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -11872,6 +11872,59 @@ and,
>        (if (full_perm_p)
>       (vec_perm (op@3 @0 @1) @3 @2))))))
>  
> +/* Fold
> +     x = VEC_PERM_EXPR <a, ANY, sel0>;
> +     y = VEC_PERM_EXPR <ANY, b, sel0>;
> +     c = VEC_PERM_EXPR <x, y, sel1>;
> +   into
> +     c = VEC_PERM_EXPR <a, b, sel0>;
> +   if sel0 combined with sel1 denotes extracting a contiguous subvector from
> +   the conceptually concatenated [ a | b ].  */
> +(simplify
> + (vec_perm (view_convert? (vec_perm @0 @4 VECTOR_CST@2))
> +        (view_convert? (vec_perm @5 @1 VECTOR_CST@2))
> +        VECTOR_CST@3)
> + (with
> +  {
> +    bool can_fold = false;
> +    unsigned HOST_WIDE_INT nelts;
> +    vec_perm_builder builder;
> +    if (TYPE_VECTOR_SUBPARTS (type).is_constant (&nelts)
> +     && tree_to_vec_perm_builder (&builder, @2))
> +      {
> +     /* Set can_fold to true when
> +        - sel0 is a vector of consecutive indices.
> +        - sel1 is composed of two parts of consecutive indices [ ia | ib ],
> +          selecting the elements originally in 'a' and 'b', respectively.  */
> +     vec_perm_indices sel0 (builder, 2, VECTOR_CST_NELTS (@2));
> +     unsigned int sel0_first_idx = sel0[0].to_constant ();
> +     unsigned int elt_size = vector_element_bits (TREE_TYPE (@0));
> +     unsigned int ia_size = tree_to_uhwi (TYPE_SIZE (type))
> +                            - elt_size * sel0_first_idx;
> +     unsigned int ib_start;
> +     if (sel0.series_p (0, 1, sel0_first_idx, 1)
> +         && multiple_p (ia_size, vector_element_bits (type), &ib_start)
> +         && tree_to_vec_perm_builder (&builder, @3))
> +       {
> +         /* Check if the ib part contains consecutive indices starting from
> +            'nelts + ib_start'.  */
> +         vec_perm_indices sel1 (builder, 2, VECTOR_CST_NELTS (@3));
> +         can_fold = sel1.series_p (ib_start, 1, nelts + ib_start, 1);
> +
> +         /* Check if the ia part contains indices [0 ... ib_start - 1].  */
> +         if (can_fold)
> +           for (unsigned int i = 0; i < ib_start; i++)
> +             if (sel1[i].to_constant () != i)
> +               {
> +                 can_fold = false;
> +                 break;
> +               }
> +       }
> +      }
> +  }
> +  (if (can_fold)
> +    (view_convert (vec_perm @0 @1 @2)))))
> +
>  #if GIMPLE
>  /* Simplify (a >> 1) + (b >> 1) + ((a | b) & 1) to .AVG_CEIL (a, b).
>     Similar for (a | b) - ((a ^ b) >> 1).  */
> diff --git a/gcc/testsuite/gcc.dg/fold-vecperm-1.c b/gcc/testsuite/gcc.dg/fold-vecperm-1.c
> new file mode 100644
> index 00000000000..5d4456b98b1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/fold-vecperm-1.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +typedef int v4si __attribute__((vector_size(16)));
> +typedef short v8hi __attribute__((vector_size(16)));
> +
> +typedef union {
> +  v4si s;
> +  v8hi h;
> +} int128;
> +
> +int128 concat (int128 a, int128 b) {
> +  int128 x, y, res;
> +  v4si zero = { 0, 0, 0, 0 };
> +  v4si sel0 = { 3, 4, 5, 6 };
> +  v8hi sel1 = { 0, 1, 10, 11, 12, 13, 14, 15 };
> +  x.s = __builtin_shuffle (a.s, zero, sel0);
> +  y.s = __builtin_shuffle (zero, b.s, sel0);
> +  res.h = __builtin_shuffle (x.h, y.h, sel1);
> +  return res;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 1 "optimized" } } */
> 

-- 
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to