Pengfei Li <pengfei....@arm.com> writes:
> This patch implements the folding of a vector addition followed by a
> logical shift right by 1 (add + lsr #1) on AArch64 into an unsigned
> halving add, allowing GCC to emit NEON or SVE2 UHADD instructions.
>
> For example, this patch helps improve the codegen from:
>       add     v0.4s, v0.4s, v31.4s
>       ushr    v0.4s, v0.4s, 1
> to:
>       uhadd   v0.4s, v0.4s, v31.4s
>
> For NEON, vector operations are represented using generic mid-end
> operations, so new folding rules are added to match.pd. For SVE2, the
> operations are represented using built-in GIMPLE calls, so this
> optimization is implemented via gimple_folder.
>
> To ensure correctness, additional checks are introduced to guarantee
> that the operands to UHADD are vectors in which each element has its top
> bit cleared.
>
> This patch has been bootstrapped and regression tested on
> x86_64-linux-gnu and aarch64-linux-gnu.
>
> gcc/ChangeLog:
>
>       * config/aarch64/aarch64-sve-builtins-base.cc (find_sve_builtin_call):
>       New helper function for finding and checking a GIMPLE call.
>       (is_undef): Rewrite with find_sve_builtin_call.
>       (class svlsr_impl): Implement the folding for SVE2.
>       (FUNCTION): Check and fold the pattern.
>       * match.pd: Add new rules to implement the folding for NEON.
>       * tree.cc (top_bit_zero_vector_p): Add a new utility function for
>       vector top bit zero check.
>       * tree.h (top_bit_zero_vector_p): Add a function declaration.

The target-independent changes are out of my comfort area.
Cc:ing Richi for those.

But rather than top_bit_zero_vector_p, how about a more general
nonzero_element_bits?  I've wanted something similar in the past.

I don't think we can use an unbounded recursive walk, since that
would become quadratic if we ever used it when optimising one
AND in a chain of ANDs.  (And using this function for ANDs
seems plausible.)  Maybe we should be handling the information
in a similar way to Ranger.
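
To make that concrete, here is a rough sketch of the kind of interface
I mean.  The name, the wide_int return convention and the depth limit
are all illustrative rather than an existing API, and a Ranger-style
cache would avoid recomputation better than a hard depth cut-off:

  /* Return a mask of the bits that might be nonzero in each element of
     vector T.  Follow at most DEPTH defining statements so that a chain
     of ANDs is walked in linear rather than quadratic time.  */
  static wide_int
  nonzero_element_bits (tree t, unsigned int depth = 4)
  {
    unsigned int prec = element_precision (t);

    /* A uniform constant vector gives us the bits directly.  */
    if (VECTOR_TYPE_P (TREE_TYPE (t)))
      if (tree elt = uniform_vector_p (t))
        if (TREE_CODE (elt) == INTEGER_CST)
          return wi::to_wide (elt, prec);

    if (depth > 0 && TREE_CODE (t) == SSA_NAME)
      {
        gimple *def = SSA_NAME_DEF_STMT (t);
        if (is_gimple_assign (def)
            && gimple_assign_rhs_code (def) == BIT_AND_EXPR)
          return (nonzero_element_bits (gimple_assign_rhs1 (def), depth - 1)
                  & nonzero_element_bits (gimple_assign_rhs2 (def), depth - 1));
      }

    /* Nothing known: assume every bit could be set.  */
    return wi::mask (prec, false, prec);
  }

The top-bit-zero test in the patch then becomes a check that bit
PREC - 1 is clear in the returned mask, while other users (such as the
AND folding mentioned above) get the full bit information.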

Rather than handle the built-in case entirely in target code, how about
having a target hook into nonzero_element_bits (or whatever replaces it)
for machine-dependent builtins?
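
Purely as an illustration of that suggestion, the BIT_AND_EXPR case in
the sketch above could be joined by a builtin-call case along these
lines (the hook name "nonzero_element_bits_for_call" is made up here;
no such hook exists today):

    if (depth > 0 && TREE_CODE (t) == SSA_NAME)
      if (gcall *call = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (t)))
        if (gimple_call_builtin_p (call, BUILT_IN_MD))
          /* Let the target analyse machine-dependent calls, e.g. the
             SVE svand and svdup intrinsics handled in the patch.  */
          return targetm.nonzero_element_bits_for_call (call, depth - 1);

The aarch64 implementation of such a hook could then reuse the
lookup_fndecl-based matching from the patch rather than open-coding
the svand case inside svlsr_impl.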

Thanks,
Richard

>
> gcc/testsuite/ChangeLog:
>
>       * gcc.target/aarch64/acle/uhadd_1.c: New test.
>       * gcc.target/aarch64/sve2/acle/general/uhadd_1.c: New test.
> ---
>  .../aarch64/aarch64-sve-builtins-base.cc      | 101 ++++++++++++++++--
>  gcc/match.pd                                  |   7 ++
>  .../gcc.target/aarch64/acle/uhadd_1.c         |  34 ++++++
>  .../aarch64/sve2/acle/general/uhadd_1.c       |  30 ++++++
>  gcc/tree.cc                                   |  30 ++++++
>  gcc/tree.h                                    |   4 +
>  6 files changed, 199 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index b4396837c24..ce6da82bf81 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -43,6 +43,7 @@
>  #include "aarch64-sve-builtins.h"
>  #include "aarch64-sve-builtins-shapes.h"
>  #include "aarch64-sve-builtins-base.h"
> +#include "aarch64-sve-builtins-sve2.h"
>  #include "aarch64-sve-builtins-functions.h"
>  #include "aarch64-builtins.h"
>  #include "ssa.h"
> @@ -53,6 +54,23 @@ using namespace aarch64_sve;
>  
>  namespace {
>  
> +/* Return gcall* if VAL is an SSA_NAME defined by the given SVE intrinsics call.
> +   Otherwise return NULL.  */
> +static gcall*
> +find_sve_builtin_call (tree val, const function_base *func)
> +{
> +  if (TREE_CODE (val) == SSA_NAME)
> +    {
> +      gimple *def = SSA_NAME_DEF_STMT (val);
> +      if (gcall *call = dyn_cast<gcall *> (def))
> +     if (tree fndecl = gimple_call_fndecl (call))
> +       if (const function_instance *instance = lookup_fndecl (fndecl))
> +         if (instance->base == func)
> +           return call;
> +    }
> +  return NULL;
> +}
> +
>  /* Return true if VAL is an undefined value.  */
>  static bool
>  is_undef (tree val)
> @@ -62,12 +80,7 @@ is_undef (tree val)
>        if (ssa_undefined_value_p (val, false))
>       return true;
>  
> -      gimple *def = SSA_NAME_DEF_STMT (val);
> -      if (gcall *call = dyn_cast<gcall *> (def))
> -     if (tree fndecl = gimple_call_fndecl (call))
> -       if (const function_instance *instance = lookup_fndecl (fndecl))
> -         if (instance->base == functions::svundef)
> -           return true;
> +      return (find_sve_builtin_call (val, functions::svundef) != NULL);
>      }
>    return false;
>  }
> @@ -2088,6 +2101,80 @@ public:
>    }
>  };
>  
> +class svlsr_impl : public rtx_code_function
> +{
> +private:
> +  /* Return true if we know active lanes for use in T have top bit zero, where
> +     pg_use tells which lanes are active for use.  */
> +  bool
> +  active_lanes_top_bit_zero_p (tree t, tree pg_use) const
> +  {
> +    /* Return true if T itself is a vector in which each element has top bit
> +       zero.  */
> +    if (top_bit_zero_vector_p (t))
> +      return true;
> +
> +    /* Return true if T is an AND op with a vector in which each element has
> +       top bit zero.  Note the predicate for AND op should cover active lanes
> +       for use.  */
> +    gcall *and_call = find_sve_builtin_call (t, functions::svand);
> +    if (and_call != NULL)
> +      {
> +     tree pg = gimple_call_arg (and_call, 0);
> +     if (pg == pg_use || is_ptrue (pg, element_precision (t) / CHAR_BIT))
> +       {
> +         return top_bit_zero_vector_p (gimple_call_arg (and_call, 1))
> +             || top_bit_zero_vector_p (gimple_call_arg (and_call, 2));
> +       }
> +      }
> +
> +    return false;
> +  }
> +
> +public:
> +  CONSTEXPR svlsr_impl ()
> +    : rtx_code_function (LSHIFTRT, LSHIFTRT) {}
> +
> +  gimple*
> +  fold (gimple_folder &f) const override
> +  {
> +    /* Below folding applies to SVE2 only.  */
> +    if (!TARGET_SVE2)
> +      return NULL;
> +
> +    /* Fold calls for patterns of LSR (ADD (x, y), 1) to an HADD (x, y). Note
> +       LSR and ADD should share the same pg to fold.  */
> +    tree pg = gimple_call_arg (f.call, 0);
> +    tree lsr_opnd = gimple_call_arg (f.call, 1);
> +    tree lsr_dist = gimple_call_arg (f.call, 2);
> +
> +    gcall *add_call;
> +    if ((add_call = find_sve_builtin_call (lsr_opnd, functions::svadd)) != NULL
> +     && integer_onep (lsr_dist)
> +     && gimple_call_arg (add_call, 0) == pg)
> +      {
> +     /* Check if we know all active lanes in the two addends of the add_call
> +        have top bit zero, where pg indicates which lanes are active.  */
> +     tree addend1 = gimple_call_arg (add_call, 1);
> +     tree addend2 = gimple_call_arg (add_call, 2);
> +     if (active_lanes_top_bit_zero_p (addend1, pg)
> +         && active_lanes_top_bit_zero_p (addend2, pg))
> +       {
> +         function_instance instance ("svhadd", functions::svhadd,
> +                                     shapes::binary_opt_n, MODE_none,
> +                                     f.type_suffix_ids, GROUP_none, f.pred,
> +                                     FPM_unused);
> +         gcall *call = f.redirect_call (instance);
> +         gimple_call_set_arg (call, 1, addend1);
> +         gimple_call_set_arg (call, 2, addend2);
> +         return call;
> +       }
> +      }
> +
> +    return NULL;
> +  }
> +};
> +
>  class svmad_impl : public function_base
>  {
>  public:
> @@ -3586,7 +3673,7 @@ FUNCTION (svldnt1, svldnt1_impl,)
>  FUNCTION (svlen, svlen_impl,)
>  FUNCTION (svlsl, svlsl_impl,)
>  FUNCTION (svlsl_wide, shift_wide, (ASHIFT, UNSPEC_ASHIFT_WIDE))
> -FUNCTION (svlsr, rtx_code_function, (LSHIFTRT, LSHIFTRT))
> +FUNCTION (svlsr, svlsr_impl,)
>  FUNCTION (svlsr_wide, shift_wide, (LSHIFTRT, UNSPEC_LSHIFTRT_WIDE))
>  FUNCTION (svmad, svmad_impl,)
>  FUNCTION (svmax, rtx_code_function, (SMAX, UMAX, UNSPEC_COND_FMAX,
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 0fe90a6edc4..02f70ea78e3 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -2176,6 +2176,13 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>      (view_convert (rshift (view_convert:ntype @0) @1))
>      (convert (rshift (convert:ntype @0) @1))))))
>  
> +/* Fold (x + y) >> 1 into IFN_AVG_FLOOR (x, y),
> +   if we know x and y are vectors in which each element has top bit zero.  */
> +(simplify
> + (rshift (plus:cs @0 @1) integer_onep)
> + (if (top_bit_zero_vector_p (@0) && top_bit_zero_vector_p (@1))
> +  (IFN_AVG_FLOOR @0 @1)))
> +
>  /* Try to fold (type) X op CST -> (type) (X op ((type-x) CST))
>     when profitable.
>     For bitwise binary operations apply operand conversions to the
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
> new file mode 100644
> index 00000000000..f1748a199ad
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
> @@ -0,0 +1,34 @@
> +/* Test if SIMD fused unsigned halving adds are generated */
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#include <arm_neon.h>
> +
> +#define FUSED_SIMD_UHADD(vectype, q, ts, mask) \
> +  vectype simd_uhadd ## q ## _ ## ts ## _1 (vectype a) \
> +  { \
> +    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
> +    vectype v2 = vdup ## q ## _n_ ## ts (mask); \
> +    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
> +  } \
> +  \
> +  vectype simd_uhadd ## q ## _ ## ts ## _2 (vectype a, vectype b) \
> +  { \
> +    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
> +    vectype v2 = vand ## q ## _ ## ts (b, vdup ## q ## _n_ ## ts (mask)); \
> +    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
> +  }
> +
> +FUSED_SIMD_UHADD (uint8x8_t, , u8, 0x7f)
> +FUSED_SIMD_UHADD (uint8x16_t, q, u8, 0x7f)
> +FUSED_SIMD_UHADD (uint16x4_t, , u16, 0x7fff)
> +FUSED_SIMD_UHADD (uint16x8_t, q, u16, 0x7fff)
> +FUSED_SIMD_UHADD (uint32x2_t, , u32, 0x7fffffff)
> +FUSED_SIMD_UHADD (uint32x4_t, q, u32, 0x7fffffff)
> +
> +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8b,} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.16b,} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4h,} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8h,} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.2s,} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4s,} 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c
> new file mode 100644
> index 00000000000..9a219eb5086
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c
> @@ -0,0 +1,30 @@
> +/* Test if SVE2 fused unsigned halving adds are generated */
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#include <arm_sve.h>
> +
> +#define FUSED_SVE2_UHADD(vectype, ts, tspg, mask) \
> +  vectype sve2_uhadd ## _ ## ts ## _1 (svbool_t pg, vectype a) \
> +  { \
> +    vectype v1 = svdup_ ## ts (mask); \
> +    vectype v2 = svand_m (svptrue_ ## tspg (), a, svdup_ ## ts (mask)); \
> +    return svlsr_x(pg, svadd_x (pg, v1, v2), svdup_ ## ts (1)); \
> +  } \
> +  \
> +  vectype sve2_uhadd ## _ ## ts ## _2 (svbool_t pg, vectype a, vectype b) \
> +  { \
> +    vectype v1 = svand_m (pg, a, svdup_ ## ts (mask)); \
> +    vectype v2 = svand_m (pg, b, svdup_ ## ts (mask)); \
> +    return svlsr_m(pg, svadd_m (pg, v1, v2), svdup_ ## ts (1)); \
> +  }
> +
> +FUSED_SVE2_UHADD (svuint8_t, u8, b8, 0x7f);
> +FUSED_SVE2_UHADD (svuint16_t, u16, b16, 0x7fff);
> +FUSED_SVE2_UHADD (svuint32_t, u32, b32, 0x7fffffff);
> +FUSED_SVE2_UHADD (svuint64_t, u64, b64, 0x7fffffffffffffff);
> +
> +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.b, p[0-7]/m,} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.h, p[0-7]/m,} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.s, p[0-7]/m,} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.d, p[0-7]/m,} 2 } } */
> diff --git a/gcc/tree.cc b/gcc/tree.cc
> index eccfcc89da4..bdee2a93a44 100644
> --- a/gcc/tree.cc
> +++ b/gcc/tree.cc
> @@ -10756,6 +10756,36 @@ uniform_integer_cst_p (tree t)
>    return NULL_TREE;
>  }
>  
> +/* Checks to see if T is a vector in which each element has top bit zero then
> +   return T otherwise NULL_TREE.  */
> +
> +tree
> +top_bit_zero_vector_p (tree t)
> +{
> +  if (!VECTOR_TYPE_P (TREE_TYPE (t)))
> +    return NULL_TREE;
> +
> +  tree elem = uniform_vector_p (t);
> +  if (tree_fits_uhwi_p (elem))
> +    {
> +      unsigned int prec = element_precision (t);
> +      if ((tree_to_uhwi (elem) & (HOST_WIDE_INT_1U << (prec - 1))) == 0)
> +     return t;
> +    }
> +
> +  if (TREE_CODE (t) == SSA_NAME)
> +    {
> +      gimple *def = SSA_NAME_DEF_STMT (t);
> +      if (is_gimple_assign (def)
> +       && gimple_assign_rhs_code (def) == BIT_AND_EXPR
> +       && (top_bit_zero_vector_p (gimple_assign_rhs1 (def)) != NULL_TREE
> +           || top_bit_zero_vector_p (gimple_assign_rhs2 (def)) != NULL_TREE))
> +     return t;
> +    }
> +
> +  return NULL_TREE;
> +}
> +
>  /* Checks to see if T is a constant or a constant vector and if each element E
>     adheres to ~E + 1 == pow2 then return ~E otherwise NULL_TREE.  */
>  
> diff --git a/gcc/tree.h b/gcc/tree.h
> index 99f26177628..6dfbbdc1aea 100644
> --- a/gcc/tree.h
> +++ b/gcc/tree.h
> @@ -5249,6 +5249,10 @@ extern tree uniform_vector_p (const_tree);
>  
>  extern tree uniform_integer_cst_p (tree);
>  
> +/* Checks to see if T is a vector in which each element has top bit zero then
> +   return T otherwise NULL_TREE.  */
> +extern tree top_bit_zero_vector_p (tree t);
> +
>  extern int single_nonzero_element (const_tree);
>  
>  /* Given a CONSTRUCTOR CTOR, return the element values as a vector.  */
