Pengfei Li <[email protected]> writes:
> This patch implements the folding of a vector addition followed by a
> logical shift right by 1 (add + lsr #1) on AArch64 into an unsigned
> halving add, allowing GCC to emit NEON or SVE2 UHADD instructions.
>
> For example, this patch helps improve the codegen from:
> add v0.4s, v0.4s, v31.4s
> ushr v0.4s, v0.4s, 1
> to:
> uhadd v0.4s, v0.4s, v31.4s
>
> For NEON, vector operations are represented using generic mid-end
> operations, so new folding rules are added to match.pd. For SVE2, the
> operations are represented using built-in GIMPLE calls, so this
> optimization is implemented via gimple_folder.
>
> To ensure correctness, additional checks are introduced to guargntee
> that the operands to UHADD are vectors in which each element has its top
> bit cleared.
>
> This patch has been bootstrapped and regression tested on
> x86_64-linux-gnu and aarch64-linux-gnu.
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-sve-builtins-base.cc (find_sve_builtin_call):
> New helper function for finding and checking a GIMPLE call.
> (is_undef): Rewrite with find_sve_builtin_call.
> (class svlsr_impl): Implement the folding for SVE2.
> (FUNCTION): Check and fold the pattern.
> * match.pd: Add new rules to implement the folding for NEON.
> * tree.cc (top_bit_zero_vector_p): Add a new utility function for
> vector top bit zero check.
> * tree.h (top_bit_zero_vector_p): Add a function declaration.
The target-independent changes are out of my comfort area.
Cc:ing Richi for those.
But rather than top_bit_zero_vector_p, how about a more general
nonzero_element_bits? I've wanted something similar in the past.
I don't think we can use an unbounded recursive walk, since that
would become quadratic if we ever used it when optimising one
AND in a chain of ANDs. (And using this function for ANDs
seems plausible.) Maybe we should be handling the information
in a similar way to Ranger.
Rather than handle the built-in case entirely in target code, how about
having a target hook into nonzero_element_bits (or whatever replaces it)
for machine-dependent builtins?
Thanks,
Richard
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/acle/uhadd_1.c: New test.
> * gcc.target/aarch64/sve2/acle/general/uhadd_1.c: New test.
> ---
> .../aarch64/aarch64-sve-builtins-base.cc | 101 ++++++++++++++++--
> gcc/match.pd | 7 ++
> .../gcc.target/aarch64/acle/uhadd_1.c | 34 ++++++
> .../aarch64/sve2/acle/general/uhadd_1.c | 30 ++++++
> gcc/tree.cc | 30 ++++++
> gcc/tree.h | 4 +
> 6 files changed, 199 insertions(+), 7 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index b4396837c24..ce6da82bf81 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -43,6 +43,7 @@
> #include "aarch64-sve-builtins.h"
> #include "aarch64-sve-builtins-shapes.h"
> #include "aarch64-sve-builtins-base.h"
> +#include "aarch64-sve-builtins-sve2.h"
> #include "aarch64-sve-builtins-functions.h"
> #include "aarch64-builtins.h"
> #include "ssa.h"
> @@ -53,6 +54,23 @@ using namespace aarch64_sve;
>
> namespace {
>
> +/* Return gcall* if VAL is an SSA_NAME defined by the given SVE intrinsics
> call.
> + Otherwise return NULL. */
> +static gcall*
> +find_sve_builtin_call (tree val, const function_base *func)
> +{
> + if (TREE_CODE (val) == SSA_NAME)
> + {
> + gimple *def = SSA_NAME_DEF_STMT (val);
> + if (gcall *call = dyn_cast<gcall *> (def))
> + if (tree fndecl = gimple_call_fndecl (call))
> + if (const function_instance *instance = lookup_fndecl (fndecl))
> + if (instance->base == func)
> + return call;
> + }
> + return NULL;
> +}
> +
> /* Return true if VAL is an undefined value. */
> static bool
> is_undef (tree val)
> @@ -62,12 +80,7 @@ is_undef (tree val)
> if (ssa_undefined_value_p (val, false))
> return true;
>
> - gimple *def = SSA_NAME_DEF_STMT (val);
> - if (gcall *call = dyn_cast<gcall *> (def))
> - if (tree fndecl = gimple_call_fndecl (call))
> - if (const function_instance *instance = lookup_fndecl (fndecl))
> - if (instance->base == functions::svundef)
> - return true;
> + return (find_sve_builtin_call (val, functions::svundef) != NULL);
> }
> return false;
> }
> @@ -2088,6 +2101,80 @@ public:
> }
> };
>
> +class svlsr_impl : public rtx_code_function
> +{
> +private:
> + /* Return true if we know active lanes for use in T have top bit zero,
> where
> + pg_use tells which lanes are active for use. */
> + bool
> + active_lanes_top_bit_zero_p (tree t, tree pg_use) const
> + {
> + /* Return true if T itself is a vector in which each element has top bit
> + zero. */
> + if (top_bit_zero_vector_p (t))
> + return true;
> +
> + /* Return true if T is an AND op with a vector in which each element has
> + top bit zero. Note the predicate for AND op should cover active lanes
> + for use. */
> + gcall *and_call = find_sve_builtin_call (t, functions::svand);
> + if (and_call != NULL)
> + {
> + tree pg = gimple_call_arg (and_call, 0);
> + if (pg == pg_use || is_ptrue (pg, element_precision (t) / CHAR_BIT))
> + {
> + return top_bit_zero_vector_p (gimple_call_arg (and_call, 1))
> + || top_bit_zero_vector_p (gimple_call_arg (and_call, 2));
> + }
> + }
> +
> + return false;
> + }
> +
> +public:
> + CONSTEXPR svlsr_impl ()
> + : rtx_code_function (LSHIFTRT, LSHIFTRT) {}
> +
> + gimple*
> + fold (gimple_folder &f) const override
> + {
> + /* Below folding applies to SVE2 only. */
> + if (!TARGET_SVE2)
> + return NULL;
> +
> + /* Fold calls for patterns of LSR (ADD (x, y), 1) to an HADD (x, y). Note
> + LSR and ADD should share the same pg to fold. */
> + tree pg = gimple_call_arg (f.call, 0);
> + tree lsr_opnd = gimple_call_arg (f.call, 1);
> + tree lsr_dist = gimple_call_arg (f.call, 2);
> +
> + gcall *add_call;
> + if ((add_call = find_sve_builtin_call (lsr_opnd, functions::svadd)) !=
> NULL
> + && integer_onep (lsr_dist)
> + && gimple_call_arg (add_call, 0) == pg)
> + {
> + /* Check if we know all active lanes in the two addends of the add_call
> + have top bit zero, where pg indicates which lanes are active. */
> + tree addend1 = gimple_call_arg (add_call, 1);
> + tree addend2 = gimple_call_arg (add_call, 2);
> + if (active_lanes_top_bit_zero_p (addend1, pg)
> + && active_lanes_top_bit_zero_p (addend2, pg))
> + {
> + function_instance instance ("svhadd", functions::svhadd,
> + shapes::binary_opt_n, MODE_none,
> + f.type_suffix_ids, GROUP_none, f.pred,
> + FPM_unused);
> + gcall *call = f.redirect_call (instance);
> + gimple_call_set_arg (call, 1, addend1);
> + gimple_call_set_arg (call, 2, addend2);
> + return call;
> + }
> + }
> +
> + return NULL;
> + }
> +};
> +
> class svmad_impl : public function_base
> {
> public:
> @@ -3586,7 +3673,7 @@ FUNCTION (svldnt1, svldnt1_impl,)
> FUNCTION (svlen, svlen_impl,)
> FUNCTION (svlsl, svlsl_impl,)
> FUNCTION (svlsl_wide, shift_wide, (ASHIFT, UNSPEC_ASHIFT_WIDE))
> -FUNCTION (svlsr, rtx_code_function, (LSHIFTRT, LSHIFTRT))
> +FUNCTION (svlsr, svlsr_impl,)
> FUNCTION (svlsr_wide, shift_wide, (LSHIFTRT, UNSPEC_LSHIFTRT_WIDE))
> FUNCTION (svmad, svmad_impl,)
> FUNCTION (svmax, rtx_code_function, (SMAX, UMAX, UNSPEC_COND_FMAX,
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 0fe90a6edc4..02f70ea78e3 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -2176,6 +2176,13 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> (view_convert (rshift (view_convert:ntype @0) @1))
> (convert (rshift (convert:ntype @0) @1))))))
>
> +/* Fold ((x + y) >> 1 into IFN_AVG_FLOOR (x & y),
> + if we know x and y are vectors in which each element has top bit zero. */
> +(simplify
> + (rshift (plus:cs @0 @1) integer_onep)
> + (if (top_bit_zero_vector_p (@0) && top_bit_zero_vector_p (@1))
> + (IFN_AVG_FLOOR @0 @1)))
> +
> /* Try to fold (type) X op CST -> (type) (X op ((type-x) CST))
> when profitable.
> For bitwise binary operations apply operand conversions to the
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
> b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
> new file mode 100644
> index 00000000000..f1748a199ad
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
> @@ -0,0 +1,34 @@
> +/* Test if SIMD fused unsigned halving adds are generated */
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#include <arm_neon.h>
> +
> +#define FUSED_SIMD_UHADD(vectype, q, ts, mask) \
> + vectype simd_uhadd ## q ## _ ## ts ## _1 (vectype a) \
> + { \
> + vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
> + vectype v2 = vdup ## q ## _n_ ## ts (mask); \
> + return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
> + } \
> + \
> + vectype simd_uhadd ## q ## _ ## ts ## _2 (vectype a, vectype b) \
> + { \
> + vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
> + vectype v2 = vand ## q ## _ ## ts (b, vdup ## q ## _n_ ## ts (mask)); \
> + return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
> + }
> +
> +FUSED_SIMD_UHADD (uint8x8_t, , u8, 0x7f)
> +FUSED_SIMD_UHADD (uint8x16_t, q, u8, 0x7f)
> +FUSED_SIMD_UHADD (uint16x4_t, , u16, 0x7fff)
> +FUSED_SIMD_UHADD (uint16x8_t, q, u16, 0x7fff)
> +FUSED_SIMD_UHADD (uint32x2_t, , u32, 0x7fffffff)
> +FUSED_SIMD_UHADD (uint32x4_t, q, u32, 0x7fffffff)
> +
> +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8b,} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.16b,} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4h,} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8h,} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.2s,} 2 } } */
> +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4s,} 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c
> new file mode 100644
> index 00000000000..9a219eb5086
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c
> @@ -0,0 +1,30 @@
> +/* Test if SVE2 fused unsigned halving adds are generated */
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#include <arm_sve.h>
> +
> +#define FUSED_SVE2_UHADD(vectype, ts, tspg, mask) \
> + vectype sve2_uhadd ## _ ## ts ## _1 (svbool_t pg, vectype a) \
> + { \
> + vectype v1 = svdup_ ## ts (mask); \
> + vectype v2 = svand_m (svptrue_ ## tspg (), a, svdup_ ## ts (mask)); \
> + return svlsr_x(pg, svadd_x (pg, v1, v2), svdup_ ## ts (1)); \
> + } \
> + \
> + vectype sve2_uhadd ## _ ## ts ## _2 (svbool_t pg, vectype a, vectype b) \
> + { \
> + vectype v1 = svand_m (pg, a, svdup_ ## ts (mask)); \
> + vectype v2 = svand_m (pg, b, svdup_ ## ts (mask)); \
> + return svlsr_m(pg, svadd_m (pg, v1, v2), svdup_ ## ts (1)); \
> + }
> +
> +FUSED_SVE2_UHADD (svuint8_t, u8, b8, 0x7f);
> +FUSED_SVE2_UHADD (svuint16_t, u16, b16, 0x7fff);
> +FUSED_SVE2_UHADD (svuint32_t, u32, b32, 0x7fffffff);
> +FUSED_SVE2_UHADD (svuint64_t, u64, b64, 0x7fffffffffffffff);
> +
> +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.b, p[0-7]/m,} 2 } }
> */
> +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.h, p[0-7]/m,} 2 } }
> */
> +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.s, p[0-7]/m,} 2 } }
> */
> +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.d, p[0-7]/m,} 2 } }
> */
> diff --git a/gcc/tree.cc b/gcc/tree.cc
> index eccfcc89da4..bdee2a93a44 100644
> --- a/gcc/tree.cc
> +++ b/gcc/tree.cc
> @@ -10756,6 +10756,36 @@ uniform_integer_cst_p (tree t)
> return NULL_TREE;
> }
>
> +/* Checks to see if T is a vector in which each element has top bit zero then
> + return T otherwise NULL_TREE. */
> +
> +tree
> +top_bit_zero_vector_p (tree t)
> +{
> + if (!VECTOR_TYPE_P (TREE_TYPE (t)))
> + return NULL_TREE;
> +
> + tree elem = uniform_vector_p (t);
> + if (tree_fits_uhwi_p (elem))
> + {
> + unsigned int prec = element_precision (t);
> + if ((tree_to_uhwi (elem) & (HOST_WIDE_INT_1U << (prec - 1))) == 0)
> + return t;
> + }
> +
> + if (TREE_CODE (t) == SSA_NAME)
> + {
> + gimple *def = SSA_NAME_DEF_STMT (t);
> + if (is_gimple_assign (def)
> + && gimple_assign_rhs_code (def) == BIT_AND_EXPR
> + && (top_bit_zero_vector_p (gimple_assign_rhs1 (def)) != NULL_TREE
> + || top_bit_zero_vector_p (gimple_assign_rhs2 (def)) != NULL_TREE))
> + return t;
> + }
> +
> + return NULL_TREE;
> +}
> +
> /* Checks to see if T is a constant or a constant vector and if each element
> E
> adheres to ~E + 1 == pow2 then return ~E otherwise NULL_TREE. */
>
> diff --git a/gcc/tree.h b/gcc/tree.h
> index 99f26177628..6dfbbdc1aea 100644
> --- a/gcc/tree.h
> +++ b/gcc/tree.h
> @@ -5249,6 +5249,10 @@ extern tree uniform_vector_p (const_tree);
>
> extern tree uniform_integer_cst_p (tree);
>
> +/* Checks to see if T is a vector in which each element has top bit zero then
> + return T otherwise NULL_TREE. */
> +extern tree top_bit_zero_vector_p (tree t);
> +
> extern int single_nonzero_element (const_tree);
>
> /* Given a CONSTRUCTOR CTOR, return the element values as a vector. */