Pengfei Li <pengfei....@arm.com> writes: > This patch implements the folding of a vector addition followed by a > logical shift right by 1 (add + lsr #1) on AArch64 into an unsigned > halving add, allowing GCC to emit NEON or SVE2 UHADD instructions. > > For example, this patch helps improve the codegen from: > add v0.4s, v0.4s, v31.4s > ushr v0.4s, v0.4s, 1 > to: > uhadd v0.4s, v0.4s, v31.4s > > For NEON, vector operations are represented using generic mid-end > operations, so new folding rules are added to match.pd. For SVE2, the > operations are represented using built-in GIMPLE calls, so this > optimization is implemented via gimple_folder. > > To ensure correctness, additional checks are introduced to guarantee > that the operands to UHADD are vectors in which each element has its top > bit cleared. > > This patch has been bootstrapped and regression tested on > x86_64-linux-gnu and aarch64-linux-gnu. > > gcc/ChangeLog: > > * config/aarch64/aarch64-sve-builtins-base.cc (find_sve_builtin_call): > New helper function for finding and checking a GIMPLE call. > (is_undef): Rewrite with find_sve_builtin_call. > (class svlsr_impl): Implement the folding for SVE2. > (FUNCTION): Check and fold the pattern. > * match.pd: Add new rules to implement the folding for NEON. > * tree.cc (top_bit_zero_vector_p): Add a new utility function for > vector top bit zero check. > * tree.h (top_bit_zero_vector_p): Add a function declaration.
The target-independent changes are out of my comfort area. Cc:ing Richi for those. But rather than top_bit_zero_vector_p, how about a more general nonzero_element_bits? I've wanted something similar in the past. I don't think we can use an unbounded recursive walk, since that would become quadratic if we ever used it when optimising one AND in a chain of ANDs. (And using this function for ANDs seems plausible.) Maybe we should be handling the information in a similar way to Ranger. Rather than handle the built-in case entirely in target code, how about having a target hook into nonzero_element_bits (or whatever replaces it) for machine-dependent builtins? Thanks, Richard > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/acle/uhadd_1.c: New test. > * gcc.target/aarch64/sve2/acle/general/uhadd_1.c: New test. > --- > .../aarch64/aarch64-sve-builtins-base.cc | 101 ++++++++++++++++-- > gcc/match.pd | 7 ++ > .../gcc.target/aarch64/acle/uhadd_1.c | 34 ++++++ > .../aarch64/sve2/acle/general/uhadd_1.c | 30 ++++++ > gcc/tree.cc | 30 ++++++ > gcc/tree.h | 4 + > 6 files changed, 199 insertions(+), 7 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c > create mode 100644 > gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > index b4396837c24..ce6da82bf81 100644 > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > @@ -43,6 +43,7 @@ > #include "aarch64-sve-builtins.h" > #include "aarch64-sve-builtins-shapes.h" > #include "aarch64-sve-builtins-base.h" > +#include "aarch64-sve-builtins-sve2.h" > #include "aarch64-sve-builtins-functions.h" > #include "aarch64-builtins.h" > #include "ssa.h" > @@ -53,6 +54,23 @@ using namespace aarch64_sve; > > namespace { > > +/* Return gcall* if VAL is an SSA_NAME defined by the given SVE intrinsics > call. > + Otherwise return NULL. */ > +static gcall* > +find_sve_builtin_call (tree val, const function_base *func) > +{ > + if (TREE_CODE (val) == SSA_NAME) > + { > + gimple *def = SSA_NAME_DEF_STMT (val); > + if (gcall *call = dyn_cast<gcall *> (def)) > + if (tree fndecl = gimple_call_fndecl (call)) > + if (const function_instance *instance = lookup_fndecl (fndecl)) > + if (instance->base == func) > + return call; > + } > + return NULL; > +} > + > /* Return true if VAL is an undefined value. */ > static bool > is_undef (tree val) > @@ -62,12 +80,7 @@ is_undef (tree val) > if (ssa_undefined_value_p (val, false)) > return true; > > - gimple *def = SSA_NAME_DEF_STMT (val); > - if (gcall *call = dyn_cast<gcall *> (def)) > - if (tree fndecl = gimple_call_fndecl (call)) > - if (const function_instance *instance = lookup_fndecl (fndecl)) > - if (instance->base == functions::svundef) > - return true; > + return (find_sve_builtin_call (val, functions::svundef) != NULL); > } > return false; > } > @@ -2088,6 +2101,80 @@ public: > } > }; > > +class svlsr_impl : public rtx_code_function > +{ > +private: > + /* Return true if we know active lanes for use in T have top bit zero, > where > + pg_use tells which lanes are active for use. */ > + bool > + active_lanes_top_bit_zero_p (tree t, tree pg_use) const > + { > + /* Return true if T itself is a vector in which each element has top bit > + zero. */ > + if (top_bit_zero_vector_p (t)) > + return true; > + > + /* Return true if T is an AND op with a vector in which each element has > + top bit zero. 
Note the predicate for AND op should cover active lanes > + for use. */ > + gcall *and_call = find_sve_builtin_call (t, functions::svand); > + if (and_call != NULL) > + { > + tree pg = gimple_call_arg (and_call, 0); > + if (pg == pg_use || is_ptrue (pg, element_precision (t) / CHAR_BIT)) > + { > + return top_bit_zero_vector_p (gimple_call_arg (and_call, 1)) > + || top_bit_zero_vector_p (gimple_call_arg (and_call, 2)); > + } > + } > + > + return false; > + } > + > +public: > + CONSTEXPR svlsr_impl () > + : rtx_code_function (LSHIFTRT, LSHIFTRT) {} > + > + gimple* > + fold (gimple_folder &f) const override > + { > + /* Below folding applies to SVE2 only. */ > + if (!TARGET_SVE2) > + return NULL; > + > + /* Fold calls for patterns of LSR (ADD (x, y), 1) to an HADD (x, y). Note > + LSR and ADD should share the same pg to fold. */ > + tree pg = gimple_call_arg (f.call, 0); > + tree lsr_opnd = gimple_call_arg (f.call, 1); > + tree lsr_dist = gimple_call_arg (f.call, 2); > + > + gcall *add_call; > + if ((add_call = find_sve_builtin_call (lsr_opnd, functions::svadd)) != > NULL > + && integer_onep (lsr_dist) > + && gimple_call_arg (add_call, 0) == pg) > + { > + /* Check if we know all active lanes in the two addends of the add_call > + have top bit zero, where pg indicates which lanes are active. */ > + tree addend1 = gimple_call_arg (add_call, 1); > + tree addend2 = gimple_call_arg (add_call, 2); > + if (active_lanes_top_bit_zero_p (addend1, pg) > + && active_lanes_top_bit_zero_p (addend2, pg)) > + { > + function_instance instance ("svhadd", functions::svhadd, > + shapes::binary_opt_n, MODE_none, > + f.type_suffix_ids, GROUP_none, f.pred, > + FPM_unused); > + gcall *call = f.redirect_call (instance); > + gimple_call_set_arg (call, 1, addend1); > + gimple_call_set_arg (call, 2, addend2); > + return call; > + } > + } > + > + return NULL; > + } > +}; > + > class svmad_impl : public function_base > { > public: > @@ -3586,7 +3673,7 @@ FUNCTION (svldnt1, svldnt1_impl,) > FUNCTION (svlen, svlen_impl,) > FUNCTION (svlsl, svlsl_impl,) > FUNCTION (svlsl_wide, shift_wide, (ASHIFT, UNSPEC_ASHIFT_WIDE)) > -FUNCTION (svlsr, rtx_code_function, (LSHIFTRT, LSHIFTRT)) > +FUNCTION (svlsr, svlsr_impl,) > FUNCTION (svlsr_wide, shift_wide, (LSHIFTRT, UNSPEC_LSHIFTRT_WIDE)) > FUNCTION (svmad, svmad_impl,) > FUNCTION (svmax, rtx_code_function, (SMAX, UMAX, UNSPEC_COND_FMAX, > diff --git a/gcc/match.pd b/gcc/match.pd > index 0fe90a6edc4..02f70ea78e3 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -2176,6 +2176,13 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > (view_convert (rshift (view_convert:ntype @0) @1)) > (convert (rshift (convert:ntype @0) @1)))))) > > +/* Fold (x + y) >> 1 into IFN_AVG_FLOOR (x, y), > + if we know x and y are vectors in which each element has top bit zero. */ > +(simplify > + (rshift (plus:cs @0 @1) integer_onep) > + (if (top_bit_zero_vector_p (@0) && top_bit_zero_vector_p (@1)) > + (IFN_AVG_FLOOR @0 @1))) > + > /* Try to fold (type) X op CST -> (type) (X op ((type-x) CST)) > when profitable.
> For bitwise binary operations apply operand conversions to the > diff --git a/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c > b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c > new file mode 100644 > index 00000000000..f1748a199ad > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c > @@ -0,0 +1,34 @@ > +/* Test if SIMD fused unsigned halving adds are generated */ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > + > +#include <arm_neon.h> > + > +#define FUSED_SIMD_UHADD(vectype, q, ts, mask) \ > + vectype simd_uhadd ## q ## _ ## ts ## _1 (vectype a) \ > + { \ > + vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \ > + vectype v2 = vdup ## q ## _n_ ## ts (mask); \ > + return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \ > + } \ > + \ > + vectype simd_uhadd ## q ## _ ## ts ## _2 (vectype a, vectype b) \ > + { \ > + vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \ > + vectype v2 = vand ## q ## _ ## ts (b, vdup ## q ## _n_ ## ts (mask)); \ > + return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \ > + } > + > +FUSED_SIMD_UHADD (uint8x8_t, , u8, 0x7f) > +FUSED_SIMD_UHADD (uint8x16_t, q, u8, 0x7f) > +FUSED_SIMD_UHADD (uint16x4_t, , u16, 0x7fff) > +FUSED_SIMD_UHADD (uint16x8_t, q, u16, 0x7fff) > +FUSED_SIMD_UHADD (uint32x2_t, , u32, 0x7fffffff) > +FUSED_SIMD_UHADD (uint32x4_t, q, u32, 0x7fffffff) > + > +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8b,} 2 } } */ > +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.16b,} 2 } } */ > +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4h,} 2 } } */ > +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8h,} 2 } } */ > +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.2s,} 2 } } */ > +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4s,} 2 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c > b/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c > new file mode 100644 > index 00000000000..9a219eb5086 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c > @@ -0,0 +1,30 @@ > +/* Test if SVE2 fused unsigned halving adds are generated */ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > + > +#include <arm_sve.h> > + > +#define FUSED_SVE2_UHADD(vectype, ts, tspg, mask) \ > + vectype sve2_uhadd ## _ ## ts ## _1 (svbool_t pg, vectype a) \ > + { \ > + vectype v1 = svdup_ ## ts (mask); \ > + vectype v2 = svand_m (svptrue_ ## tspg (), a, svdup_ ## ts (mask)); \ > + return svlsr_x(pg, svadd_x (pg, v1, v2), svdup_ ## ts (1)); \ > + } \ > + \ > + vectype sve2_uhadd ## _ ## ts ## _2 (svbool_t pg, vectype a, vectype b) \ > + { \ > + vectype v1 = svand_m (pg, a, svdup_ ## ts (mask)); \ > + vectype v2 = svand_m (pg, b, svdup_ ## ts (mask)); \ > + return svlsr_m(pg, svadd_m (pg, v1, v2), svdup_ ## ts (1)); \ > + } > + > +FUSED_SVE2_UHADD (svuint8_t, u8, b8, 0x7f); > +FUSED_SVE2_UHADD (svuint16_t, u16, b16, 0x7fff); > +FUSED_SVE2_UHADD (svuint32_t, u32, b32, 0x7fffffff); > +FUSED_SVE2_UHADD (svuint64_t, u64, b64, 0x7fffffffffffffff); > + > +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.b, p[0-7]/m,} 2 } } > */ > +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.h, p[0-7]/m,} 2 } } > */ > +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.s, p[0-7]/m,} 2 } } > */ > +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.d, p[0-7]/m,} 2 } } > */ > diff --git a/gcc/tree.cc b/gcc/tree.cc > index eccfcc89da4..bdee2a93a44 100644 > 
--- a/gcc/tree.cc > +++ b/gcc/tree.cc > @@ -10756,6 +10756,36 @@ uniform_integer_cst_p (tree t) > return NULL_TREE; > } > > +/* Checks to see if T is a vector in which each element has top bit zero then > + return T otherwise NULL_TREE. */ > + > +tree > +top_bit_zero_vector_p (tree t) > +{ > + if (!VECTOR_TYPE_P (TREE_TYPE (t))) > + return NULL_TREE; > + > + tree elem = uniform_vector_p (t); > + if (tree_fits_uhwi_p (elem)) > + { > + unsigned int prec = element_precision (t); > + if ((tree_to_uhwi (elem) & (HOST_WIDE_INT_1U << (prec - 1))) == 0) > + return t; > + } > + > + if (TREE_CODE (t) == SSA_NAME) > + { > + gimple *def = SSA_NAME_DEF_STMT (t); > + if (is_gimple_assign (def) > + && gimple_assign_rhs_code (def) == BIT_AND_EXPR > + && (top_bit_zero_vector_p (gimple_assign_rhs1 (def)) != NULL_TREE > + || top_bit_zero_vector_p (gimple_assign_rhs2 (def)) != NULL_TREE)) > + return t; > + } > + > + return NULL_TREE; > +} > + > /* Checks to see if T is a constant or a constant vector and if each element > E > adheres to ~E + 1 == pow2 then return ~E otherwise NULL_TREE. */ > > diff --git a/gcc/tree.h b/gcc/tree.h > index 99f26177628..6dfbbdc1aea 100644 > --- a/gcc/tree.h > +++ b/gcc/tree.h > @@ -5249,6 +5249,10 @@ extern tree uniform_vector_p (const_tree); > > extern tree uniform_integer_cst_p (tree); > > +/* Checks to see if T is a vector in which each element has top bit zero then > + return T otherwise NULL_TREE. */ > +extern tree top_bit_zero_vector_p (tree t); > + > extern int single_nonzero_element (const_tree); > > /* Given a CONSTRUCTOR CTOR, return the element values as a vector. */
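As an aside, a quick way to convince oneself of the correctness condition the patch relies on: the wrapping add + lsr #1 sequence only matches UHADD/AVG_FLOOR semantics (a sum computed without overflow, then halved) when the addition cannot wrap, which is exactly what the top-bit-zero requirement guarantees. A self-contained scalar sketch, not part of the patch, that checks this exhaustively for 8-bit values:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  for (unsigned int x = 0; x < 256; x++)
    for (unsigned int y = 0; y < 256; y++)
      {
        /* What add + lsr #1 computes: the sum wraps modulo 2^8.  */
        uint8_t wrapped = (uint8_t) ((uint8_t) (x + y) >> 1);
        /* What UHADD/AVG_FLOOR computes: the sum does not wrap.  */
        uint8_t hadd = (uint8_t) ((x + y) >> 1);
        if (x < 0x80 && y < 0x80 && wrapped != hadd)
          {
            printf ("mismatch at x=%u y=%u\n", x, y);
            return 1;
          }
      }
  printf ("add + lsr #1 matches UHADD whenever both top bits are clear\n");
  return 0;
}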
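To make the nonzero_element_bits idea above slightly more concrete, here is a rough interface sketch; the name, signature and depth limit are placeholders rather than a worked-out proposal:

/* Hypothetical interface only.  Return a conservative mask of the bits
   that might be nonzero in each element of the vector value T, walking
   at most MAX_DEPTH defining statements so that chains of ANDs stay
   linear.  A target hook could let machine-dependent builtins (such as
   the SVE svand case in this patch) refine the result.  */
extern wide_int nonzero_element_bits (tree t, unsigned int max_depth = 4);

The top_bit_zero_vector_p checks in the patch would then just test whether the top bit of the returned mask is clear.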