Re: [PATCH 10/12] aarch64: Use VNx16BI for svdup_b*

Kyrylo Tkachov Thu, 31 Jul 2025 10:03:24 -0700

> On 29 Jul 2025, at 18:41, Richard Sandiford <richard.sandif...@arm.com> wrote:
> 
> This patch continues the work of making ACLE intrinsics use VNx16BI
> for svbool_t results.  It deals with the predicate forms of svdup.
> 

Ok.
Thanks,
Kyrill

> gcc/
> * config/aarch64/aarch64-protos.h
> (aarch64_emit_sve_pred_vec_duplicate): Declare.
> * config/aarch64/aarch64.cc
> (aarch64_emit_sve_pred_vec_duplicate): New function.
> * config/aarch64/aarch64-sve.md (vec_duplicate<PRED_ALL:mode>): Use it.
> * config/aarch64/aarch64-sve-builtins-base.cc
> (svdup_impl::expand): Handle boolean values specially.  Check for
> constants and fall back on aarch64_emit_sve_pred_vec_duplicate
> for the variable case, ensuring that the result has mode VNx16BI.
> 
> gcc/testsuite/
> * gcc.target/aarch64/sve/acle/general/dup_1.c: New test.
> ---
> gcc/config/aarch64/aarch64-protos.h           |  1 +
> .../aarch64/aarch64-sve-builtins-base.cc      | 18 ++++++-
> gcc/config/aarch64/aarch64-sve.md             |  5 +-
> gcc/config/aarch64/aarch64.cc                 | 21 +++++++++
> .../aarch64/sve/acle/general/dup_1.c          | 47 +++++++++++++++++++
> 5 files changed, 87 insertions(+), 5 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general/dup_1.c
> 
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index e946e8da11d..8f2fc9d2f97 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -1038,6 +1038,7 @@ void aarch64_expand_sve_mem_move (rtx, rtx, machine_mode);
> bool aarch64_maybe_expand_sve_subreg_move (rtx, rtx);
> rtx aarch64_replace_reg_mode (rtx, machine_mode);
> void aarch64_split_sve_subreg_move (rtx, rtx, rtx);
> +void aarch64_emit_sve_pred_vec_duplicate (machine_mode, rtx, rtx);
> void aarch64_expand_prologue (void);
> void aarch64_decompose_vec_struct_index (machine_mode, rtx *, rtx *, bool);
> void aarch64_expand_vector_init (rtx, rtx);
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index d58d5972baf..314d53ec9ad 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -1050,6 +1050,23 @@ public:
>   rtx
>   expand (function_expander &e) const override
>   {
> +    machine_mode mode = e.vector_mode (0);
> +    if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
> +      {
> + gcc_assert (e.pred == PRED_none);
> +
> + rtx src = e.args[0];
> + if (GET_CODE (src) == CONST_INT)
> +  return (src == const0_rtx
> +  ? CONST0_RTX (VNx16BImode)
> +  : aarch64_ptrue_all (e.type_suffix (0).element_bytes));
> +
> + rtx dest = e.get_reg_target ();
> + src = force_reg (GET_MODE (src), src);
> + aarch64_emit_sve_pred_vec_duplicate (mode, dest, src);
> + return dest;
> +      }
> +
>     if (e.pred == PRED_none || e.pred == PRED_x)
>       /* There's no benefit to using predicated instructions for _x here.  */
>       return e.use_unpred_insn (e.direct_optab_handler (vec_duplicate_optab));
> @@ -1058,7 +1075,6 @@ public:
>        the duplicate of the function argument and the "false" value
>        is the value of inactive lanes.  */
>     insn_code icode;
> -    machine_mode mode = e.vector_mode (0);
>     if (valid_for_const_vector_p (GET_MODE_INNER (mode), e.args.last ()))
>       /* Duplicate the constant to fill a vector.  The pattern optimizes
> various cases involving constant operands, falling back to SEL
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 8011227e2d9..6330ac60779 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -2990,10 +2990,7 @@ (define_expand "vec_duplicate<mode>"
> (vec_duplicate:PRED_ALL (match_operand:QI 1 "register_operand")))]
>   "TARGET_SVE"
>   {
> -    rtx tmp = gen_reg_rtx (DImode);
> -    rtx op1 = gen_lowpart (DImode, operands[1]);
> -    emit_insn (gen_ashldi3 (tmp, op1, gen_int_mode (63, DImode)));
> -    emit_insn (gen_while_ultdi<mode> (operands[0], const0_rtx, tmp));
> +    aarch64_emit_sve_pred_vec_duplicate (<MODE>mode, operands[0], operands[1]);
>     DONE;
>   }
> )
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f2a752260a4..3c8d08b7fdf 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -6725,6 +6725,27 @@ aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
>       dest, ptrue, src));
> }
> 
> +/* Set predicate register DEST such that every element has the scalar
> +   boolean value in SRC, with any nonzero source counting as "true".
> +   MODE is a MODE_VECTOR_BOOL that determines the element size;
> +   DEST can have this mode or VNx16BImode.  In the latter case,
> +   the upper bits of each element are defined to be zero, as for
> +   the .H, .S, and .D forms of PTRUE.  */
> +
> +void
> +aarch64_emit_sve_pred_vec_duplicate (machine_mode mode, rtx dest, rtx src)
> +{
> +  rtx tmp = gen_reg_rtx (DImode);
> +  emit_insn (gen_ashldi3 (tmp, gen_lowpart (DImode, src),
> +  gen_int_mode (63, DImode)));
> +  if (GET_MODE (dest) == VNx16BImode)
> +    emit_insn (gen_aarch64_sve_while_acle (UNSPEC_WHILELO, DImode, mode,
> +   dest, const0_rtx, tmp));
> +  else
> +    emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
> +  dest, const0_rtx, tmp));
> +}
> +
> static bool
> aarch64_function_ok_for_sibcall (tree, tree exp)
> {
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dup_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dup_1.c
> new file mode 100644
> index 00000000000..c3c4e2d086e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dup_1.c
> @@ -0,0 +1,47 @@
> +/* { dg-options "-O2" } */
> +
> +#include <arm_sve.h>
> +
> +svbool_t
> +test1 (int x)
> +{
> +  return svand_z (svptrue_b16 (), svdup_b16 (x), svptrue_b16 ());
> +}
> +
> +svbool_t
> +test2 (int x)
> +{
> +  return svand_z (svptrue_b8 (), svdup_b32 (x), svptrue_b16 ());
> +}
> +
> +svbool_t
> +test3 (int x)
> +{
> +  return svand_z (svptrue_b32 (), svdup_b32 (x), svptrue_b16 ());
> +}
> +
> +svbool_t
> +test4 (int x)
> +{
> +  return svand_z (svptrue_b32 (), svdup_b32 (x), svptrue_b32 ());
> +}
> +
> +svbool_t
> +test5 (int x)
> +{
> +  return svand_z (svptrue_b8 (), svdup_b64 (x), svptrue_b32 ());
> +}
> +
> +svbool_t
> +test6 (int x)
> +{
> +  return svand_z (svptrue_b16 (), svdup_b64 (x), svptrue_b8 ());
> +}
> +
> +svbool_t
> +test7 (int x)
> +{
> +  return svand_z (svptrue_b16 (), svdup_b64 (x), svptrue_b64 ());
> +}
> +
> +/* { dg-final { scan-assembler-not {\tand\t} } } */
> -- 
> 2.43.0
>
Re: [PATCH 10/12] aarch64: Use VNx16BI for svdup_b*

Reply via email to