Hi Tamar,
> On 3 Oct 2025, at 11:47, Tamar Christina <[email protected]> wrote:
>
> SVE2 adds the [US]ADDW[TB] instructions, which we can use when we have to
> do a single-step widening addition. This is useful, for instance, when the
> value to be widened does not come from a load. For example, for
>
> int foo2_int(unsigned short *x, unsigned short * restrict y) {
>   int sum = 0;
>   for (int i = 0; i < 8000; i++)
>     {
>       x[i] = x[i] + y[i];
>       sum += x[i];
>     }
>   return sum;
> }
>
> we used to generate
>
> .L6:
>         ld1h    z1.h, p7/z, [x0, x2, lsl 1]
>         ld1h    z29.h, p7/z, [x1, x2, lsl 1]
>         add     z29.h, z29.h, z1.h
>         punpklo p6.h, p7.b
>         uunpklo z0.s, z29.h
>         add     z31.s, p6/m, z31.s, z0.s
>         punpkhi p6.h, p7.b
>         uunpkhi z30.s, z29.h
>         add     z31.s, p6/m, z31.s, z30.s
>         st1h    z29.h, p7, [x0, x2, lsl 1]
>         add     x2, x2, x4
>         whilelo p7.h, w2, w3
>         b.any   .L6
>         ptrue   p7.b, all
>         uaddv   d31, p7, z31.s
>
> but with +sve2
>
> .L12:
>         ld1h    z30.h, p7/z, [x0, x2, lsl 1]
>         ld1h    z29.h, p7/z, [x1, x2, lsl 1]
>         add     z30.h, z30.h, z29.h
>         uaddwb  z31.s, z31.s, z30.h
>         uaddwt  z31.s, z31.s, z30.h
>         st1h    z30.h, p7, [x0, x2, lsl 1]
>         mov     x3, x2
>         inch    x2
>         cmp     w2, w4
>         bls     .L12
>         inch    x3
>         uaddv   d31, p7, z31.s
>
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	PR middle-end/122069
> 	* config/aarch64/aarch64-sve2.md
> 	(@aarch64_sve_<sve_int_op>_internal<mode>): New.
> 	(widen_ssum<mode><Vnarrow>3): New.
> 	(widen_usum<mode><Vnarrow>3): New.
> 	* config/aarch64/iterators.md (Vnarrow): New, to match VNARROW.
>
> gcc/testsuite/ChangeLog:
>
> 	PR middle-end/122069
> 	* gcc.target/aarch64/sve2/pr122069_1.c: New test.
> 	* gcc.target/aarch64/sve2/pr122069_2.c: New test.
>
> ---
> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
> index 69a376706facaa5f0dd5032fa30cb9298d222568..5f3b10ade8f55f9a71eaa0e3fe060627621108f6 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -2377,6 +2377,50 @@ (define_insn "@aarch64_sve_<sve_int_op><mode>"
>    [(set_attr "sve_type" "sve_int_general")]
>  )
>
> +(define_insn "@aarch64_sve_<sve_int_op>_internal<mode>"
> +  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
> +        (unspec:SVE_FULL_HSDI
> +          [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
> +           (match_operand:<VNARROW> 2 "register_operand" "w")
> +           (match_operand:SVE_FULL_HSDI 3 "register_operand" "0")]
> +          SVE2_INT_BINARY_WIDE))]
> +  "TARGET_SVE2"
> +  "<sve_int_op>\t%0.<Vetype>, %1.<Vetype>, %2.<Ventype>"
> +  [(set_attr "sve_type" "sve_int_general")]
> +)
Can you elaborate a bit on why you need this new pattern form? We already
have a define_insn for it in the "[INT] Wide binary arithmetic" section;
I guess the difference is that this one has a third operand inside the
unspec. It looks a bit weird to have 3 operands inside an
SVE2_INT_BINARY_WIDE operation. If we need to represent the fact that the
SADDWT operation takes the accumulator as an input as well, then perhaps
the existing pattern @aarch64_sve_<sve_int_op><mode> should be modified
instead?
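Alternatively, if the only goal is to chain the two steps, the expanders
could emit the bottom step into a fresh temporary and feed that partial sum
into the top step, so the existing two-operand unspec keeps working.
Something like this untested sketch (unsigned case shown; operand numbering
as in your patch, with a new scratch operand 3 that I've made up for
illustration):

(define_expand "widen_usum<mode><Vnarrow>3"
  [(set (match_dup 3)
        (unspec:SVE_FULL_HSDI
          [(match_operand:SVE_FULL_HSDI 2 "register_operand")
           (match_operand:<VNARROW> 1 "register_operand")]
          UNSPEC_UADDWB))
   (set (match_operand:SVE_FULL_HSDI 0 "register_operand")
        (unspec:SVE_FULL_HSDI
          [(match_dup 3)
           (match_dup 1)]
          UNSPEC_UADDWT))]
  "TARGET_SVE2"
  {
    /* Accumulate the bottom halves into a scratch register; the top-half
       step then reads that partial sum as its wide input, so both steps
       match the existing @aarch64_sve_<sve_int_op><mode> insn.  */
    operands[3] = gen_reg_rtx (<MODE>mode);
  }
)

That would keep SVE2_INT_BINARY_WIDE genuinely binary and avoid the
_internal pattern altogether, though I may well be missing a reason why you
need the result tied to operand 0.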
Thanks,
Kyrill
> +
> +;; Define single-step widening for widen_ssum using SADDWB and SADDWT.
> +(define_expand "widen_ssum<mode><Vnarrow>3"
> +  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
> +        (unspec:SVE_FULL_HSDI
> +          [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
> +           (match_operand:<VNARROW> 1 "register_operand" "w")]
> +          UNSPEC_SADDWB))
> +   (set (match_dup 0)
> +        (unspec:SVE_FULL_HSDI
> +          [(match_dup 2)
> +           (match_dup 1)
> +           (match_dup 0)]
> +          UNSPEC_SADDWT))]
> +  "TARGET_SVE2"
> +)
> +
> +;; Define single-step widening for widen_usum using UADDWB and UADDWT.
> +(define_expand "widen_usum<mode><Vnarrow>3"
> +  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
> +        (unspec:SVE_FULL_HSDI
> +          [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
> +           (match_operand:<VNARROW> 1 "register_operand" "w")]
> +          UNSPEC_UADDWB))
> +   (set (match_dup 0)
> +        (unspec:SVE_FULL_HSDI
> +          [(match_dup 2)
> +           (match_dup 1)
> +           (match_dup 0)]
> +          UNSPEC_UADDWT))]
> +  "TARGET_SVE2"
> +)
> +
> ;; -------------------------------------------------------------------------
> ;; ---- [INT] Long binary arithmetic
> ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 61ca4990b94170f016a9f50e3505c8cfb24df9be..3757998c0ea9831b526a5bbc8568933fc05ed5d4 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -1935,6 +1935,11 @@ (define_mode_attr VNARROW [(VNx8HI "VNx16QI")
>                             (VNx2DI "VNx4SI") (VNx2DF "VNx4SF")
>                             (VNx8SI "VNx8HI") (VNx16SI "VNx16QI")
>                             (VNx8DI "VNx8HI")])
> +(define_mode_attr Vnarrow [(VNx8HI "vnx16qi")
> +                           (VNx4SI "vnx8hi") (VNx4SF "vnx8hf")
> +                           (VNx2DI "vnx4si") (VNx2DF "vnx4sf")
> +                           (VNx8SI "vnx8hi") (VNx16SI "vnx16qi")
> +                           (VNx8DI "vnx8hi")])
>
> ;; Suffix mapping Advanced SIMD modes to be expanded as SVE instructions.
> (define_mode_attr sve_di_suf [(VNx16QI "") (VNx8HI "") (VNx4SI "") (VNx2DI "")
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..6a347072ae892ceeabddc05dbb4ead4814dda2da
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_1.c
> @@ -0,0 +1,41 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=armv8-a+sve2 -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +inline char char_abs(char i) {
> +  return (i < 0 ? -i : i);
> +}
> +
> +/*
> +** foo_int:
> +** ...
> +** sub z[0-9]+.b, z[0-9]+.b, z[0-9]+.b
> +** udot z[0-9]+.s, z[0-9]+.b, z[0-9]+.b
> +** ...
> +*/
> +int foo_int(unsigned char *x, unsigned char * restrict y) {
> +  int sum = 0;
> +  for (int i = 0; i < 8000; i++)
> +    sum += char_abs(x[i] - y[i]);
> +  return sum;
> +}
> +
> +/*
> +** foo2_int:
> +** ...
> +** add z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
> +** uaddwb z[0-9]+.s, z[0-9]+.s, z[0-9]+.h
> +** uaddwt z[0-9]+.s, z[0-9]+.s, z[0-9]+.h
> +** ...
> +*/
> +int foo2_int(unsigned short *x, unsigned short * restrict y) {
> +  int sum = 0;
> +  for (int i = 0; i < 8000; i++)
> +    {
> +      x[i] = x[i] + y[i];
> +      sum += x[i];
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f9ae97158688aad60ed2b705c02621ee7a33e6ec
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_2.c
> @@ -0,0 +1,81 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target aarch64_sve2_hw } */
> +/* { dg-options "-O3 -march=armv8-a+sve2 -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" } */
> +
> +inline char char_abs(char i) {
> +  return (i < 0 ? -i : i);
> +}
> +
> +__attribute__((noipa))
> +int foo_int(unsigned char *x, unsigned char * restrict y) {
> +  int sum = 0;
> +  for (int i = 0; i < 100; i++)
> +    sum += char_abs(x[i] - y[i]);
> +  return sum;
> +}
> +
> +__attribute__((noipa))
> +int foo2_int(unsigned short *x, unsigned short * restrict y,
> +             unsigned short * restrict z) {
> +  int sum = 0;
> +  for (int i = 0; i < 100; i++)
> +    {
> +      z[i] = x[i] + y[i];
> +      sum += z[i];
> +    }
> +  return sum;
> +}
> +
> +__attribute__((noipa))
> +int foo_int2(unsigned char *x, unsigned char * restrict y) {
> +  int sum = 0;
> +#pragma GCC novector
> +  for (int i = 0; i < 100; i++)
> +    sum += char_abs(x[i] - y[i]);
> +  return sum;
> +}
> +
> +__attribute__((noipa))
> +int foo2_int2(unsigned short *x, unsigned short * restrict y,
> +              unsigned short * restrict z) {
> +  int sum = 0;
> +#pragma GCC novector
> +  for (int i = 0; i < 100; i++)
> +    {
> +      z[i] = x[i] + y[i];
> +      sum += z[i];
> +    }
> +  return sum;
> +}
> +
> +int main ()
> +{
> +  unsigned short a[100];
> +  unsigned short b[100];
> +  unsigned short r1[100];
> +  unsigned short r2[100];
> +  unsigned char c[100];
> +  unsigned char d[100];
> +#pragma GCC novector
> +  for (int i = 0; i < 100; i++)
> +    {
> +      a[i] = c[i] = i;
> +      b[i] = d[i] = 100 - i;
> +    }
> +
> +  if (foo_int (c, d) != foo_int2 (c, d))
> +    __builtin_abort ();
> +
> +  if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
> +    __builtin_abort ();
> +
> +#pragma GCC novector
> +  for (int i = 0; i < 100; i++)
> +    if (r1[i] != r2[i])
> +      __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
> \ No newline at end of file
>
>
> --
> <rb19876.patch>