Hi Tamar,

> On 3 Oct 2025, at 11:45, Tamar Christina <[email protected]> wrote:
> 
> This patch implements support for using dot product to do sum reductions by
> changing += a into += (a * 1), i.e. we seed the multiplication with 1.
> 
> Given the example
> 
> int foo_int(unsigned char *x, unsigned char * restrict y) {
>  int sum = 0;
>  for (int i = 0; i < 8000; i++)
>     sum += char_abs(x[i] - y[i]);
>  return sum;
> }
> 
> we used to generate
> 
> .L2:
>        ldr     q0, [x0, x2]
>        ldr     q28, [x1, x2]
>        sub     v28.16b, v0.16b, v28.16b
>        zip1    v29.16b, v28.16b, v31.16b
>        zip2    v28.16b, v28.16b, v31.16b
>        uaddw   v30.4s, v30.4s, v29.4h
>        uaddw2  v30.4s, v30.4s, v29.8h
>        uaddw   v30.4s, v30.4s, v28.4h
>        uaddw2  v30.4s, v30.4s, v28.8h
>        add     x2, x2, 16
>        cmp     x2, x3
>        bne     .L2
>        addv    s31, v30.4s
> 
> but now generates with +dotprod
> 
> .L2:
>        ldr     q29, [x0, x2]
>        ldr     q28, [x1, x2]
>        sub     v28.16b, v29.16b, v28.16b
>        udot    v31.4s, v28.16b, v30.16b
>        add     x2, x2, 16
>        cmp     x2, x3
>        bne     .L2
>        addv    s31, v31.4s
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> PR middle-end/122069
> * config/aarch64/aarch64-simd.md (widen_ssum<mode><vsi2qi>3): New.
> (widen_usum<mode><vsi2qi>3): New.
> 
> gcc/testsuite/ChangeLog:
> 
> PR middle-end/122069
> * gcc.target/aarch64/pr122069_3.c: New test.
> * gcc.target/aarch64/pr122069_4.c: New test.
> 
> ---
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index d4a7912a11aef0a00385d544307feee40e86754f..5ee7daf775c17b3e1d49b423f3722dd08dca1d89 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4703,6 +4703,34 @@ (define_expand "widen_usum<mode><Vwide>3"
>   DONE;
> })
> 
> +(define_expand "widen_ssum<mode><vsi2qi>3"
> +  [(set (match_operand:VS 0 "register_operand")
> + (plus:VS (sign_extend:VS
> +   (match_operand:<VSI2QI> 1 "register_operand"))
> + (match_operand:VS 2 "register_operand")))]
> +  "TARGET_DOTPROD"
> +  {
> +    rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
> +    emit_insn (gen_sdot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
> +    operands[2]));
> +    DONE;
> +  }
> +)
> +
> +(define_expand "widen_usum<mode><vsi2qi>3"
> +  [(set (match_operand:VS 0 "register_operand")
> + (plus:VS (zero_extend:VS
> +        (match_operand:<VSI2QI> 1 "register_operand"))
> +      (match_operand:VS 2 "register_operand")))]
> +  "TARGET_DOTPROD"
> +  {
> +    rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
> +    emit_insn (gen_udot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
> +    operands[2]));
> +    DONE;
> +  }
> +)

IMO these expanders should have a comment explaining the a*1 trick from the
cover letter, i.e. that the widening sum is emitted as a dot product of the
input with a vector of ones.
Ok with a comment added.
Thanks,
Kyrill

> +
> (define_insn "aarch64_<ANY_EXTEND:su>subw<mode>"
>   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
> (minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_3.c b/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a7c53ead4bf61243ec8879deb2bf2b39b6a4cd83
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
> @@ -0,0 +1,41 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=armv8-a+dotprod -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +inline char char_abs(char i) {
> +  return (i < 0 ? -i : i);
> +}
> +
> +/*
> +** foo_int:
> +** ...
> +** sub v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +** udot v[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b
> +** ...
> +*/
> +int foo_int(unsigned char *x, unsigned char * restrict y) {
> +  int sum = 0;
> +  for (int i = 0; i < 8000; i++)
> +     sum += char_abs(x[i] - y[i]);
> +  return sum;
> +}
> +
> +/*
> +** foo2_int:
> +** ...
> +** add v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> +** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
> +** uaddw2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
> +** ...
> +*/
> +int foo2_int(unsigned short *x, unsigned short * restrict y) {
> +  int sum = 0;
> +  for (int i = 0; i < 8000; i++)
> +    {
> +      x[i] = x[i] + y[i];
> +      sum += x[i];
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_4.c b/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..462d7d3124b1f92f89d4ea55e289b51d36ac7cb1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
> @@ -0,0 +1,80 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -march=armv8-a+dotprod -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
> +
> +inline char char_abs(char i) {
> +  return (i < 0 ? -i : i);
> +}
> +
> +__attribute__((noipa))
> +int foo_int(unsigned char *x, unsigned char * restrict y) {
> +  int sum = 0;
> +  for (int i = 0; i < 100; i++)
> +     sum += char_abs(x[i] - y[i]);
> +  return sum;
> +}
> +
> +__attribute__((noipa))
> +int foo2_int(unsigned short *x, unsigned short * restrict y,
> +     unsigned short * restrict z) {
> +  int sum = 0;
> +  for (int i = 0; i < 100; i++)
> +    {
> +      z[i] = x[i] + y[i];
> +      sum += z[i];
> +    }
> +  return sum;
> +}
> +
> +__attribute__((noipa))
> +int foo_int2(unsigned char *x, unsigned char * restrict y) {
> +  int sum = 0;
> +#pragma GCC novector
> +  for (int i = 0; i < 100; i++)
> +     sum += char_abs(x[i] - y[i]);
> +  return sum;
> +}
> +
> +__attribute__((noipa))
> +int foo2_int2(unsigned short *x, unsigned short * restrict y,
> +      unsigned short * restrict z) {
> +  int sum = 0;
> +#pragma GCC novector
> +  for (int i = 0; i < 100; i++)
> +    {
> +      z[i] = x[i] + y[i];
> +      sum += z[i];
> +    }
> +  return sum;
> +}
> +
> +int main ()
> +{
> +  unsigned short a[100];
> +  unsigned short b[100];
> +  unsigned short r1[100];
> +  unsigned short r2[100];
> +  unsigned char c[100];
> +  unsigned char d[100];
> +#pragma GCC novector
> +  for (int i = 0; i < 100; i++)
> +    {
> +      a[i] = c[i] = i;
> +      b[i] = d[i] = 100 - i;
> +    }
> +
> +  if (foo_int (c, d) != foo_int2 (c, d))
> +    __builtin_abort();
> +
> +
> +  if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
> +    __builtin_abort();
> +
> +#pragma GCC novector
> +  for (int i = 0; i < 100; i++)
> +    if (r1[i] != r2[i])
> +      __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
> \ No newline at end of file
> 
> 
> -- 
> <rb19871.patch>

Reply via email to