Hi Tamar,
> On 3 Oct 2025, at 11:45, Tamar Christina <[email protected]> wrote:
>
> This patch implements support for using dotproduct to do sum reductions by
> changing += a into += (a * 1), i.e. we seed the multiplication with 1.
>
> Given the example
>
> int foo_int(unsigned char *x, unsigned char * restrict y) {
> int sum = 0;
> for (int i = 0; i < 8000; i++)
> sum += char_abs(x[i] - y[i]);
> return sum;
> }
>
> we used to generate
>
> .L2:
> ldr q0, [x0, x2]
> ldr q28, [x1, x2]
> sub v28.16b, v0.16b, v28.16b
> zip1 v29.16b, v28.16b, v31.16b
> zip2 v28.16b, v28.16b, v31.16b
> uaddw v30.4s, v30.4s, v29.4h
> uaddw2 v30.4s, v30.4s, v29.8h
> uaddw v30.4s, v30.4s, v28.4h
> uaddw2 v30.4s, v30.4s, v28.8h
> add x2, x2, 16
> cmp x2, x3
> bne .L2
> addv s31, v30.4s
>
> but now generates with +dotprod
>
> .L2:
> ldr q29, [x0, x2]
> ldr q28, [x1, x2]
> sub v28.16b, v29.16b, v28.16b
> udot v31.4s, v28.16b, v30.16b
> add x2, x2, 16
> cmp x2, x3
> bne .L2
> addv s31, v31.4s
>
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR middle-end/122069
> * config/aarch64/aarch64-simd.md (widen_ssum<mode><vsi2qi>3): New.
> (widen_usum<mode><vsi2qi>3): New.
>
> gcc/testsuite/ChangeLog:
>
> PR middle-end/122069
> * gcc.target/aarch64/pr122069_3.c: New test.
> * gcc.target/aarch64/pr122069_4.c: New test.
>
> ---
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index
> d4a7912a11aef0a00385d544307feee40e86754f..5ee7daf775c17b3e1d49b423f3722dd08dca1d89
> 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4703,6 +4703,34 @@ (define_expand "widen_usum<mode><Vwide>3"
> DONE;
> })
>
> +(define_expand "widen_ssum<mode><vsi2qi>3"
> + [(set (match_operand:VS 0 "register_operand")
> + (plus:VS (sign_extend:VS
> + (match_operand:<VSI2QI> 1 "register_operand"))
> + (match_operand:VS 2 "register_operand")))]
> + "TARGET_DOTPROD"
> + {
> + rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
> + emit_insn (gen_sdot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
> + operands[2]));
> + DONE;
> + }
> +)
> +
> +(define_expand "widen_usum<mode><vsi2qi>3"
> + [(set (match_operand:VS 0 "register_operand")
> + (plus:VS (zero_extend:VS
> + (match_operand:<VSI2QI> 1 "register_operand"))
> + (match_operand:VS 2 "register_operand")))]
> + "TARGET_DOTPROD"
> + {
> + rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
> + emit_insn (gen_udot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
> + operands[2]));
> + DONE;
> + }
> +)
IMO these expanders should have a comment explaining the a * 1 trick from the
cover letter, i.e. that the widening sum is implemented as a dot product
against a vector of ones.
OK with such a comment added.
Thanks,
Kyrill
> +
> (define_insn "aarch64_<ANY_EXTEND:su>subw<mode>"
> [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
> (minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
> b/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..a7c53ead4bf61243ec8879deb2bf2b39b6a4cd83
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
> @@ -0,0 +1,41 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=armv8-a+dotprod -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +inline char char_abs(char i) {
> + return (i < 0 ? -i : i);
> +}
> +
> +/*
> +** foo_int:
> +** ...
> +** sub v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +** udot v[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b
> +** ...
> +*/
> +int foo_int(unsigned char *x, unsigned char * restrict y) {
> + int sum = 0;
> + for (int i = 0; i < 8000; i++)
> + sum += char_abs(x[i] - y[i]);
> + return sum;
> +}
> +
> +/*
> +** foo2_int:
> +** ...
> +** add v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> +** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
> +** uaddw2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
> +** ...
> +*/
> +int foo2_int(unsigned short *x, unsigned short * restrict y) {
> + int sum = 0;
> + for (int i = 0; i < 8000; i++)
> + {
> + x[i] = x[i] + y[i];
> + sum += x[i];
> + }
> + return sum;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
> b/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..462d7d3124b1f92f89d4ea55e289b51d36ac7cb1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
> @@ -0,0 +1,80 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -march=armv8-a+dotprod -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
> +
> +inline char char_abs(char i) {
> + return (i < 0 ? -i : i);
> +}
> +
> +__attribute__((noipa))
> +int foo_int(unsigned char *x, unsigned char * restrict y) {
> + int sum = 0;
> + for (int i = 0; i < 100; i++)
> + sum += char_abs(x[i] - y[i]);
> + return sum;
> +}
> +
> +__attribute__((noipa))
> +int foo2_int(unsigned short *x, unsigned short * restrict y,
> + unsigned short * restrict z) {
> + int sum = 0;
> + for (int i = 0; i < 100; i++)
> + {
> + z[i] = x[i] + y[i];
> + sum += z[i];
> + }
> + return sum;
> +}
> +
> +__attribute__((noipa))
> +int foo_int2(unsigned char *x, unsigned char * restrict y) {
> + int sum = 0;
> +#pragma GCC novector
> + for (int i = 0; i < 100; i++)
> + sum += char_abs(x[i] - y[i]);
> + return sum;
> +}
> +
> +__attribute__((noipa))
> +int foo2_int2(unsigned short *x, unsigned short * restrict y,
> + unsigned short * restrict z) {
> + int sum = 0;
> +#pragma GCC novector
> + for (int i = 0; i < 100; i++)
> + {
> + z[i] = x[i] + y[i];
> + sum += z[i];
> + }
> + return sum;
> +}
> +
> +int main ()
> +{
> + unsigned short a[100];
> + unsigned short b[100];
> + unsigned short r1[100];
> + unsigned short r2[100];
> + unsigned char c[100];
> + unsigned char d[100];
> +#pragma GCC novector
> + for (int i = 0; i < 100; i++)
> + {
> + a[i] = c[i] = i;
> + b[i] = d[i] = 100 - i;
> + }
> +
> + if (foo_int (c, d) != foo_int2 (c, d))
> + __builtin_abort();
> +
> +
> + if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
> + __builtin_abort();
> +
> +#pragma GCC novector
> + for (int i = 0; i < 100; i++)
> + if (r1[i] != r2[i])
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
> \ No newline at end of file
>
>
> --
> <rb19871.patch>