Hi Tamar,
> On 3 Oct 2025, at 12:48, Tamar Christina <[email protected]> wrote:
>
> My changelog seems to be out of date here. Correct one is:
>
> SVE2p1 adds 2-way dotproduct which we can use when we have to do a single step
> widening addition. This is useful for instance when the value to be widened
> does not come from a load. For example for
>
> int foo2_int(unsigned short *x, unsigned short * restrict y) {
> int sum = 0;
> for (int i = 0; i < 8000; i++)
> {
> x[i] = x[i] + y[i];
> sum += x[i];
> }
> return sum;
> }
>
> we used to generate
>
> .L12:
> ld1h z30.h, p7/z, [x0, x2, lsl 1]
> ld1h z29.h, p7/z, [x1, x2, lsl 1]
> add z30.h, z30.h, z29.h
> uaddwb z31.s, z31.s, z30.h
> uaddwt z31.s, z31.s, z30.h
> st1h z30.h, p7, [x0, x2, lsl 1]
> mov x3, x2
> inch x2
> cmp w2, w4
> bls .L12
> inch x3
> uaddv d31, p7, z31.s
>
> but with +sve2p1
>
> .L12:
> ld1h z31.h, p7/z, [x0, x2, lsl 1]
> ld1h z29.h, p7/z, [x1, x2, lsl 1]
> add z31.h, z31.h, z29.h
> udot z30.s, z31.h, z28.h
> st1h z31.h, p7, [x0, x2, lsl 1]
> mov x3, x2
> inch x2
> cmp w2, w4
> bls .L12
> inch x3
> uaddv d30, p7, z30.s
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR middle-end/122069
> * config/aarch64/aarch64-sve2.md
> (widen_ssum<mode><Vnarrow>3): Update.
> (widen_usum<mode><Vnarrow>3): Update.
>
> gcc/testsuite/ChangeLog:
>
> PR middle-end/122069
> * gcc.target/aarch64/sve2/pr122069_3.c: New test.
> * gcc.target/aarch64/sve2/pr122069_4.c: New test.
>
>> -----Original Message-----
>> From: Tamar Christina <[email protected]>
>> Sent: 03 October 2025 10:48
>> To: [email protected]
>> Cc: nd <[email protected]>; Richard Earnshaw <[email protected]>;
>> [email protected]; Alex Coplan <[email protected]>;
>> [email protected]; Wilco Dijkstra
>> <[email protected]>; Alice Carlotti <[email protected]>
>> Subject: [PATCH 9/9]AArch64: Implement widen_[us]sum using 2-way
>> [US]UDOT for SVE2p1 [PR122069]
>>
>> SVE2p1 adds 2-way dotproduct which we can use when we have to do a
>> single step
>> widening addition. This is useful for instance when the value to be widened
>> does not come from a load. For example for
>>
>> int foo2_int(unsigned short *x, unsigned short * restrict y) {
>> int sum = 0;
>> for (int i = 0; i < 8000; i++)
>> {
>> x[i] = x[i] + y[i];
>> sum += x[i];
>> }
>> return sum;
>> }
>>
>> we used to generate
>>
>> .L12:
>> ld1h z30.h, p7/z, [x0, x2, lsl 1]
>> ld1h z29.h, p7/z, [x1, x2, lsl 1]
>> add z30.h, z30.h, z29.h
>> uaddwb z31.s, z31.s, z30.h
>> uaddwt z31.s, z31.s, z30.h
>> st1h z30.h, p7, [x0, x2, lsl 1]
>> mov x3, x2
>> inch x2
>> cmp w2, w4
>> bls .L12
>> inch x3
>> uaddv d31, p7, z31.s
>>
>> but with +sve2p1
>>
>> .L12:
>> ld1h z31.h, p7/z, [x0, x2, lsl 1]
>> ld1h z29.h, p7/z, [x1, x2, lsl 1]
>> add z31.h, z31.h, z29.h
>> udot z30.s, z31.h, z28.h
>> st1h z31.h, p7, [x0, x2, lsl 1]
>> mov x3, x2
>> inch x2
>> cmp w2, w4
>> bls .L12
>> inch x3
>> uaddv d30, p7, z30.s
>>
>> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>>
>> Ok for master?
>>
>> Thanks,
>> Tamar
>>
>> gcc/ChangeLog:
>>
>> PR middle-end/122069
>> * config/aarch64/aarch64-sve2.md
>> (@aarch64_sve_<sve_int_op>_internal<mode>): New.
>> (widen_ssum<mode><Vnarrow>3): New.
>> (widen_usum<mode><Vnarrow>3): New.
>> * config/aarch64/iterators.md (Vnarrow): New, to match VNARROW.
>>
>> gcc/testsuite/ChangeLog:
>>
>> PR middle-end/122069
>> * gcc.target/aarch64/sve2/pr122069_1.c: New test.
>> * gcc.target/aarch64/sve2/pr122069_2.c: New test.
>>
>> ---
>> diff --git a/gcc/config/aarch64/aarch64-sve2.md
>> b/gcc/config/aarch64/aarch64-sve2.md
>> index
>> 5f3b10ade8f55f9a71eaa0e3fe060627621108f6..41748f82b32b5c6bab0b5
>> d6076c08e6383bff515 100644
>> --- a/gcc/config/aarch64/aarch64-sve2.md
>> +++ b/gcc/config/aarch64/aarch64-sve2.md
>> @@ -2403,7 +2403,17 @@ (define_expand
>> "widen_ssum<mode><Vnarrow>3"
>> (match_dup 0)]
>> UNSPEC_SADDWT))]
>> "TARGET_SVE2"
>> -)
>> +{
>> + if (TARGET_SVE2p1_OR_SME2
>> + && <VNARROW>mode == VNx8HImode
>> + && <MODE>mode == VNx4SImode)
>> + {
>> + rtx ones = force_reg (VNx8HImode, CONST1_RTX (VNx8HImode));
>> + emit_insn (gen_sdot_prodvnx4sivnx8hi (operands[0], operands[1],
>> + ones, operands[2]));
>> + DONE;
>> + }
>> +})
>>
>> ;; Define single step widening for widen_usum using UADDWB and UADDWT
>> (define_expand "widen_usum<mode><Vnarrow>3"
>> @@ -2419,7 +2429,17 @@ (define_expand
>> "widen_usum<mode><Vnarrow>3"
>> (match_dup 0)]
>> UNSPEC_UADDWT))]
>> "TARGET_SVE2"
>> -)
>> +{
>> + if (TARGET_SVE2p1_OR_SME2
>> + && <VNARROW>mode == VNx8HImode
>> + && <MODE>mode == VNx4SImode)
>> + {
>> + rtx ones = force_reg (VNx8HImode, CONST1_RTX (VNx8HImode));
>> + emit_insn (gen_udot_prodvnx4sivnx8hi (operands[0], operands[1],
>> + ones, operands[2]));
>> + DONE;
>> + }
>> +})
A short comment at these new additions explaining the use of CONST1_RTX would
be useful for posterity.
>>
>> ;; -------------------------------------------------------------------------
>> ;; ---- [INT] Long binary arithmetic
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_3.c
>> b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_3.c
>> new file mode 100644
>> index
>> 0000000000000000000000000000000000000000..c50a0ccc22604ff4a54
>> 85881fa922fb603ca3122
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_3.c
>> @@ -0,0 +1,39 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3 -march=armv8-a+sve2p1 -mautovec-preference=sve-
>> only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-
>> blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
>> +/* { dg-final { check-function-bodies "**" "" } } */
>> +
>> +inline char char_abs(char i) {
>> + return (i < 0 ? -i : i);
>> +}
>> +
>> +/*
>> +** foo_int:
>> +** ...
>> +** sub z[0-9]+.b, z[0-9]+.b, z[0-9]+.b
>> +** udot z[0-9]+.s, z[0-9]+.b, z[0-9]+.b
>> +** ...
>> +*/
>> +int foo_int(unsigned char *x, unsigned char * restrict y) {
>> + int sum = 0;
>> + for (int i = 0; i < 8000; i++)
>> + sum += char_abs(x[i] - y[i]);
>> + return sum;
>> +}
>> +
>> +/*
>> +** foo2_int:
>> +** ...
>> +** udot z[0-9]+.s, z[0-9]+.h, z[0-9]+.h
>> +** ...
>> +*/
>> +int foo2_int(unsigned short *x, unsigned short * restrict y) {
>> + int sum = 0;
>> + for (int i = 0; i < 8000; i++)
>> + {
>> + x[i] = x[i] + y[i];
>> + sum += x[i];
>> + }
>> + return sum;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_4.c
>> b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_4.c
>> new file mode 100644
>> index
>> 0000000000000000000000000000000000000000..53eff348605ba452e27
>> fd3ee5e022ae4bf4ce321
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_4.c
>> @@ -0,0 +1,81 @@
>> +/* { dg-do run } */
>> +/* { dg-require-effective-target aarch64_sve2p1_hw } */
>> +/* { dg-options "-O3 -march=armv8-a+sve2p1 -mautovec-preference=sve-
>> only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-
>> blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
Again would be cleaner to not have the scheduling/reordering flags here for a
runtime test.
Ok with those changes.
Thanks,
Kyrill
>> +
>> +inline char char_abs(char i) {
>> + return (i < 0 ? -i : i);
>> +}
>> +
>> +__attribute__((noipa))
>> +int foo_int(unsigned char *x, unsigned char * restrict y) {
>> + int sum = 0;
>> + for (int i = 0; i < 100; i++)
>> + sum += char_abs(x[i] - y[i]);
>> + return sum;
>> +}
>> +
>> +__attribute__((noipa))
>> +int foo2_int(unsigned short *x, unsigned short * restrict y,
>> + unsigned short * restrict z) {
>> + int sum = 0;
>> + for (int i = 0; i < 100; i++)
>> + {
>> + z[i] = x[i] + y[i];
>> + sum += z[i];
>> + }
>> + return sum;
>> +}
>> +
>> +__attribute__((noipa))
>> +int foo_int2(unsigned char *x, unsigned char * restrict y) {
>> + int sum = 0;
>> +#pragma GCC novector
>> + for (int i = 0; i < 100; i++)
>> + sum += char_abs(x[i] - y[i]);
>> + return sum;
>> +}
>> +
>> +__attribute__((noipa))
>> +int foo2_int2(unsigned short *x, unsigned short * restrict y,
>> + unsigned short * restrict z) {
>> + int sum = 0;
>> +#pragma GCC novector
>> + for (int i = 0; i < 100; i++)
>> + {
>> + z[i] = x[i] + y[i];
>> + sum += z[i];
>> + }
>> + return sum;
>> +}
>> +
>> +int main ()
>> +{
>> + unsigned short a[100];
>> + unsigned short b[100];
>> + unsigned short r1[100];
>> + unsigned short r2[100];
>> + unsigned char c[100];
>> + unsigned char d[100];
>> +#pragma GCC novector
>> + for (int i = 0; i < 100; i++)
>> + {
>> + a[i] = c[i] = i;
>> + b[i] = d[i] = 100 - i;
>> + }
>> +
>> + if (foo_int (c, d) != foo_int2 (c, d))
>> + __builtin_abort();
>> +
>> +
>> + if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
>> + __builtin_abort();
>> +
>> +#pragma GCC novector
>> + for (int i = 0; i < 100; i++)
>> + if (r1[i] != r2[i])
>> + __builtin_abort ();
>> +
>> + return 0;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
>> \ No newline at end of file
>>
>>
>> --