https://gcc.gnu.org/g:75fb400d2950e1f743f133ece8fb3abe815faf13
commit r16-4483-g75fb400d2950e1f743f133ece8fb3abe815faf13
Author: Tamar Christina <[email protected]>
Date:   Sat Oct 18 08:22:50 2025 +0100

    AArch64: Implement widen_[us]sum using 2-way [US]DOT for SVE2p1 [PR122069]

    SVE2p1 adds a 2-way dot product which we can use when we have to do a
    single step widening addition.  This is useful, for instance, when the
    value to be widened does not come from a load.  For example, for

    int foo2_int(unsigned short *x, unsigned short * restrict y) {
      int sum = 0;
      for (int i = 0; i < 8000; i++)
        {
          x[i] = x[i] + y[i];
          sum += x[i];
        }
      return sum;
    }

    we used to generate

    .L12:
            ld1h    z30.h, p7/z, [x0, x2, lsl 1]
            ld1h    z29.h, p7/z, [x1, x2, lsl 1]
            add     z30.h, z30.h, z29.h
            uaddwb  z31.s, z31.s, z30.h
            uaddwt  z31.s, z31.s, z30.h
            st1h    z30.h, p7, [x0, x2, lsl 1]
            mov     x3, x2
            inch    x2
            cmp     w2, w4
            bls     .L12
            inch    x3
            uaddv   d31, p7, z31.s

    but with +sve2p1 we now generate

    .L12:
            ld1h    z31.h, p7/z, [x0, x2, lsl 1]
            ld1h    z29.h, p7/z, [x1, x2, lsl 1]
            add     z31.h, z31.h, z29.h
            udot    z30.s, z31.h, z28.h
            st1h    z31.h, p7, [x0, x2, lsl 1]
            mov     x3, x2
            inch    x2
            cmp     w2, w4
            bls     .L12
            inch    x3
            uaddv   d30, p7, z30.s

    gcc/ChangeLog:

            PR middle-end/122069
            * config/aarch64/aarch64-sve2.md (widen_ssum<mode><Vnarrow>3): Update.
            (widen_usum<mode><Vnarrow>3): Update.

    gcc/testsuite/ChangeLog:

            PR middle-end/122069
            * gcc.target/aarch64/sve2/pr122069_3.c: New test.
            * gcc.target/aarch64/sve2/pr122069_4.c: New test.

Diff: --- gcc/config/aarch64/aarch64-sve2.md | 26 ++++++- gcc/testsuite/gcc.target/aarch64/sve2/pr122069_3.c | 39 +++++++++++ gcc/testsuite/gcc.target/aarch64/sve2/pr122069_4.c | 81 ++++++++++++++++++++++ 3 files changed, 144 insertions(+), 2 deletions(-) diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index a05ad56aac30..910918351829 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -2390,7 +2390,19 @@ (match_dup 1)] UNSPEC_SADDWT))] "TARGET_SVE2" -) +{ + /* Use dot product to perform double widening sum reductions by + changing += a into += (a * 1). i.e. we seed the multiplication with 1. */ + if (TARGET_SVE2p1_OR_SME2 + && <VNARROW>mode == VNx8HImode + && <MODE>mode == VNx4SImode) + { + rtx ones = force_reg (VNx8HImode, CONST1_RTX (VNx8HImode)); + emit_insn (gen_sdot_prodvnx4sivnx8hi (operands[0], operands[1], + ones, operands[2])); + DONE; + } +}) ;; Define single step widening for widen_usum using UADDWB and UADDWT (define_expand "widen_usum<mode><Vnarrow>3" @@ -2405,7 +2417,17 @@ (match_dup 1)] UNSPEC_UADDWT))] "TARGET_SVE2" -) +{ + if (TARGET_SVE2p1_OR_SME2 + && <VNARROW>mode == VNx8HImode + && <MODE>mode == VNx4SImode) + { + rtx ones = force_reg (VNx8HImode, CONST1_RTX (VNx8HImode)); + emit_insn (gen_udot_prodvnx4sivnx8hi (operands[0], operands[1], + ones, operands[2])); + DONE; + } +}) ;; ------------------------------------------------------------------------- ;; ---- [INT] Long binary arithmetic diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_3.c b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_3.c new file mode 100644 index 000000000000..c50a0ccc2260 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_3.c @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=armv8-a+sve2p1 -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/ +/* { 
dg-final { check-function-bodies "**" "" } } */ + +inline char char_abs(char i) { + return (i < 0 ? -i : i); +} + +/* +** foo_int: +** ... +** sub z[0-9]+.b, z[0-9]+.b, z[0-9]+.b +** udot z[0-9]+.s, z[0-9]+.b, z[0-9]+.b +** ... +*/ +int foo_int(unsigned char *x, unsigned char * restrict y) { + int sum = 0; + for (int i = 0; i < 8000; i++) + sum += char_abs(x[i] - y[i]); + return sum; +} + +/* +** foo2_int: +** ... +** udot z[0-9]+.s, z[0-9]+.h, z[0-9]+.h +** ... +*/ +int foo2_int(unsigned short *x, unsigned short * restrict y) { + int sum = 0; + for (int i = 0; i < 8000; i++) + { + x[i] = x[i] + y[i]; + sum += x[i]; + } + return sum; +} + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_4.c b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_4.c new file mode 100644 index 000000000000..cfa232fc3fc1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_4.c @@ -0,0 +1,81 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve2p1_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve2p1 -mautovec-preference=sve-only -fdump-tree-vect-details" }*/ + +inline char char_abs(char i) { + return (i < 0 ? 
-i : i); +} + +__attribute__((noipa)) +int foo_int(unsigned char *x, unsigned char * restrict y) { + int sum = 0; + for (int i = 0; i < 100; i++) + sum += char_abs(x[i] - y[i]); + return sum; +} + +__attribute__((noipa)) +int foo2_int(unsigned short *x, unsigned short * restrict y, + unsigned short * restrict z) { + int sum = 0; + for (int i = 0; i < 100; i++) + { + z[i] = x[i] + y[i]; + sum += z[i]; + } + return sum; +} + +__attribute__((noipa)) +int foo_int2(unsigned char *x, unsigned char * restrict y) { + int sum = 0; +#pragma GCC novector + for (int i = 0; i < 100; i++) + sum += char_abs(x[i] - y[i]); + return sum; +} + +__attribute__((noipa)) +int foo2_int2(unsigned short *x, unsigned short * restrict y, + unsigned short * restrict z) { + int sum = 0; +#pragma GCC novector + for (int i = 0; i < 100; i++) + { + z[i] = x[i] + y[i]; + sum += z[i]; + } + return sum; +} + +int main () +{ + unsigned short a[100]; + unsigned short b[100]; + unsigned short r1[100]; + unsigned short r2[100]; + unsigned char c[100]; + unsigned char d[100]; +#pragma GCC novector + for (int i = 0; i < 100; i++) + { + a[i] = c[i] = i; + b[i] = d[i] = 100 - i; + } + + if (foo_int (c, d) != foo_int2 (c, d)) + __builtin_abort(); + + + if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2)) + __builtin_abort(); + +#pragma GCC novector + for (int i = 0; i < 100; i++) + if (r1[i] != r2[i]) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ \ No newline at end of file
