This patch implements support for using dot product to do sum reductions by
changing += a into += (a * 1), i.e. we seed the multiplication with 1.
Given the example
int foo_int(unsigned char *x, unsigned char * restrict y) {
int sum = 0;
for (int i = 0; i < 8000; i++)
sum += char_abs(x[i] - y[i]);
return sum;
}
we used to generate
.L2:
ld1b z1.b, p7/z, [x0, x2]
ld1b z29.b, p7/z, [x1, x2]
sub z29.b, z1.b, z29.b
uunpklo z0.h, z29.b
uunpkhi z29.h, z29.b
uunpklo z30.s, z0.h
add z31.s, p6/m, z31.s, z30.s
uunpkhi z0.s, z0.h
add z31.s, p5/m, z31.s, z0.s
uunpklo z28.s, z29.h
add z31.s, p4/m, z31.s, z28.s
uunpkhi z29.s, z29.h
add z31.s, p3/m, z31.s, z29.s
add x2, x2, x7
whilelo p7.b, w2, w3
whilelo p3.s, w2, w6
whilelo p4.s, w2, w5
whilelo p5.s, w2, w4
whilelo p6.s, w2, w3
b.any .L2
ptrue p7.b, all
uaddv d31, p7, z31.s
but with +dotprod we now generate
.L3:
ld1b z30.b, p7/z, [x5, x2]
ld1b z29.b, p7/z, [x1, x2]
sub z30.b, z30.b, z29.b
udot z31.s, z30.b, z28.b
mov x3, x2
add x2, x2, x6
cmp w2, w0
bls .L3
incb x3
uaddv d31, p7, z31.s
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
PR middle-end/122069
* config/aarch64/aarch64-sve.md (widen_<sur>sum<mode><vsi2qi>3): New.
gcc/testsuite/ChangeLog:
PR middle-end/122069
* gcc.target/aarch64/sve/pr122069_1.c: New test.
* gcc.target/aarch64/sve/pr122069_2.c: New test.
---
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 8c47d441c3fd6a70f0d2ef5a26883733a9fd36c1..29ef5cf990573fc9ff4bad8901f8f5004f985f36 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -7722,6 +7722,20 @@ (define_insn "@aarch64_<sur>dot_prod_lane<VNx4SI_ONLY:mode><VNx16QI_ONLY:mode>"
[(set_attr "sve_type" "sve_int_dot")]
)
+;; Define double widen_[su]sum as dotproduct
+(define_expand "widen_<sur>sum<mode><vsi2qi>3"
+ [(set (match_operand:SVE_FULL_SDI 0 "register_operand")
+ (plus:SVE_FULL_SDI
+ (unspec:SVE_FULL_SDI
+ [(match_operand:<VSI2QI> 1 "register_operand")
+ (match_dup 3)]
+ DOTPROD)
+ (match_operand:SVE_FULL_SDI 2 "register_operand")))]
+ "TARGET_SVE"
+{
+ operands[3] = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+})
+
;; -------------------------------------------------------------------------
;; ---- [INT] Sum of absolute differences
;; -------------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122069_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..5d1f61f4a6a8d02c190aeb96cb145a3a1ca1cd20
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_1.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+/* { dg-final { check-function-bodies "**" "" } } */
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+/*
+** foo_int:
+** ...
+** sub z[0-9]+.b, z[0-9]+.b, z[0-9]+.b
+** udot z[0-9]+.s, z[0-9]+.b, z[0-9]+.b
+** ...
+*/
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+/*
+** foo2_int:
+** ...
+** add z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
+** punpklo p[0-9]+.h, p[0-9]+.b
+** uunpklo z[0-9]+.s, z[0-9]+.h
+** add z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+** punpkhi p[0-9]+.h, p[0-9]+.b
+** uunpkhi z[0-9]+.s, z[0-9]+.h
+** add z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+int foo2_int(unsigned short *x, unsigned short * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ {
+ x[i] = x[i] + y[i];
+ sum += x[i];
+ }
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122069_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..62f7efde16811a282c1feffb97f0a229cd40482c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_2.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_sve_hw } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+__attribute__((noipa))
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+__attribute__((noipa))
+int foo_int2(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int2(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+int main ()
+{
+ unsigned short a[100];
+ unsigned short b[100];
+ unsigned short r1[100];
+ unsigned short r2[100];
+ unsigned char c[100];
+ unsigned char d[100];
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ a[i] = c[i] = i;
+ b[i] = d[i] = 100 - i;
+ }
+
+ if (foo_int (c, d) != foo_int2 (c, d))
+ __builtin_abort();
+
+
+ if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
+ __builtin_abort();
+
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ if (r1[i] != r2[i])
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
\ No newline at end of file
--
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 8c47d441c3fd6a70f0d2ef5a26883733a9fd36c1..29ef5cf990573fc9ff4bad8901f8f5004f985f36 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -7722,6 +7722,20 @@ (define_insn "@aarch64_<sur>dot_prod_lane<VNx4SI_ONLY:mode><VNx16QI_ONLY:mode>"
[(set_attr "sve_type" "sve_int_dot")]
)
+;; Define double widen_[su]sum as dotproduct
+(define_expand "widen_<sur>sum<mode><vsi2qi>3"
+ [(set (match_operand:SVE_FULL_SDI 0 "register_operand")
+ (plus:SVE_FULL_SDI
+ (unspec:SVE_FULL_SDI
+ [(match_operand:<VSI2QI> 1 "register_operand")
+ (match_dup 3)]
+ DOTPROD)
+ (match_operand:SVE_FULL_SDI 2 "register_operand")))]
+ "TARGET_SVE"
+{
+ operands[3] = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+})
+
;; -------------------------------------------------------------------------
;; ---- [INT] Sum of absolute differences
;; -------------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122069_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..5d1f61f4a6a8d02c190aeb96cb145a3a1ca1cd20
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_1.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+/* { dg-final { check-function-bodies "**" "" } } */
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+/*
+** foo_int:
+** ...
+** sub z[0-9]+.b, z[0-9]+.b, z[0-9]+.b
+** udot z[0-9]+.s, z[0-9]+.b, z[0-9]+.b
+** ...
+*/
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+/*
+** foo2_int:
+** ...
+** add z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
+** punpklo p[0-9]+.h, p[0-9]+.b
+** uunpklo z[0-9]+.s, z[0-9]+.h
+** add z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+** punpkhi p[0-9]+.h, p[0-9]+.b
+** uunpkhi z[0-9]+.s, z[0-9]+.h
+** add z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+int foo2_int(unsigned short *x, unsigned short * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ {
+ x[i] = x[i] + y[i];
+ sum += x[i];
+ }
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122069_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..62f7efde16811a282c1feffb97f0a229cd40482c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_2.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_sve_hw } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+__attribute__((noipa))
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+__attribute__((noipa))
+int foo_int2(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int2(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+int main ()
+{
+ unsigned short a[100];
+ unsigned short b[100];
+ unsigned short r1[100];
+ unsigned short r2[100];
+ unsigned char c[100];
+ unsigned char d[100];
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ a[i] = c[i] = i;
+ b[i] = d[i] = 100 - i;
+ }
+
+ if (foo_int (c, d) != foo_int2 (c, d))
+ __builtin_abort();
+
+
+ if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
+ __builtin_abort();
+
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ if (r1[i] != r2[i])
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
\ No newline at end of file