This patch implements support for using dot-product instructions to do sum
reductions by changing += a into += (a * 1), i.e. we seed the multiplication
with a vector of ones.
Given the example
int foo_int(unsigned char *x, unsigned char * restrict y) {
int sum = 0;
for (int i = 0; i < 8000; i++)
sum += char_abs(x[i] - y[i]);
return sum;
}
we used to generate
.L2:
ldr q0, [x0, x2]
ldr q28, [x1, x2]
sub v28.16b, v0.16b, v28.16b
zip1 v29.16b, v28.16b, v31.16b
zip2 v28.16b, v28.16b, v31.16b
uaddw v30.4s, v30.4s, v29.4h
uaddw2 v30.4s, v30.4s, v29.8h
uaddw v30.4s, v30.4s, v28.4h
uaddw2 v30.4s, v30.4s, v28.8h
add x2, x2, 16
cmp x2, x3
bne .L2
addv s31, v30.4s
but now generate with +dotprod
.L2:
ldr q29, [x0, x2]
ldr q28, [x1, x2]
sub v28.16b, v29.16b, v28.16b
udot v31.4s, v28.16b, v30.16b
add x2, x2, 16
cmp x2, x3
bne .L2
addv s31, v31.4s
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
PR middle-end/122069
* config/aarch64/aarch64-simd.md (widen_ssum<mode><vsi2qi>3): New.
(widen_usum<mode><vsi2qi>3): New.
gcc/testsuite/ChangeLog:
PR middle-end/122069
* gcc.target/aarch64/pr122069_3.c: New test.
* gcc.target/aarch64/pr122069_4.c: New test.
---
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index d4a7912a11aef0a00385d544307feee40e86754f..5ee7daf775c17b3e1d49b423f3722dd08dca1d89 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4703,6 +4703,34 @@ (define_expand "widen_usum<mode><Vwide>3"
DONE;
})
+(define_expand "widen_ssum<mode><vsi2qi>3"
+ [(set (match_operand:VS 0 "register_operand")
+ (plus:VS (sign_extend:VS
+ (match_operand:<VSI2QI> 1 "register_operand"))
+ (match_operand:VS 2 "register_operand")))]
+ "TARGET_DOTPROD"
+ {
+ rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+ emit_insn (gen_sdot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
+ operands[2]));
+ DONE;
+ }
+)
+
+(define_expand "widen_usum<mode><vsi2qi>3"
+ [(set (match_operand:VS 0 "register_operand")
+ (plus:VS (zero_extend:VS
+ (match_operand:<VSI2QI> 1 "register_operand"))
+ (match_operand:VS 2 "register_operand")))]
+ "TARGET_DOTPROD"
+ {
+ rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+ emit_insn (gen_udot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
+ operands[2]));
+ DONE;
+ }
+)
+
(define_insn "aarch64_<ANY_EXTEND:su>subw<mode>"
[(set (match_operand:<VWIDE> 0 "register_operand" "=w")
(minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_3.c b/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a7c53ead4bf61243ec8879deb2bf2b39b6a4cd83
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+dotprod -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+/* { dg-final { check-function-bodies "**" "" } } */
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+/*
+** foo_int:
+** ...
+** sub v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** udot v[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+/*
+** foo2_int:
+** ...
+** add v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** uaddw2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
+** ...
+*/
+int foo2_int(unsigned short *x, unsigned short * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ {
+ x[i] = x[i] + y[i];
+ sum += x[i];
+ }
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_4.c b/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..462d7d3124b1f92f89d4ea55e289b51d36ac7cb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
@@ -0,0 +1,80 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a+dotprod -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+__attribute__((noipa))
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+__attribute__((noipa))
+int foo_int2(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int2(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+int main ()
+{
+ unsigned short a[100];
+ unsigned short b[100];
+ unsigned short r1[100];
+ unsigned short r2[100];
+ unsigned char c[100];
+ unsigned char d[100];
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ a[i] = c[i] = i;
+ b[i] = d[i] = 100 - i;
+ }
+
+ if (foo_int (c, d) != foo_int2 (c, d))
+ __builtin_abort();
+
+
+ if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
+ __builtin_abort();
+
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ if (r1[i] != r2[i])
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
\ No newline at end of file
--
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index d4a7912a11aef0a00385d544307feee40e86754f..5ee7daf775c17b3e1d49b423f3722dd08dca1d89 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4703,6 +4703,34 @@ (define_expand "widen_usum<mode><Vwide>3"
DONE;
})
+(define_expand "widen_ssum<mode><vsi2qi>3"
+ [(set (match_operand:VS 0 "register_operand")
+ (plus:VS (sign_extend:VS
+ (match_operand:<VSI2QI> 1 "register_operand"))
+ (match_operand:VS 2 "register_operand")))]
+ "TARGET_DOTPROD"
+ {
+ rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+ emit_insn (gen_sdot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
+ operands[2]));
+ DONE;
+ }
+)
+
+(define_expand "widen_usum<mode><vsi2qi>3"
+ [(set (match_operand:VS 0 "register_operand")
+ (plus:VS (zero_extend:VS
+ (match_operand:<VSI2QI> 1 "register_operand"))
+ (match_operand:VS 2 "register_operand")))]
+ "TARGET_DOTPROD"
+ {
+ rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+ emit_insn (gen_udot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
+ operands[2]));
+ DONE;
+ }
+)
+
(define_insn "aarch64_<ANY_EXTEND:su>subw<mode>"
[(set (match_operand:<VWIDE> 0 "register_operand" "=w")
(minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_3.c b/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a7c53ead4bf61243ec8879deb2bf2b39b6a4cd83
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+dotprod -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+/* { dg-final { check-function-bodies "**" "" } } */
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+/*
+** foo_int:
+** ...
+** sub v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** udot v[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+/*
+** foo2_int:
+** ...
+** add v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** uaddw2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
+** ...
+*/
+int foo2_int(unsigned short *x, unsigned short * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ {
+ x[i] = x[i] + y[i];
+ sum += x[i];
+ }
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_4.c b/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..462d7d3124b1f92f89d4ea55e289b51d36ac7cb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
@@ -0,0 +1,80 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a+dotprod -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+__attribute__((noipa))
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+__attribute__((noipa))
+int foo_int2(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int2(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+int main ()
+{
+ unsigned short a[100];
+ unsigned short b[100];
+ unsigned short r1[100];
+ unsigned short r2[100];
+ unsigned char c[100];
+ unsigned char d[100];
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ a[i] = c[i] = i;
+ b[i] = d[i] = 100 - i;
+ }
+
+ if (foo_int (c, d) != foo_int2 (c, d))
+ __builtin_abort();
+
+
+ if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
+ __builtin_abort();
+
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ if (r1[i] != r2[i])
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
\ No newline at end of file