https://gcc.gnu.org/g:9b3606d2c0a8f6adfaf5b912d3b015bf4e4dd514
commit r16-4559-g9b3606d2c0a8f6adfaf5b912d3b015bf4e4dd514 Author: Tamar Christina <[email protected]> Date: Wed Oct 22 10:50:35 2025 +0100 AArch64: Add support for boolean reductions for SVE The vectorizer has learned how to do boolean reductions of masks to a C bool for the operations OR, XOR and AND. This implements the new optabs for SVE. For SVE, the & and | cases use the CC registers: or_reduc: ptest p0, p0.b cset w0, any and_reduc: ptrue p3.b, all nots p3.b, p3/z, p0.b cset w0, none For the ^ case we count the active predicate lanes and take the low bit of the count, i.e. the result is set when the number of active lanes is odd: xor_reduc: ptrue p3.b, all cntp x0, p3, p0.b and w0, w0, 1 gcc/ChangeLog: * config/aarch64/aarch64-sve.md (reduc_sbool_and_scal_<mode>, reduc_sbool_ior_scal_<mode>, reduc_sbool_xor_scal_<mode>): New. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/vect-reduc-bool-1.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-2.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-3.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-4.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-5.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-6.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-7.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-8.c: New test. * gcc.target/aarch64/sve/vect-reduc-bool-9.c: New test. 
Diff: --- gcc/config/aarch64/aarch64-sve.md | 95 ++++++++++++++++++++++ .../gcc.target/aarch64/sve/vect-reduc-bool-1.c | 52 ++++++++++++ .../gcc.target/aarch64/sve/vect-reduc-bool-2.c | 52 ++++++++++++ .../gcc.target/aarch64/sve/vect-reduc-bool-3.c | 52 ++++++++++++ .../gcc.target/aarch64/sve/vect-reduc-bool-4.c | 52 ++++++++++++ .../gcc.target/aarch64/sve/vect-reduc-bool-5.c | 50 ++++++++++++ .../gcc.target/aarch64/sve/vect-reduc-bool-6.c | 50 ++++++++++++ .../gcc.target/aarch64/sve/vect-reduc-bool-7.c | 50 ++++++++++++ .../gcc.target/aarch64/sve/vect-reduc-bool-8.c | 50 ++++++++++++ .../gcc.target/aarch64/sve/vect-reduc-bool-9.c | 59 ++++++++++++++ 10 files changed, 562 insertions(+) diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 550ff0a3cde6..047c16f974ac 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -135,6 +135,7 @@ ;; ---- [INT,FP] Conditional reductions ;; ---- [INT] Tree reductions ;; ---- [FP] Tree reductions +;; ---- [Predicate] Tree reductions ;; ---- [FP] Left-to-right reductions ;; ;; == Permutes @@ -9887,6 +9888,100 @@ [(set_attr "sve_type" "sve_fp_reduc")] ) +;; ------------------------------------------------------------------------- +;; ---- [Predicate] Tree reductions +;; ------------------------------------------------------------------------- +;; Includes: +;; - IORV +;; - XORV +;; - ANDV +;; ------------------------------------------------------------------------- + +;; Unpredicated predicate AND tree reductions. +;; Invert the predicate and check across all lanes +;; that the Zero flag is set. 
+;; +;; ptrue p3.b, all +;; nots p3.b, p3/z, p0.b +;; cset w0, none +;; +(define_expand "reduc_sbool_and_scal_<mode>" + [(set (match_operand:QI 0 "register_operand") + (unspec:QI [(match_operand:PRED_ALL 1 "register_operand")] + UNSPEC_ANDV))] + "TARGET_SVE" + { + rtx ptrue = force_reg (VNx16BImode, aarch64_ptrue_all (<data_bytes>)); + rtx cast_ptrue = gen_lowpart (<MODE>mode, ptrue); + rtx tmp = gen_reg_rtx (<MODE>mode); + emit_insn (gen_aarch64_pred_one_cmpl_z (<MODE>mode, tmp, cast_ptrue, + operands[1])); + emit_insn ( + gen_aarch64_ptest<mode> (ptrue, cast_ptrue, + gen_int_mode (SVE_KNOWN_PTRUE, SImode), + tmp)); + rtx cc_reg = gen_rtx_REG (CC_NZCmode, CC_REGNUM); + rtx cmp = gen_rtx_fmt_ee (EQ, SImode, cc_reg, const0_rtx); + rtx tmp2 = gen_reg_rtx (SImode); + emit_insn (gen_aarch64_cstoresi (tmp2, cmp, cc_reg)); + emit_move_insn (operands[0], gen_lowpart (QImode, tmp2)); + DONE; + } +) + +;; Unpredicated predicate IOR tree reductions. +;; We need to make sure the results are in the CC flags, so execute a ptest +;; on the same predicate. +;; +;; ptest p0, p0.b +;; cset w0, any +;; +(define_expand "reduc_sbool_ior_scal_<mode>" + [(set (match_operand:QI 0 "register_operand") + (unspec:QI [(match_operand:PRED_ALL 1 "register_operand")] + UNSPEC_IORV))] + "TARGET_SVE" + { + rtx ptrue = lowpart_subreg (VNx16BImode, operands[1], <MODE>mode); + emit_insn ( + gen_aarch64_ptest<mode> (ptrue, operands[1], + gen_int_mode (SVE_MAYBE_NOT_PTRUE, SImode), + operands[1])); + rtx cc_reg = gen_rtx_REG (CC_NZCmode, CC_REGNUM); + rtx cmp = gen_rtx_fmt_ee (NE, SImode, cc_reg, const0_rtx); + rtx tmp = gen_reg_rtx (SImode); + emit_insn (gen_aarch64_cstoresi (tmp, cmp, cc_reg)); + emit_move_insn (operands[0], gen_lowpart (QImode, tmp)); + DONE; + } +) + +;; Unpredicated predicate XOR tree reductions. +;; Check to see if the number of active lanes in the predicates is a multiple +;; of 2. 
This generates: +;; +;; cntp x0, p0, p0.b +;; and w0, w0, 1 +;; +(define_expand "reduc_sbool_xor_scal_<mode>" + [(set (match_dup 2) + (zero_extend:DI + (unspec:SI [(match_dup 1) + (const_int SVE_MAYBE_NOT_PTRUE) + (match_operand:PRED_ALL 1 "register_operand")] + UNSPEC_CNTP))) + (set (match_dup 4) + (and:DI (match_dup 2) + (const_int 1))) + (set (match_operand:QI 0 "register_operand") + (subreg:QI (match_dup 4) 0))] + "TARGET_SVE" + { + operands[2] = gen_reg_rtx (DImode); + operands[4] = gen_reg_rtx (DImode); + } +) + ;; ------------------------------------------------------------------------- ;; ---- [FP] Left-to-right reductions ;; ------------------------------------------------------------------------- diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-1.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-1.c new file mode 100644 index 000000000000..177a7ddeeb0c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-1.c @@ -0,0 +1,52 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/ + +char p[128]; + +bool __attribute__((noipa)) +fand (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r &= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fior (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r |= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fand (n)) + __builtin_abort (); + + p[0] = 0; + for (int n = 1; n < 77; ++n) + if (fand (n)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fior (n)) + __builtin_abort (); + + p[0] = 1; + for (int n = 1; n < 77; ++n) + if (!fior (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-2.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-2.c new file mode 100644 index 000000000000..dd6e3939175f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-2.c @@ -0,0 +1,52 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/ + +short p[128]; + +bool __attribute__((noipa)) +fand (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r &= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fior (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r |= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fand (n)) + __builtin_abort (); + + p[0] = 0; + for (int n = 1; n < 77; ++n) + if (fand (n)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fior (n)) + __builtin_abort (); + + p[0] = 1; + for (int n = 1; n < 77; ++n) + if (!fior (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-3.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-3.c new file mode 100644 index 000000000000..cae2ac8c7ee2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-3.c @@ -0,0 +1,52 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/ + +int p[128]; + +bool __attribute__((noipa)) +fand (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r &= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fior (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r |= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, 
sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fand (n)) + __builtin_abort (); + + p[0] = 0; + for (int n = 1; n < 77; ++n) + if (fand (n)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fior (n)) + __builtin_abort (); + + p[0] = 1; + for (int n = 1; n < 77; ++n) + if (!fior (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-4.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-4.c new file mode 100644 index 000000000000..3526d8ce6414 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-4.c @@ -0,0 +1,52 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/ + +long long p[128]; + +bool __attribute__((noipa)) +fand (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r &= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fior (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r |= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fand (n)) + __builtin_abort (); + + p[0] = 0; + for (int n = 1; n < 77; ++n) + if (fand (n)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fior (n)) + __builtin_abort (); + + p[0] = 1; + for (int n = 1; n < 77; ++n) + if (!fior (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-5.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-5.c new file mode 100644 index 000000000000..b6477af8e13e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-5.c @@ -0,0 +1,50 @@ +/* { dg-do run 
} */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/ + +char p[128]; + +bool __attribute__((noipa)) +fxort (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fxorf (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fxort (n) != !(n & 1)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n) != (n & 1)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fxort (n)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-6.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-6.c new file mode 100644 index 000000000000..7333aa4b32f9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-6.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/ + +short p[128]; + +bool __attribute__((noipa)) +fxort (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fxorf (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fxort (n) != !(n & 1)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n) != (n & 1)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fxort (n)) + 
__builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-7.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-7.c new file mode 100644 index 000000000000..a28ee165f841 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-7.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/ + +int p[128]; + +bool __attribute__((noipa)) +fxort (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fxorf (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fxort (n) != !(n & 1)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n) != (n & 1)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fxort (n)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-8.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-8.c new file mode 100644 index 000000000000..71695b5b43fc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-8.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_sve_hw } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/ + +long long p[128]; + +bool __attribute__((noipa)) +fxort (int n) +{ + bool r = true; + for (int i = 0; i < n; 
++i) + r ^= (p[i] != 0); + return r; +} + +bool __attribute__((noipa)) +fxorf (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +int main() +{ + __builtin_memset (p, 1, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (fxort (n) != !(n & 1)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n) != (n & 1)) + __builtin_abort (); + + __builtin_memset (p, 0, sizeof(p)); + + for (int n = 0; n < 77; ++n) + if (!fxort (n)) + __builtin_abort (); + + for (int n = 0; n < 77; ++n) + if (fxorf (n)) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { target { vect_int && vect_condition } } } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-9.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-9.c new file mode 100644 index 000000000000..ebccb8de9cf9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-9.c @@ -0,0 +1,59 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/ +/* { dg-final { check-function-bodies "**" "" } } */ + +char p[128]; + +/* +** fand: +** ... +** ptrue p[0-9]+.s, all +** nots p[0-9]+.b, p[0-9]+/z, p[0-9]+.b +** cset w[0-9]+, none +** and w[0-9]+, w[0-9]+, w[0-9]+ +** ... +*/ +bool __attribute__((noipa)) +fand (int n) +{ + bool r = true; + for (int i = 0; i < n; ++i) + r &= (p[i] != 0); + return r; +} + +/* +** fior: +** ... +** ptest p[0-9]+, p[0-9]+.b +** cset w[0-9]+, any +** orr w[0-9]+, w[0-9]+, w[0-9]+ +** ... +*/ +bool __attribute__((noipa)) +fior (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r |= (p[i] != 0); + return r; +} + +/* +** fxor: +** ... +** cntp x[0-9]+, p[0-9]+, p[0-9]+.h +** and w[0-9]+, w[0-9]+, 1 +** eor w[0-9]+, w[0-9]+, w[0-9]+ +** ... 
+*/ +bool __attribute__((noipa)) +fxor (int n) +{ + bool r = false; + for (int i = 0; i < n; ++i) + r ^= (p[i] != 0); + return r; +} + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 3 "vect" } } */
