[match.pd] Mid-end fix for r277110
Hi, SVE2 vectorization for BSL and NBSL fails when the element type is unsigned 8/16-bit. The operands are being converted implicitly to corresponding signed types, which the mid-end fold pattern does not take into account; this patch augments the pattern with type conversion checks in order to rectify the above problem. #define TYPE uint{8,16}_t void foo (TYPE *a, TYPE *b, TYPE *c, TYPE *d, int n) { for (int i = 0; i < n; i++) a[i] = OP (b[i], c[i], d[i]); } BSL: // #define OP(x,y,z) (((x) & (z)) | ((y) & ~(z))) beforeand z1.d, z2.d, z1.d bic z0.d, z0.d, z2.d orr z0.d, z0.d, z1.d ... after bsl z0.d, z0.d, z1.d, z2.d NBSL: // #define OP(x,y,z) ~(((x) & (z)) | ((y) & ~(z))) beforeand z1.d, z2.d, z1.d bic z0.d, z0.d, z2.d orr z0.d, z0.d, z1.d not z0.{b,h}, p1/m, z0.{b,h} ... after nbslz0.d, z0.d, z1.d, z2.d The GIMPLE output for BSL shows where conversions could be inserted: _1 = b[i]; _2 = d[i]; _3 = _1 & _2; _4 = (signed short) _3; _5 = c[i]; _6 = (signed short) _5; _7 = d[i]; _8 = (signed short) _7; _9 = ~_8; _10 = _6 & _9; _11 = _4 | _10; _12 = (short unsigned int) _11; a[i] = _12; In contrast, for 32/64-bit types (regardless of signedness): _1 = b[i]; _2 = d[i]; _3 = _1 & _2; _4 = c[i]; _5 = d[i]; _6 = ~_5; _7 = _4 & _6; _8 = _3 | _7; _9 = ~_8; a[i] = _9; Built and tested on aarch64-none-elf. Regards, Yuliang Wang gcc/ChangeLog: 2019-10-17 Yuliang Wang * match.pd (/* (x & ~m) | (y & m) -> ... */): Modified fold pattern. * genmatch.c (convert3): New convert operation to support the above. gcc/testsuite/ChangeLog: 2019-10-17 Yuliang Wang * gcc.target/aarch64/sve2/bitsel_1.c: Add testing for unsigned types. * gcc.target/aarch64/sve2/bitsel_2.c: As above. * gcc.target/aarch64/sve2/bitsel_3.c: As above. * gcc.target/aarch64/sve2/bitsel_4.c: As above. * gcc.target/aarch64/sve2/eor3_1.c: As above. 
diff --git a/gcc/genmatch.c b/gcc/genmatch.c index 7db1f135840e09e794e2921859fa8e9b7fa8..ce87ae33e0b3c06f4d1fde8d8e74bf2210ee7a5a 100644 --- a/gcc/genmatch.c +++ b/gcc/genmatch.c @@ -227,6 +227,7 @@ enum tree_code { CONVERT0, CONVERT1, CONVERT2, +CONVERT3, VIEW_CONVERT0, VIEW_CONVERT1, VIEW_CONVERT2, @@ -1176,6 +1177,7 @@ lower_opt_convert (operand *o) = { CONVERT0, CONVERT_EXPR, CONVERT1, CONVERT_EXPR, CONVERT2, CONVERT_EXPR, + CONVERT3, CONVERT_EXPR, VIEW_CONVERT0, VIEW_CONVERT_EXPR, VIEW_CONVERT1, VIEW_CONVERT_EXPR, VIEW_CONVERT2, VIEW_CONVERT_EXPR }; @@ -4145,8 +4147,8 @@ parser::record_operlist (location_t loc, user_id *p) } } -/* Parse the operator ID, special-casing convert?, convert1? and - convert2? */ +/* Parse the operator ID, special-casing convert?, convert1?, convert2? and + convert3? */ id_base * parser::parse_operation () @@ -4167,6 +4169,8 @@ parser::parse_operation () ; else if (strcmp (id, "convert2") == 0) ; + else if (strcmp (id, "convert3") == 0) + ; else if (strcmp (id, "view_convert") == 0) id = "view_convert0"; else if (strcmp (id, "view_convert1") == 0) @@ -4183,6 +4187,7 @@ parser::parse_operation () } else if (strcmp (id, "convert1") == 0 || strcmp (id, "convert2") == 0 + || strcmp (id, "convert3") == 0 || strcmp (id, "view_convert1") == 0 || strcmp (id, "view_convert2") == 0) fatal_at (id_tok, "expected '?' 
after conditional operator"); @@ -4723,9 +4728,9 @@ parser::parse_for (location_t) id_base *idb = get_operator (oper, true); if (idb == NULL) fatal_at (token, "no such operator '%s'", oper); - if (*idb == CONVERT0 || *idb == CONVERT1 || *idb == CONVERT2 - || *idb == VIEW_CONVERT0 || *idb == VIEW_CONVERT1 - || *idb == VIEW_CONVERT2) + if (*idb == CONVERT0 || *idb == VIEW_CONVERT0 + || *idb == CONVERT1 || *idb == CONVERT2|| *idb == CONVERT3 + || *idb == VIEW_CONVERT1 || *idb == VIEW_CONVERT2) fatal_at (token, "conditional operators cannot be used inside for"); if (arity == -1) @@ -5136,6 +5141,7 @@ main (int argc, char **argv) add_operator (CONVERT0, "convert0", "tcc_unary", 1); add_operator (CONVERT1, "convert1", "tcc_unary", 1); add_operator (CONVERT2, "convert2", "tcc_unary", 1); +add_operator (CONVERT3, "convert3", "tcc_unary", 1); add_operator (VIEW_CONVERT0, "view_convert0", "tcc_unary", 1); add_operator (VIEW_CONVERT1
[AArch64][SVE2] Fix for r277110 (BSL variants)
Hi, SVE2 vectorization for BSL and NBSL fails when the element type is unsigned 8/16-bit. The operands are being converted implicitly to corresponding signed types, which the mid-end fold pattern does not take into account; this patch augments the pattern with type conversion checks in order to rectify the above problem. #define TYPE uint{8,16}_t void foo (TYPE *a, TYPE *b, TYPE *c, TYPE *d, int n) { for (int i = 0; i < n; i++) a[i] = OP (b[i], c[i], d[i]); } BSL: // #define OP(x,y,z) (((x) & (z)) | ((y) & ~(z))) beforeand z1.d, z2.d, z1.d bic z0.d, z0.d, z2.d orr z0.d, z0.d, z1.d ... after bsl z0.d, z0.d, z1.d, z2.d NBSL: // #define OP(x,y,z) ~(((x) & (z)) | ((y) & ~(z))) beforeand z1.d, z2.d, z1.d bic z0.d, z0.d, z2.d orr z0.d, z0.d, z1.d not z0.{b,h}, p1/m, z0.{b,h} ... after nbslz0.d, z0.d, z1.d, z2.d The GIMPLE output for BSL shows where conversions could be inserted: _1 = b[i]; _2 = d[i]; _3 = _1 & _2; _4 = (signed short) _3; _5 = c[i]; _6 = (signed short) _5; _7 = d[i]; _8 = (signed short) _7; _9 = ~_8; _10 = _6 & _9; _11 = _4 | _10; _12 = (short unsigned int) _11; a[i] = _12; In contrast, for 32/64-bit types (regardless of signedness): _1 = b[i]; _2 = d[i]; _3 = _1 & _2; _4 = c[i]; _5 = d[i]; _6 = ~_5; _7 = _4 & _6; _8 = _3 | _7; _9 = ~_8; a[i] = _9; Built and tested on aarch64-none-elf. Regards, Yuliang Wang gcc/ChangeLog: 2019-10-17 Yuliang Wang * match.pd (/* (x & ~m) | (y & m) -> ... */): Modified fold pattern. * genmatch.c (convert3): New convert operation to support the above. gcc/testsuite/ChangeLog: 2019-10-17 Yuliang Wang * gcc.target/aarch64/sve2/bitsel_1.c: Add testing for unsigned types. * gcc.target/aarch64/sve2/bitsel_2.c: As above. * gcc.target/aarch64/sve2/bitsel_3.c: As above. * gcc.target/aarch64/sve2/bitsel_4.c: As above. * gcc.target/aarch64/sve2/eor3_1.c: As above. 
diff --git a/gcc/genmatch.c b/gcc/genmatch.c index 7db1f135840e09e794e2921859fa8e9b7fa8..ce87ae33e0b3c06f4d1fde8d8e74bf2210ee7a5a 100644 --- a/gcc/genmatch.c +++ b/gcc/genmatch.c @@ -227,6 +227,7 @@ enum tree_code { CONVERT0, CONVERT1, CONVERT2, +CONVERT3, VIEW_CONVERT0, VIEW_CONVERT1, VIEW_CONVERT2, @@ -1176,6 +1177,7 @@ lower_opt_convert (operand *o) = { CONVERT0, CONVERT_EXPR, CONVERT1, CONVERT_EXPR, CONVERT2, CONVERT_EXPR, + CONVERT3, CONVERT_EXPR, VIEW_CONVERT0, VIEW_CONVERT_EXPR, VIEW_CONVERT1, VIEW_CONVERT_EXPR, VIEW_CONVERT2, VIEW_CONVERT_EXPR }; @@ -4145,8 +4147,8 @@ parser::record_operlist (location_t loc, user_id *p) } } -/* Parse the operator ID, special-casing convert?, convert1? and - convert2? */ +/* Parse the operator ID, special-casing convert?, convert1?, convert2? and + convert3? */ id_base * parser::parse_operation () @@ -4167,6 +4169,8 @@ parser::parse_operation () ; else if (strcmp (id, "convert2") == 0) ; + else if (strcmp (id, "convert3") == 0) + ; else if (strcmp (id, "view_convert") == 0) id = "view_convert0"; else if (strcmp (id, "view_convert1") == 0) @@ -4183,6 +4187,7 @@ parser::parse_operation () } else if (strcmp (id, "convert1") == 0 || strcmp (id, "convert2") == 0 + || strcmp (id, "convert3") == 0 || strcmp (id, "view_convert1") == 0 || strcmp (id, "view_convert2") == 0) fatal_at (id_tok, "expected '?' 
after conditional operator"); @@ -4723,9 +4728,9 @@ parser::parse_for (location_t) id_base *idb = get_operator (oper, true); if (idb == NULL) fatal_at (token, "no such operator '%s'", oper); - if (*idb == CONVERT0 || *idb == CONVERT1 || *idb == CONVERT2 - || *idb == VIEW_CONVERT0 || *idb == VIEW_CONVERT1 - || *idb == VIEW_CONVERT2) + if (*idb == CONVERT0 || *idb == VIEW_CONVERT0 + || *idb == CONVERT1 || *idb == CONVERT2|| *idb == CONVERT3 + || *idb == VIEW_CONVERT1 || *idb == VIEW_CONVERT2) fatal_at (token, "conditional operators cannot be used inside for"); if (arity == -1) @@ -5136,6 +5141,7 @@ main (int argc, char **argv) add_operator (CONVERT0, "convert0", "tcc_unary", 1); add_operator (CONVERT1, "convert1", "tcc_unary", 1); add_operator (CONVERT2, "convert2", "tcc_unary", 1); +add_operator (CONVERT3, "convert3", "tcc_unary", 1); add_operator (VIEW_CONVERT0, "view_convert0", "tcc_unary", 1); add_operator (VIEW_CONVERT1
RE: [AArch64][SVE2] Support for EOR3 and variants of BSL
Thanks very much, updated. Regards, Yuliang gcc/ChangeLog: 2019-10-17 Yuliang Wang * config/aarch64/aarch64-sve2.md (aarch64_sve2_eor3) (aarch64_sve2_nor, aarch64_sve2_nand) (aarch64_sve2_bsl, aarch64_sve2_nbsl) (aarch64_sve2_bsl1n, aarch64_sve2_bsl2n): New combine patterns. * config/aarch64/iterators.md (BSL_DUP): New int iterator for the above. (bsl_1st, bsl_2nd, bsl_dup, bsl_mov): Attributes for the above. gcc/testsuite/ChangeLog: 2019-10-17 Yuliang Wang * gcc.target/aarch64/sve2/eor3_1.c: New test. * gcc.target/aarch64/sve2/nlogic_1.c: As above. * gcc.target/aarch64/sve2/nlogic_2.c: As above. * gcc.target/aarch64/sve2/bitsel_1.c: As above. * gcc.target/aarch64/sve2/bitsel_2.c: As above. * gcc.target/aarch64/sve2/bitsel_3.c: As above. * gcc.target/aarch64/sve2/bitsel_4.c: As above. diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index b018f5b0bc9b51edf831e2571f0f5a9af2210829..1158a76c49adc329d72a9eb9dbe6bf6f380f92c6 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -142,3 +142,188 @@ } ) +;; Unpredicated 3-way exclusive OR. +(define_insn "*aarch64_sve2_eor3" + [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w, ?&w") + (xor:SVE_I + (xor:SVE_I + (match_operand:SVE_I 1 "register_operand" "0, w, w, w") + (match_operand:SVE_I 2 "register_operand" "w, 0, w, w")) + (match_operand:SVE_I 3 "register_operand" "w, w, 0, w")))] + "TARGET_SVE2" + "@ + eor3\t%0.d, %0.d, %2.d, %3.d + eor3\t%0.d, %0.d, %1.d, %3.d + eor3\t%0.d, %0.d, %1.d, %2.d + movprfx\t%0, %1\;eor3\t%0.d, %0.d, %2.d, %3.d" + [(set_attr "movprfx" "*,*,*,yes")] +) + +;; Use NBSL for vector NOR. 
+(define_insn_and_rewrite "*aarch64_sve2_nor" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I + [(match_operand 3) + (and:SVE_I +(not:SVE_I + (match_operand:SVE_I 1 "register_operand" "%0, w")) +(not:SVE_I + (match_operand:SVE_I 2 "register_operand" "w, w")))] + UNSPEC_PRED_X))] + "TARGET_SVE2" + "@ + nbsl\t%0.d, %0.d, %2.d, %0.d + movprfx\t%0, %1\;nbsl\t%0.d, %0.d, %2.d, %0.d" + "&& !CONSTANT_P (operands[3])" + { +operands[3] = CONSTM1_RTX (mode); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Use NBSL for vector NAND. +(define_insn_and_rewrite "*aarch64_sve2_nand" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I + [(match_operand 3) + (ior:SVE_I +(not:SVE_I + (match_operand:SVE_I 1 "register_operand" "%0, w")) +(not:SVE_I + (match_operand:SVE_I 2 "register_operand" "w, w")))] + UNSPEC_PRED_X))] + "TARGET_SVE2" + "@ + nbsl\t%0.d, %0.d, %2.d, %2.d + movprfx\t%0, %1\;nbsl\t%0.d, %0.d, %2.d, %2.d" + "&& !CONSTANT_P (operands[3])" + { +operands[3] = CONSTM1_RTX (mode); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Unpredicated bitwise select. +;; (op3 ? bsl_mov : bsl_dup) == (((bsl_mov ^ bsl_dup) & op3) ^ bsl_dup) +(define_insn "*aarch64_sve2_bsl" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (xor:SVE_I + (and:SVE_I + (xor:SVE_I + (match_operand:SVE_I 1 "register_operand" ", w") + (match_operand:SVE_I 2 "register_operand" ", w")) + (match_operand:SVE_I 3 "register_operand" "w, w")) + (match_dup BSL_DUP)))] + "TARGET_SVE2" + "@ + bsl\t%0.d, %0.d, %.d, %3.d + movprfx\t%0, %\;bsl\t%0.d, %0.d, %.d, %3.d" + [(set_attr "movprfx" "*,yes")] +) + +;; Unpredicated bitwise inverted select. +;; (~(op3 ? 
bsl_mov : bsl_dup)) == (~(((bsl_mov ^ bsl_dup) & op3) ^ bsl_dup)) +(define_insn_and_rewrite "*aarch64_sve2_nbsl" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I + [(match_operand 4) + (not:SVE_I +(xor:SVE_I + (and:SVE_I +(xor:SVE_I + (match_operand:SVE_I 1 "register_operand" ", w") + (match_operand:SVE_I 2 "register_operand" ", w")) +(match_operand:SVE_I 3 "register_operand" "w, w")) + (match_dup
RE: [AArch64][SVE2] Support for EOR3 and variants of BSL
Hi Richard, Thanks for the suggestions, updated. Regards, Yuliang gcc/ChangeLog: 2019-10-17 Yuliang Wang * config/aarch64/aarch64-sve2.md (aarch64_sve2_eor3) (aarch64_sve2_nor, aarch64_sve2_nand) (aarch64_sve2_bsl, aarch64_sve2_nbsl) (aarch64_sve2_bsl1n, aarch64_sve2_bsl2n): New combine patterns. * config/aarch64/iterators.md (BSL_DUP): New int iterator for the above. (bsl_1st, bsl_2nd, bsl_dup, bsl_mov): Attributes for the above. * config/aarch64/aarch64.h (AARCH64_ISA_SVE2_SHA3): New ISA flag macro. (TARGET_SVE2_SHA3): New CPU target. gcc/testsuite/ChangeLog: 2019-10-17 Yuliang Wang * gcc.target/aarch64/sve2/eor3_1.c: New test. * gcc.target/aarch64/sve2/eor3_2.c: As above. * gcc.target/aarch64/sve2/nlogic_1.c: As above. * gcc.target/aarch64/sve2/nlogic_2.c: As above. * gcc.target/aarch64/sve2/bitsel_1.c: As above. * gcc.target/aarch64/sve2/bitsel_2.c: As above. * gcc.target/aarch64/sve2/bitsel_3.c: As above. * gcc.target/aarch64/sve2/bitsel_4.c: As above. diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index b018f5b0bc9b51edf831e2571f0f5a9af2210829..08d5214a3debb9e9a0796da0af3009ed3ff55774 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -142,3 +142,189 @@ } ) +;; Unpredicated 3-way exclusive OR. +(define_insn "*aarch64_sve2_eor3" + [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w, ?&w") + (xor:SVE_I + (xor:SVE_I + (match_operand:SVE_I 1 "register_operand" "0, w, w, w") + (match_operand:SVE_I 2 "register_operand" "w, 0, w, w")) + (match_operand:SVE_I 3 "register_operand" "w, w, 0, w")))] + "TARGET_SVE2_SHA3" + "@ + eor3\t%0.d, %0.d, %2.d, %3.d + eor3\t%0.d, %0.d, %1.d, %3.d + eor3\t%0.d, %0.d, %1.d, %2.d + movprfx\t%0, %1\;eor3\t%0.d, %0.d, %2.d, %3.d" + [(set_attr "movprfx" "*,*,*,yes")] +) + +;; Use NBSL for vector NOR. 
+(define_insn_and_rewrite "*aarch64_sve2_nor" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I + [(match_operand 3) + (and:SVE_I +(not:SVE_I + (match_operand:SVE_I 1 "register_operand" "%0, w")) +(not:SVE_I + (match_operand:SVE_I 2 "register_operand" "w, w")))] + UNSPEC_PRED_X))] + "TARGET_SVE2" + "@ + nbsl\t%0.d, %0.d, %2.d, %0.d + movprfx\t%0, %1\;nbsl\t%0.d, %0.d, %2.d, %0.d" + "&& !CONSTANT_P (operands[3])" + { +operands[3] = CONSTM1_RTX (mode); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Use NBSL for vector NAND. +(define_insn_and_rewrite "*aarch64_sve2_nand" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I + [(match_operand 3) + (ior:SVE_I +(not:SVE_I + (match_operand:SVE_I 1 "register_operand" "%0, w")) +(not:SVE_I + (match_operand:SVE_I 2 "register_operand" "w, w")))] + UNSPEC_PRED_X))] + "TARGET_SVE2" + "@ + nbsl\t%0.d, %0.d, %2.d, %2.d + movprfx\t%0, %1\;nbsl\t%0.d, %0.d, %2.d, %2.d" + "&& !CONSTANT_P (operands[3])" + { +operands[3] = CONSTM1_RTX (mode); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Unpredicated bitwise select. +;; N.B. non-canonical equivalent form due to expand pass. +;; (op3 ? bsl_mov : bsl_dup) == (((bsl_mov ^ bsl_dup) & op3) ^ bsl_dup) +(define_insn "*aarch64_sve2_bsl" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (xor:SVE_I + (and:SVE_I + (xor:SVE_I + (match_operand:SVE_I 1 "register_operand" ", w") + (match_operand:SVE_I 2 "register_operand" ", w")) + (match_operand:SVE_I 3 "register_operand" "w, w")) + (match_dup BSL_DUP)))] + "TARGET_SVE2" + "@ + bsl\t%0.d, %0.d, %.d, %3.d + movprfx\t%0, %\;bsl\t%0.d, %0.d, %.d, %3.d" + [(set_attr "movprfx" "*,yes")] +) + +;; Unpredicated bitwise inverted select. +;; N.B. non-canonical equivalent form. +;; (~(op3 ? 
bsl_mov : bsl_dup)) == (~(((bsl_mov ^ bsl_dup) & op3) ^ bsl_dup)) +(define_insn_and_rewrite "*aarch64_sve2_nbsl" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I + [(match_operand 4) + (not:SVE_I +(xor:SVE_I + (and:SVE_I +(xor
[AArch64][SVE2] Support for EOR3 and variants of BSL
Hi, This patch adds combine pass support for the following SVE2 bitwise logic instructions: - EOR3 (3-way vector exclusive OR) - BSL (bitwise select) - NBSL (inverted ") - BSL1N (" with first input inverted) - BSL2N (" with second input inverted) Example template snippet: void foo (TYPE *a, TYPE *b, TYPE *c, TYPE *d, int n) { for (int i = 0; i < n; i++) a[i] = OP (b[i], c[i], d[i]); } EOR3: // #define OP(x,y,z) ((x) ^ (y) ^ (z)) beforeeor z1.d, z1.d, z2.d eor z0.d, z0.d, z1.d ... after eor3z0.d, z0.d, z1.d, z2.d BSL: // #define OP(x,y,z) (((x) & (z)) | ((y) & ~(z))) beforeeor z0.d, z0.d, z1.d and z0.d, z0.d, z2.d eor z0.d, z0.d, z1.d ... after bsl z0.d, z0.d, z1.d, z2.d NBSL: // #define OP(x,y,z) ~(((x) & (z)) | ((y) & ~(z))) beforeeor z0.d, z0.d, z1.d and z0.d, z0.d, z2.d eor z0.d, z0.d, z1.d not z0.s, p1/m, z0.s ... after nbslz0.d, z0.d, z1.d, z2.d BSL1N: // #define OP(x,y,z) ((~(x) & (z)) | ((y) & ~(z))) beforeeor z0.d, z0.d, z1.d bic z0.d, z2.d, z0.d eor z0.d, z0.d, z1.d ... after bsl1n z0.d, z0.d, z1.d, z2.d BSL2N: // #define OP(x,y,z) (((x) & (z)) | (~(y) & ~(z))) beforeorr z0.d, z1.d, z0.d and z1.d, z1.d, z2.d not z0.s, p1/m, z0.s orr z0.d, z0.d, z1.d ... after bsl2n z0.d, z0.d, z1.d, z2.d Additionally, vector NOR and NAND operations are now optimized with NBSL: NOR x, y -> NBSL x, y, x NAND x, y -> NBSL x, y, y Built and tested on aarch64-none-elf. Best Regards, Yuliang Wang gcc/ChangeLog: 2019-10-16 Yuliang Wang * config/aarch64/aarch64-sve2.md (aarch64_sve2_eor3) (aarch64_sve2_nor, aarch64_sve2_nand) (aarch64_sve2_bsl, aarch64_sve2_nbsl) (aarch64_sve2_bsl1n, aarch64_sve2_bsl2n): New combine patterns. * config/aarch64/iterators.md (BSL_3RD): New int iterator for the above. (bsl_1st, bsl_2nd, bsl_3rd, bsl_mov): Attributes for the above. * config/aarch64/aarch64.h (AARCH64_ISA_SVE2_AES, AARCH64_ISA_SVE2_SM4) (AARCH64_ISA_SVE2_SHA3, AARCH64_ISA_SVE2_BITPERM): New ISA flag macros. 
(TARGET_SVE2_AES, TARGET_SVE2_SM4, TARGET_SVE2_SHA3) (TARGET_SVE2_BITPERM): New CPU targets. gcc/testsuite/ChangeLog: 2019-10-16 Yuliang Wang * gcc.target/aarch64/sve2/eor3_1.c: New test. * gcc.target/aarch64/sve2/eor3_2.c: As above. * gcc.target/aarch64/sve2/nlogic_1.c: As above. * gcc.target/aarch64/sve2/nlogic_2.c: As above. * gcc.target/aarch64/sve2/bitsel_1.c: As above. * gcc.target/aarch64/sve2/bitsel_2.c: As above. * gcc.target/aarch64/sve2/bitsel_3.c: As above. * gcc.target/aarch64/sve2/bitsel_4.c: As above. rb11975.patch Description: rb11975.patch
RE: [AArch64][SVE2] Shift-Right Accumulate combine patterns
Hi Christophe, Thanks for pointing this out, uploaded a fix. Regards, Yuliang From: Christophe Lyon Sent: 04 October 2019 09:25 To: Richard Sandiford Cc: Yuliang Wang ; gcc-patches@gcc.gnu.org; nd Subject: Re: [AArch64][SVE2] Shift-Right Accumulate combine patterns On Fri, 27 Sep 2019 at 10:12, Richard Sandiford mailto:richard.sandif...@arm.com>> wrote: Yuliang Wang mailto:yuliang.w...@arm.com>> writes: > Hi, > > This patch adds combining support for SVE2's shift-right accumulate > instructions. > > Example snippet: > > #define IMM ... > > void foo (TYPE a, TYPE b, int n) > { > for (int i = 0; i < n; i++) > a[i] += b[i] >> IMM; > } > > Signed: > > beforeasr z0.s, z0.s, #{IMM} > add z0.s, z0.s, z1.s > ... > after ssraz0.s, z1.s, #{IMM} > > Unsigned: > > beforelsr z0.s, z0.s, #{IMM} > add z0.s, z0.s, z1.s > ... > after usraz0.s, z1.s, #{IMM} > > Built and regression tested on aarch64-none-elf. > > Best Regards, > Yuliang Wang > > > gcc/ChangeLog: > > 2019-09-26 Yuliang Wang mailto:yuliang.w...@arm.com>> > > * config/aarch64/aarch64-sve2.md (aarch64_sve2_sra): > New combine pattern. > > gcc/testsuite/ChangeLog: > > 2019-09-26 Yuliang Wang mailto:yuliang.w...@arm.com>> > > * gcc.target/aarch64/sve2/shracc_1.c: New test. Thanks, applied as r276174. Hi, I've noticed that the new test fails with -mabi=ilp32 Christophe Richard
[AArch64][SVE2] Fix for new test in r276174
Hi, The new test added as part of r276174 fails on an ilp32 target, which this patch fixes. Thanks to Christophe Lyon for pointing this out. Regression tested on aarch64-none-elf. Best Regards, Yuliang Wang gcc/testsuite/ChangeLog: 2019-10-04 Yuliang Wang * gcc.target/aarch64/sve2/shracc_1.c: Amended test. rb11915.patch Description: rb11915.patch
RE: [AArch64][SVE] Utilize ASRD instruction for division and remainder
Thanks for the corrections, updated. Regards Yuliang (no ChangeLog updates) -Original Message- From: Richard Sandiford Sent: 27 September 2019 11:20 To: Yuliang Wang Cc: gcc-patches@gcc.gnu.org; nd Subject: Re: [AArch64][SVE] Utilize ASRD instruction for division and remainder Yuliang Wang writes: > +;; Unpredicated arithmetic right shift for division by power-of-2. > +(define_expand "sdiv_pow23" > + [(set (match_operand:SVE_I 0 "register_operand" "") > + (unspec:SVE_I > + [(match_dup 3) > +(unspec:SVE_I > + [(match_operand:SVE_I 1 "register_operand" "") > + (match_operand 2 "aarch64_simd_rshift_imm")] > + UNSPEC_ASRD)] > + UNSPEC_PRED_X))] > + "TARGET_SVE" > + { > +operands[3] = aarch64_ptrue_reg (mode); > + } > +) Sorry for not noticing last time, but: define_expands shouldn't have constraints, so it's better to drop the empty "" from the match_operands. > diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi index > 4ace224a8ff5ed4fafed10a69ef00ffb2d7d8c39..009b8f8db74c7a3bef996ceaba58 > 123f6558221c 100644 > --- a/gcc/doc/sourcebuild.texi > +++ b/gcc/doc/sourcebuild.texi > @@ -1446,6 +1446,10 @@ of bytes. > Target supports both signed and unsigned > multiply-high-with-round-and-scale > operations on vectors of half-words. > > +@item vect_sdiv_pow2_si > +Target supports signed division by constant power-of-2 operations on > +vectors of words. "4 bytes" is more accurate than "words". The problem with "word" is that it depends on context; e.g. although the AArch64 ISA uses "word" to mean 32 bits, its words are really 64 bits as far as GCC is concerned. (It's also worth noting that "4 bytes" isn't necessarily 32 bits, since GCC supports 16-bit and 32-bit bytes.) All a biit pedantic, sorry. > +/* { dg-final { scan-assembler-not {\tand\t%} } } */ > + Stray newline at end of file. 
> diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi index > 4ace224a8ff5ed4fafed10a69ef00ffb2d7d8c39..009b8f8db74c7a3bef996ceaba58 > 123f6558221c 100644 > --- a/gcc/doc/sourcebuild.texi > +++ b/gcc/doc/sourcebuild.texi > @@ -1446,6 +1446,10 @@ of bytes. > Target supports both signed and unsigned > multiply-high-with-round-and-scale > operations on vectors of half-words. > > +@item vect_sdiv_pow2_si > +Target supports signed division by constant power-of-2 operations on > +vectors of words. "4 bytes" is more accurate than "words". The problem with "word" is that it depends on context; e.g. although the AArch64 ISA uses "word" to mean 32 bits, its words are really 64 bits as far as GCC is concerned. (It's also worth noting that "4 bytes" isn't necessarily 32 bits, since GCC supports 16-bit and 32-bit bytes.) All a bit pedantic, sorry. > +/* { dg-final { scan-assembler-not {\tand\t%} } } */ > + Stray newline at end of file. 
*/ > + internal_fn ifn = IFN_DIV_POW2; > + if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED)) > + { > + tree shift = build_int_cst (itype, tree_log2 (oprnd1)); > + > + tree var_div = vect_recog_temp_ssa_var (itype, NULL); > + gimple *div_stmt = gimple_build_call_internal (ifn, 2, oprnd0, shift); > + gimple_call_set_lhs (div_stmt, var_div); > + > + if (rhs_code == TRUNC_MOD_EXPR) > + { > + append_pattern_def_seq (stmt_vinfo, div_stmt); > + def_stmt > + = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL), > +LSHIFT_EXPR, var_div, shift); > + append_pattern_def_seq (stmt_vinfo, def_stmt); > + pattern_stmt > + = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL), > +MINUS_EXPR, oprnd0, > +gimple_assign_lhs (def_stmt)); > + } > + else > + { > + pattern_stmt = div_stmt; > +
RE: [AArch64][SVE] Utilize ASRD instruction for division and remainder
Apologies for the accidental change, and added the underscore. Regards Yuliang gcc/ChangeLog: 2019-09-27 Yuliang Wang * config/aarch64/aarch64-sve.md (sdiv_pow23): New pattern for ASRD. * config/aarch64/iterators.md (UNSPEC_ASRD): New unspec. * internal-fn.def (IFN_DIV_POW2): New internal function. * optabs.def (sdiv_pow2_optab): New optab. * tree-vect-patterns.c (vect_recog_divmod_pattern): Modify pattern to support new operation. * doc/md.texi (sdiv_pow2$var{m3}): Documentation for the above. * doc/sourcebuild.texi (vect_sdiv_pow2_si): Document new target selector. gcc/testsuite/ChangeLog: 2019-09-27 Yuliang Wang * gcc.dg/vect/vect-sdiv-pow2-1.c: New test. * gcc.target/aarch64/sve/asrdiv_1.c: As above. * lib/target-support.exp (check_effective_target_vect_sdiv_pow2_si): Return true for AArch64 with SVE. -Original Message- From: Yuliang Wang Sent: 27 September 2019 10:37 To: Richard Sandiford Cc: nd ; gcc-patches@gcc.gnu.org Subject: RE: [AArch64][SVE] Utilize ASRD instruction for division and remainder Hi Richard, I have renamed the optabs and associated identifiers as per your suggestion. Thanks. Regards Yuliang gcc/ChangeLog: 2019-09-27 Yuliang Wang * config/aarch64/aarch64-sve.md (sdiv_pow23): New pattern for ASRD. * config/aarch64/iterators.md (UNSPEC_ASRD): New unspec. * internal-fn.def (IFN_DIV_POW2): New internal function. * optabs.def (sdiv_pow2_optab): New optab. * tree-vect-patterns.c (vect_recog_divmod_pattern): Modify pattern to support new operation. * doc/md.texi (sdiv_pow2$var{m3}): Documentation for the above. * doc/sourcebuild.texi (vect_sdivpow2_si): Document new target selector. gcc/testsuite/ChangeLog: 2019-09-27 Yuliang Wang * gcc.dg/vect/vect-sdivpow2-1.c: New test. * gcc.target/aarch64/sve/asrdiv_1.c: As above. * lib/target-support.exp (check_effective_target_vect_sdivpow2_si): Return true for AArch64 with SVE. 
-Original Message- From: Richard Sandiford Sent: 24 September 2019 17:12 To: Yuliang Wang Cc: gcc-patches@gcc.gnu.org; nd Subject: Re: [AArch64][SVE] Utilize ASRD instruction for division and remainder Yuliang Wang writes: > Hi, > > The C snippets below (signed division/modulo by a power-of-2 immediate > value): > > #define P ... > > void foo_div (int *a, int *b, int N) > { > for (int i = 0; i < N; i++) > a[i] = b[i] / (1 << P); > } > void foo_mod (int *a, int *b, int N) > { > for (int i = 0; i < N; i++) > a[i] = b[i] % (1 << P); > } > > Vectorize to the following on AArch64 + SVE: > > foo_div: > movx0, 0 > movw2, N > ptruep1.b, all > whilelop0.s, wzr, w2 > .p2align3,,7 > .L2: > ld1wz1.s, p0/z, [x3, x0, lsl 2] > cmpltp2.s, p1/z, z1.s, #0// > movz0.s, p2/z, #7// > addz0.s, z0.s, z1.s// > asrz0.s, z0.s, #3// > st1wz0.s, p0, [x1, x0, lsl 2] > incwx0 > whilelop0.s, w0, w2 > b.any.L2 > ret > > foo_mod: > ... > .L2: > ld1wz0.s, p0/z, [x3, x0, lsl 2] > cmpltp2.s, p1/z, z0.s, #0// > movz1.s, p2/z, #-1// > lsrz1.s, z1.s, #29// > addz0.s, z0.s, z1.s// > andz0.s, z0.s, #{2^P-1}// > subz0.s, z0.s, z1.s// > st1wz0.s, p0, [x1, x0, lsl 2] > incwx0 > whilelop0.s, w0, w2 > b.any.L2 > ret > > This patch utilizes the special-purpose ASRD (arithmetic shift-right for > divide by immediate) instruction: > > foo_div: > ... > .L2: > ld1wz0.s, p0/z, [x3, x0, lsl 2] > asrdz0.s, p1/m, z0.s, #{P}// > st1wz0.s, p0, [x1, x0, lsl 2] > incwx0 > whilelop0.s, w0, w2 > b.any.L2 > ret > > foo_mod: > ... > .L2: > ld1wz0.s, p0/z, [x3, x0, lsl 2] > movprfxz1, z0// > asrdz1.s, p1/m, z1.s, #{P}// > lslz1.s, z1.s, #{P}// > subz0.s, z0.s, z1.s// > st1wz0.s, p0, [x1, x0, lsl 2] > incwx0 > whilelop0.s, w0, w2 > b.any.L2 > ret > > Added new tests. Built and regression tested on aarch64-none-elf. > > Best Regards, > Yuliang Wang > > > gcc/ChangeLog: > > 2019-09-23 Yuliang Wang > > * config/aarch64/aarch64-sve.md (asrd3): New pattern for ASRD. > * config/aarch64/iterators.md (UNSPEC_ASRD): New unspec. 
> (ASRDIV): New int iterator. > * internal-fn.def (IFN_ASHR_DIV): New internal function. > * optabs.def (ashr_div_optab): New optab. > * tree-vect-patterns.c (vect_recog_divmod_pattern): > Modify pattern to support new operation. > * doc/md.texi (asrd$var{m3}): Documentation for the above. > * doc/sourcebuild.texi (v
RE: [AArch64][SVE] Utilize ASRD instruction for division and remainder
Hi Richard, I have renamed the optabs and associated identifiers as per your suggestion. Thanks. Regards Yuliang gcc/ChangeLog: 2019-09-27 Yuliang Wang * config/aarch64/aarch64-sve.md (sdiv_pow23): New pattern for ASRD. * config/aarch64/iterators.md (UNSPEC_ASRD): New unspec. * internal-fn.def (IFN_DIV_POW2): New internal function. * optabs.def (sdiv_pow2_optab): New optab. * tree-vect-patterns.c (vect_recog_divmod_pattern): Modify pattern to support new operation. * doc/md.texi (sdiv_pow2$var{m3}): Documentation for the above. * doc/sourcebuild.texi (vect_sdivpow2_si): Document new target selector. gcc/testsuite/ChangeLog: 2019-09-27 Yuliang Wang * gcc.dg/vect/vect-sdivpow2-1.c: New test. * gcc.target/aarch64/sve/asrdiv_1.c: As above. * lib/target-support.exp (check_effective_target_vect_sdivpow2_si): Return true for AArch64 with SVE. -Original Message- From: Richard Sandiford Sent: 24 September 2019 17:12 To: Yuliang Wang Cc: gcc-patches@gcc.gnu.org; nd Subject: Re: [AArch64][SVE] Utilize ASRD instruction for division and remainder Yuliang Wang writes: > Hi, > > The C snippets below (signed division/modulo by a power-of-2 immediate > value): > > #define P ... > > void foo_div (int *a, int *b, int N) > { > for (int i = 0; i < N; i++) > a[i] = b[i] / (1 << P); > } > void foo_mod (int *a, int *b, int N) > { > for (int i = 0; i < N; i++) > a[i] = b[i] % (1 << P); > } > > Vectorize to the following on AArch64 + SVE: > > foo_div: > movx0, 0 > movw2, N > ptruep1.b, all > whilelop0.s, wzr, w2 > .p2align3,,7 > .L2: > ld1wz1.s, p0/z, [x3, x0, lsl 2] > cmpltp2.s, p1/z, z1.s, #0// > movz0.s, p2/z, #7// > addz0.s, z0.s, z1.s// > asrz0.s, z0.s, #3// > st1wz0.s, p0, [x1, x0, lsl 2] > incwx0 > whilelop0.s, w0, w2 > b.any.L2 > ret > > foo_mod: > ... 
> .L2: > ld1wz0.s, p0/z, [x3, x0, lsl 2] > cmpltp2.s, p1/z, z0.s, #0// > movz1.s, p2/z, #-1// > lsrz1.s, z1.s, #29// > addz0.s, z0.s, z1.s// > andz0.s, z0.s, #{2^P-1}// > subz0.s, z0.s, z1.s// > st1wz0.s, p0, [x1, x0, lsl 2] > incwx0 > whilelop0.s, w0, w2 > b.any.L2 > ret > > This patch utilizes the special-purpose ASRD (arithmetic shift-right for > divide by immediate) instruction: > > foo_div: > ... > .L2: > ld1wz0.s, p0/z, [x3, x0, lsl 2] > asrdz0.s, p1/m, z0.s, #{P}// > st1wz0.s, p0, [x1, x0, lsl 2] > incwx0 > whilelop0.s, w0, w2 > b.any.L2 > ret > > foo_mod: > ... > .L2: > ld1wz0.s, p0/z, [x3, x0, lsl 2] > movprfxz1, z0// > asrdz1.s, p1/m, z1.s, #{P}// > lslz1.s, z1.s, #{P}// > subz0.s, z0.s, z1.s// > st1wz0.s, p0, [x1, x0, lsl 2] > incwx0 > whilelop0.s, w0, w2 > b.any.L2 > ret > > Added new tests. Built and regression tested on aarch64-none-elf. > > Best Regards, > Yuliang Wang > > > gcc/ChangeLog: > > 2019-09-23 Yuliang Wang > > * config/aarch64/aarch64-sve.md (asrd3): New pattern for ASRD. > * config/aarch64/iterators.md (UNSPEC_ASRD): New unspec. > (ASRDIV): New int iterator. > * internal-fn.def (IFN_ASHR_DIV): New internal function. > * optabs.def (ashr_div_optab): New optab. > * tree-vect-patterns.c (vect_recog_divmod_pattern): > Modify pattern to support new operation. > * doc/md.texi (asrd$var{m3}): Documentation for the above. > * doc/sourcebuild.texi (vect_asrdiv_si): Document new target selector. This looks good to me. My only real question is about naming: maybe IFN_DIV_POW2 would be a better name for the internal function and sdiv_pow2_optab/"div_pow2$a3" for the optab? But I'm useless at naming things, so maybe others would prefer your names. Thanks, Richard rb11863.patch Description: rb11863.patch
[AArch64][SVE2] Shift-Right Accumulate combine patterns
Hi, This patch adds combining support for SVE2's shift-right accumulate instructions. Example snippet: #define IMM ... void foo (TYPE *a, TYPE *b, int n) { for (int i = 0; i < n; i++) a[i] += b[i] >> IMM; } Signed: before asr z0.s, z0.s, #{IMM} add z0.s, z0.s, z1.s ... after ssra z0.s, z1.s, #{IMM} Unsigned: before lsr z0.s, z0.s, #{IMM} add z0.s, z0.s, z1.s ... after usra z0.s, z1.s, #{IMM} Built and regression tested on aarch64-none-elf. Best Regards, Yuliang Wang gcc/ChangeLog: 2019-09-26 Yuliang Wang * config/aarch64/aarch64-sve2.md (aarch64_sve2_sra): New combine pattern. gcc/testsuite/ChangeLog: 2019-09-26 Yuliang Wang * gcc.target/aarch64/sve2/shracc_1.c: New test. rb11872.patch Description: rb11872.patch
[AArch64][SVE] Utilize ASRD instruction for division and remainder
Hi, The C snippets below (signed division/modulo by a power-of-2 immediate value): #define P ... void foo_div (int *a, int *b, int N) { for (int i = 0; i < N; i++) a[i] = b[i] / (1 << P); } void foo_mod (int *a, int *b, int N) { for (int i = 0; i < N; i++) a[i] = b[i] % (1 << P); } Vectorize to the following on AArch64 + SVE: foo_div: mov x0, 0 mov w2, N ptrue p1.b, all whilelo p0.s, wzr, w2 .p2align3,,7 .L2: ld1wz1.s, p0/z, [x3, x0, lsl 2] cmplt p2.s, p1/z, z1.s, #0// mov z0.s, p2/z, #7 // add z0.s, z0.s, z1.s// asr z0.s, z0.s, #3 // st1wz0.s, p0, [x1, x0, lsl 2] incwx0 whilelo p0.s, w0, w2 b.any .L2 ret foo_mod: ... .L2: ld1wz0.s, p0/z, [x3, x0, lsl 2] cmplt p2.s, p1/z, z0.s, #0// mov z1.s, p2/z, #-1 // lsr z1.s, z1.s, #29 // add z0.s, z0.s, z1.s// and z0.s, z0.s, #{2^P-1}// sub z0.s, z0.s, z1.s// st1wz0.s, p0, [x1, x0, lsl 2] incwx0 whilelo p0.s, w0, w2 b.any .L2 ret This patch utilizes the special-purpose ASRD (arithmetic shift-right for divide by immediate) instruction: foo_div: ... .L2: ld1wz0.s, p0/z, [x3, x0, lsl 2] asrdz0.s, p1/m, z0.s, #{P} // st1wz0.s, p0, [x1, x0, lsl 2] incwx0 whilelo p0.s, w0, w2 b.any .L2 ret foo_mod: ... .L2: ld1wz0.s, p0/z, [x3, x0, lsl 2] movprfx z1, z0 // asrdz1.s, p1/m, z1.s, #{P} // lsl z1.s, z1.s, #{P}// sub z0.s, z0.s, z1.s// st1wz0.s, p0, [x1, x0, lsl 2] incwx0 whilelo p0.s, w0, w2 b.any .L2 ret Added new tests. Built and regression tested on aarch64-none-elf. Best Regards, Yuliang Wang gcc/ChangeLog: 2019-09-23 Yuliang Wang * config/aarch64/aarch64-sve.md (asrd3): New pattern for ASRD. * config/aarch64/iterators.md (UNSPEC_ASRD): New unspec. (ASRDIV): New int iterator. * internal-fn.def (IFN_ASHR_DIV): New internal function. * optabs.def (ashr_div_optab): New optab. * tree-vect-patterns.c (vect_recog_divmod_pattern): Modify pattern to support new operation. * doc/md.texi (asrd$var{m3}): Documentation for the above. * doc/sourcebuild.texi (vect_asrdiv_si): Document new target selector. 
gcc/testsuite/ChangeLog: 2019-09-23 Yuliang Wang * gcc.dg/vect/vect-asrdiv-1.c: New test. * gcc.target/aarch64/sve/asrdiv_1.c: As above. * lib/target-support.exp (check_effective_target_vect_asrdiv_si): Return true for AArch64 with SVE. rb11863.patch Description: rb11863.patch
RE: [PATCH] Reduction of conditional operations for vectorization
Hi Richard, Thanks for your comments and tips. fold_binary_op_with_conditional_arg performs the reverse transformation to this patch in certain situations: /* Transform `a + (b ? x : y)' into `b ? (a + x) : (a + y)'. ... */ static tree fold_binary_op_with_conditional_arg (location_t loc, ... /* This transformation is only worthwhile if we don't have to wrap ARG in a SAVE_EXPR and the operation can be simplified without recursing on at least one of the branches once its pushed inside the COND_EXPR. */ if (!TREE_CONSTANT (arg) && (TREE_SIDE_EFFECTS (arg) ...) return NULL_TREE; ... For instance, this causes infinite recursion in gcc.dg/vect/fast-math-vect-call-2 because ARG is a float literal. Regards, Yuliang -Original Message- From: Richard Biener Sent: 20 September 2019 13:02 To: Yuliang Wang Cc: gcc-patches@gcc.gnu.org; nd ; Richard Sandiford Subject: Re: [PATCH] Reduction of conditional operations for vectorization On Fri, Sep 20, 2019 at 10:09 AM Yuliang Wang wrote: > > Hi, > > ifcvt transforms the following conditional operation reduction pattern: > > if ( condition ) > a = a OP b; > else > a = a OP c; > > Into: > > a_1 = a OP b; > a_2 = a OP c; > a = condition ? a_1 : a_2; > > Where OP is one of { plus minus mult min max and ior eor }. > > This patch further optimizes the above to: > > a_0 = condition ? b : c; > a = a OP a_0; > > Which enables vectorization on AArch64. > Also supported are permutations of the above operand ordering subject to > commutativity of OP. > > Added new tests. Built and regression tested on aarch64-none-elf and > aarch64-linux-gnu. @@ -3206,7 +3206,41 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) /* !A ? B : C -> A ? C : B. */ (simplify (cnd (logical_inverted_value truth_valued_p@0) @1 @2) - (cnd @0 @2 @1))) + (cnd @0 @2 @1)) + + /* !A ? B : C -> A ? C : B. */ + (simplify + (cnd (logical_inverted_value truth_valued_p@0) @1 @2) (cnd @0 @2 + @1)) + looks like you duplicate the above pattern. Should have raised a warning in the genmatch run. 
The patch header shows you are not working against trunk? + (for op (plus minus mult + min max + bit_and bit_ior bit_xor) + (simplify + (cnd @0 (op @1 @2) (op @1 @3)) + (op @1 (cnd @0 @2 @3))) + (simplify + (cnd @0 (op @1 @2) (op @3 @2)) + (op (cnd @0 @1 @3) @2)) + (if (op != MINUS_EXPR) + (simplify +(cnd @0 (op @1 @2) (op @3 @1)) +(op @1 (cnd @0 @2 @3))) + (simplify +(cnd @0 (op @2 @1) (op @1 @3)) +(op @1 (cnd @0 @2 @3) if you would have dropped minus handling this simplifies to (for op (...) (simpify (cnd @0 (op:c @1 @2) (op:c @1 @3)) (op @1 (cnd @0 @2 @3))) you can then add minus special-cases if they are important (simplify (cnd @0 (minus @1 @2) (minus @1 @3)) ... (simplify (cnd @0 (minus @2 @1) (minus @3 @1)) I think that's clearer. + /* Hack: generic-match causes infinite recursion +by reverting this transformation when +i) -fno-trapping-math is enabled, and +ii) the common operand does not need to be wrapped in a SAVE_EXPR. + */ What's the specific transform that causes this? Yes, there are some left in fold-const.c. Thanks, Richard. > Best Regards, > Yuliang Wang > > > gcc/ChangeLog: > > 2019-09-19 Yuliang Wang > > * match.pd (for cnd (cond vec_cond)): New match statements for the > above patterns. > * doc/sourcebuild.texi (vect_condred_si): Document new target > selector. > > gcc/testsuite/ChangeLog: > > 2019-09-19 Yuliang Wang > > * gcc.target/aarch64/sve2/condred_1.c: New test. > * gcc.dg/vect/vect-condred-1.c: As above. > * gcc.dg/vect/vect-condred-2.c: As above. > * gcc.dg/vect/vect-condred-3.c: As above. > * gcc.dg/vect/vect-condred-4.c: As above. > * gcc.dg/vect/vect-condred-5.c: As above. > * gcc.dg/vect/vect-condred-6.c: As above. > * gcc.dg/vect/vect-condred-7.c: As above. > * gcc.dg/vect/vect-condred-8.c: As above. > * lib/target-supports.exp (check_effective_target_vect_condred_si): > Return true for AArch64 without SVE.
[PATCH] Reduction of conditional operations for vectorization
Hi, ifcvt transforms the following conditional operation reduction pattern: if ( condition ) a = a OP b; else a = a OP c; Into: a_1 = a OP b; a_2 = a OP c; a = condition ? a_1 : a_2; Where OP is one of { plus minus mult min max and ior eor }. This patch further optimizes the above to: a_0 = condition ? b : c; a = a OP a_0; Which enables vectorization on AArch64. Also supported are permutations of the above operand ordering subject to commutativity of OP. Added new tests. Built and regression tested on aarch64-none-elf and aarch64-linux-gnu. Best Regards, Yuliang Wang gcc/ChangeLog: 2019-09-19 Yuliang Wang * match.pd (for cnd (cond vec_cond)): New match statements for the above patterns. * doc/sourcebuild.texi (vect_condred_si): Document new target selector. gcc/testsuite/ChangeLog: 2019-09-19 Yuliang Wang * gcc.target/aarch64/sve2/condred_1.c: New test. * gcc.dg/vect/vect-condred-1.c: As above. * gcc.dg/vect/vect-condred-2.c: As above. * gcc.dg/vect/vect-condred-3.c: As above. * gcc.dg/vect/vect-condred-4.c: As above. * gcc.dg/vect/vect-condred-5.c: As above. * gcc.dg/vect/vect-condred-6.c: As above. * gcc.dg/vect/vect-condred-7.c: As above. * gcc.dg/vect/vect-condred-8.c: As above. * lib/target-supports.exp (check_effective_target_vect_condred_si): Return true for AArch64 without SVE. rb11852.patch Description: rb11852.patch
RE: [PATCH][AArch64] Vectorize MULH(R)S patterns with SVE2 instructions
Hi Richard, Thanks for your comments and advice; I have applied the relevant changes. Regards, Yuliang UPDATE: Added new tests. Built and regression tested on aarch64-none-elf and aarch64-linux-gnu. gcc/ChangeLog: 2019-09-1 Yuliang Wang PR tree-optimization/89386 * config/aarch64/aarch64-sve2.md (mull) (shrnb, shrnt): New SVE2 patterns. (mulhs3): New pattern for MULHRS. * config/aarch64/iterators.md (UNSPEC_SMULLB, UNSPEC_SMULLT) (UNSPEC_UMULLB, UNSPEC_UMULLT, UNSPEC_SHRNB, UNSPEC_SHRNT) (UNSPEC_RSHRNB, UNSPEC_RSHRNT, UNSPEC_SMULHS, UNSPEC_SMULHRS) UNSPEC_UMULHS, UNSPEC_UMULHRS): New unspecs. (MULLBT, SHRNB, SHRNT, MULHRS): New int iterators. (su, r): Handle the unspecs above. (bt): New int attribute. * internal-fn.def (IFN_MULHS, IFN_MULHRS): New internal functions. * internal-fn.c (first_commutative_argument): Commutativity info for above. * optabs.def (smulhs_optab, smulhrs_optab, umulhs_optab, umulhrs_optab): New optabs. * doc/md.texi (smulhs$var{m3}, umulhs$var{m3}) (smulhrs$var{m3}, umulhrs$var{m3}): Documentation for the above. * tree-vect-patterns.c (vect_recog_mulhs_pattern): New pattern function. (vect_vect_recog_func_ptrs): Add it. * testsuite/gcc.target/aarch64/sve2/mulhrs_1.c: New test. * testsuite/gcc.dg/vect/vect-mulhrs-1.c: As above. * testsuite/gcc.dg/vect/vect-mulhrs-2.c: As above. * testsuite/gcc.dg/vect/vect-mulhrs-3.c: As above. * testsuite/gcc.dg/vect/vect-mulhrs-4.c: As above. * doc/sourcebuild.texi (vect_mulhrs_hi): Document new target selector. * testsuite/lib/target-supports.exp (check_effective_target_vect_mulhrs_hi): Return true for AArch64 without SVE2. -Original Message- From: Richard Sandiford Sent: 30 August 2019 12:49 To: Yuliang Wang Cc: gcc-patches@gcc.gnu.org; nd Subject: Re: [PATCH][AArch64] Vectorize MULH(R)S patterns with SVE2 instructions Thanks for doing this. The patch looks good, so this review is mostly a list of very minor formatting comments, sorry. 
Yuliang Wang writes: > 2019-08-22 Yuliang Wang > Please add a line here pointing at the PR: PR tree-optimization/89386 The commit hooks pick this up automatically and link the commit to the bugzilla ticket. (The PR was filed for SSSE3, but the target-independent bits are still needed there.) > * config/aarch64/aarch64-sve2.md: support for SVE2 > instructions [S/U]MULL[T/B] + [R]SHRN[T/B] and MULHRS pattern variants Unfortunately the changelog format is pretty strict here. Lines have to be 80 chars or shorter, indented by tabs, and each pattern, function, variable or type needs to be listed individually regardless of how useful that seems. So I think this should be something like: * config/aarch64/aarch64-sve2.md (mull) (shrnb, shrnt, mulhs3): New patterns. (See below for why the "*" patterns aren't listed.) > * config/aarch64/iterators.md: iterators and attributes for > above Here too the iterators need to be listed: * config/aarch64/iterators.md (UNSPEC_SMULLB, UNSPEC_SMULLT) (UNSPEC_UMULLB, UNSPEC_UMULLT, UNSPEC_SHRNB, UNSPEC_SHRNT) (UNSPEC_RSHRNB, UNSPEC_RSHRNT, UNSPEC_SMULHS, UNSPEC_SMULHRS) UNSPEC_UMULHS, UNSPEC_UMULHRS): New unspecs. (MULLBT, SHRNB, SHRNT, MULHRS): New int iterators. (su, r): Handle the unspecs above. (bt): New int attribute. > * internal-fn.def: internal functions for MULH[R]S patterns * internal-fn.def (IFN_MULHS, IFN_MULHRS): New internal functions. > * optabs.def: optabs definitions for above and sign variants * optabs.def (smulhs_optab, smulhrs_optab, umulhs_optab) (umulhrs_optab): New optabs. > * tree-vect-patterns.c (vect_recog_multhi_pattern): pattern > recognition function for MULHRS * tree-vect-patterns.c (vect_recog_multhi_pattern): New function. (vect_vect_recog_func_ptrs): Add it. > * gcc.target/aarch64/sve2/mulhrs_1.c: new test for all > variants Just: * gcc.target/aarch64/sve2/mulhrs_1.c: New test. (Sorry that this is so finicky. I'm just the messenger. 
:-)) > diff --git a/gcc/config/aarch64/aarch64-sve2.md > b/gcc/config/aarch64/aarch64-sve2.md > index > 2334e5a7b7dc524bbd1f4d0a48ba5cd991970118..51783604ad8f83eb1d070c133009 > ed41a2a0252d 100644 > --- a/gcc/config/aarch64/aarch64-sve2.md > +++ b/gcc/config/aarch64/aarch64-sve2.md > @@ -63,3 +63,89 @@ > movprfx\t%0, %2\;h\t%0., %1/m, %0., > %3." >[(set_attr "movprfx" "*,yes")] > ) > + > +;; Multiply long top / bottom Very minor, but: GCC comments traditionally end with "." even if they're not full sentences. >
[PATCH][AArch64] Vectorize MULH(R)S patterns with SVE2 instructions
This patch allows for more efficient SVE2 vectorization of Multiply High with Round and Scale (MULHRS) patterns. The example snippet: uint16_t a[N], b[N], c[N]; void foo_round (void) { for (int i = 0; i < N; i++) a[i] = ((((int32_t)b[i] * (int32_t)c[i]) >> 14) + 1) >> 1; } ... previously vectorized to: foo_round: ... ptrue p0.s whilelo p1.h, wzr, w2 ld1h {z2.h}, p1/z, [x4, x0, lsl #1] ld1h {z0.h}, p1/z, [x3, x0, lsl #1] uunpklo z3.s, z2.h // uunpklo z1.s, z0.h // uunpkhi z2.s, z2.h // uunpkhi z0.s, z0.h // mul z1.s, p0/m, z1.s, z3.s // mul z0.s, p0/m, z0.s, z2.s // asr z1.s, z1.s, #14 // asr z0.s, z0.s, #14 // add z1.s, z1.s, #1 // add z0.s, z0.s, #1 // asr z1.s, z1.s, #1 // asr z0.s, z0.s, #1 // uzp1 z0.h, z1.h, z0.h // st1h {z0.h}, p1, [x1, x0, lsl #1] inch x0 whilelo p1.h, w0, w2 b.ne 28 ret ... and now vectorizes to: foo_round: ... whilelo p0.h, wzr, w2 nop ld1h {z1.h}, p0/z, [x4, x0, lsl #1] ld1h {z2.h}, p0/z, [x3, x0, lsl #1] umullb z0.s, z1.h, z2.h // umullt z1.s, z1.h, z2.h // rshrnb z0.h, z0.s, #15 // rshrnt z0.h, z1.s, #15 // st1h {z0.h}, p0, [x1, x0, lsl #1] inch x0 whilelo p0.h, w0, w2 b.ne 28 ret nop Also supported are: * Non-rounding cases The equivalent example snippet: void foo_trunc (void) { for (int i = 0; i < N; i++) a[i] = ((int32_t)b[i] * (int32_t)c[i]) >> 15; } ... vectorizes with SHRNT/SHRNB * 32-bit and 8-bit input/output types * Signed output types SMULLT/SMULLB are generated instead SQRDMULH was considered as a potential single-instruction optimization but saturates the intermediate value instead of truncating. 
Best Regards, Yuliang Wang ChangeLog: 2019-08-22 Yuliang Wang * config/aarch64/aarch64-sve2.md: support for SVE2 instructions [S/U]MULL[T/B] + [R]SHRN[T/B] and MULHRS pattern variants * config/aarch64/iterators.md: iterators and attributes for above * internal-fn.def: internal functions for MULH[R]S patterns * optabs.def: optabs definitions for above and sign variants * tree-vect-patterns.c (vect_recog_multhi_pattern): pattern recognition function for MULHRS * gcc.target/aarch64/sve2/mulhrs_1.c: new test for all variants rb11655.patch Description: rb11655.patch