Hi, SVE2 vectorization for BSL and NBSL fails when the element type is unsigned 8/16-bit.
The operands are implicitly converted to the corresponding signed types, which the mid-end fold pattern does not take into account; this patch augments the pattern with type conversion checks to rectify the problem.

#define TYPE uint{8,16}_t

void
foo (TYPE *a, TYPE *b, TYPE *c, TYPE *d, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = OP (b[i], c[i], d[i]);
}

BSL:

  // #define OP(x,y,z) (((x) & (z)) | ((y) & ~(z)))

  before

    and z1.d, z2.d, z1.d
    bic z0.d, z0.d, z2.d
    orr z0.d, z0.d, z1.d

  ...

  after

    bsl z0.d, z0.d, z1.d, z2.d

NBSL:

  // #define OP(x,y,z) ~(((x) & (z)) | ((y) & ~(z)))

  before

    and z1.d, z2.d, z1.d
    bic z0.d, z0.d, z2.d
    orr z0.d, z0.d, z1.d
    not z0.{b,h}, p1/m, z0.{b,h}

  ...

  after

    nbsl z0.d, z0.d, z1.d, z2.d

The GIMPLE output for BSL shows where conversions could be inserted:

  _1 = b[i];
  _2 = d[i];
  _3 = _1 & _2;
  _4 = (signed short) _3;
  _5 = c[i];
  _6 = (signed short) _5;
  _7 = d[i];
  _8 = (signed short) _7;
  _9 = ~_8;
  _10 = _6 & _9;
  _11 = _4 | _10;
  _12 = (short unsigned int) _11;
  a[i] = _12;

In contrast, for 32/64-bit types (regardless of signedness):

  _1 = b[i];
  _2 = d[i];
  _3 = _1 & _2;
  _4 = c[i];
  _5 = d[i];
  _6 = ~_5;
  _7 = _4 & _6;
  _8 = _3 | _7;
  _9 = ~_8;
  a[i] = _9;

Built and tested on aarch64-none-elf.

Regards,
Yuliang Wang

gcc/ChangeLog:

2019-10-17  Yuliang Wang  <yuliang.w...@arm.com>

	* match.pd (/* (x & ~m) | (y & m) -> ... */): Modified fold pattern.
	* genmatch.c (convert3): New convert operation to support the above.

gcc/testsuite/ChangeLog:

2019-10-17  Yuliang Wang  <yuliang.w...@arm.com>

	* gcc.target/aarch64/sve2/bitsel_1.c: Add testing for unsigned types.
	* gcc.target/aarch64/sve2/bitsel_2.c: As above.
	* gcc.target/aarch64/sve2/bitsel_3.c: As above.
	* gcc.target/aarch64/sve2/bitsel_4.c: As above.
	* gcc.target/aarch64/sve2/eor3_1.c: As above.

diff --git a/gcc/genmatch.c b/gcc/genmatch.c index 7db1f135840e09e794e2921859fa8e9b76666fa8..ce87ae33e0b3c06f4d1fde8d8e74bf2210ee7a5a 100644 --- a/gcc/genmatch.c +++ b/gcc/genmatch.c @@ -227,6 +227,7 @@ enum tree_code { CONVERT0, CONVERT1, CONVERT2, +CONVERT3, VIEW_CONVERT0, VIEW_CONVERT1, VIEW_CONVERT2, @@ -1176,6 +1177,7 @@ lower_opt_convert (operand *o) = { CONVERT0, CONVERT_EXPR, CONVERT1, CONVERT_EXPR, CONVERT2, CONVERT_EXPR, + CONVERT3, CONVERT_EXPR, VIEW_CONVERT0, VIEW_CONVERT_EXPR, VIEW_CONVERT1, VIEW_CONVERT_EXPR, VIEW_CONVERT2, VIEW_CONVERT_EXPR }; @@ -4145,8 +4147,8 @@ parser::record_operlist (location_t loc, user_id *p) } } -/* Parse the operator ID, special-casing convert?, convert1? and - convert2? */ +/* Parse the operator ID, special-casing convert?, convert1?, convert2? and + convert3? */ id_base * parser::parse_operation () @@ -4167,6 +4169,8 @@ parser::parse_operation () ; else if (strcmp (id, "convert2") == 0) ; + else if (strcmp (id, "convert3") == 0) + ; else if (strcmp (id, "view_convert") == 0) id = "view_convert0"; else if (strcmp (id, "view_convert1") == 0) @@ -4183,6 +4187,7 @@ parser::parse_operation () } else if (strcmp (id, "convert1") == 0 || strcmp (id, "convert2") == 0 + || strcmp (id, "convert3") == 0 || strcmp (id, "view_convert1") == 0 || strcmp (id, "view_convert2") == 0) fatal_at (id_tok, "expected '?' 
after conditional operator"); @@ -4723,9 +4728,9 @@ parser::parse_for (location_t) id_base *idb = get_operator (oper, true); if (idb == NULL) fatal_at (token, "no such operator '%s'", oper); - if (*idb == CONVERT0 || *idb == CONVERT1 || *idb == CONVERT2 - || *idb == VIEW_CONVERT0 || *idb == VIEW_CONVERT1 - || *idb == VIEW_CONVERT2) + if (*idb == CONVERT0 || *idb == VIEW_CONVERT0 + || *idb == CONVERT1 || *idb == CONVERT2|| *idb == CONVERT3 + || *idb == VIEW_CONVERT1 || *idb == VIEW_CONVERT2) fatal_at (token, "conditional operators cannot be used inside for"); if (arity == -1) @@ -5136,6 +5141,7 @@ main (int argc, char **argv) add_operator (CONVERT0, "convert0", "tcc_unary", 1); add_operator (CONVERT1, "convert1", "tcc_unary", 1); add_operator (CONVERT2, "convert2", "tcc_unary", 1); +add_operator (CONVERT3, "convert3", "tcc_unary", 1); add_operator (VIEW_CONVERT0, "view_convert0", "tcc_unary", 1); add_operator (VIEW_CONVERT1, "view_convert1", "tcc_unary", 1); add_operator (VIEW_CONVERT2, "view_convert2", "tcc_unary", 1); diff --git a/gcc/match.pd b/gcc/match.pd index e3ac06c8ef5b893bd344734095b11047a43f98b8..0aa065c2941dd79477434fd3b6691c9a9b68d20c 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -1461,8 +1461,13 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) /* (x & ~m) | (y & m) -> ((x ^ y) & m) ^ x */ (simplify - (bit_ior:c (bit_and:cs @0 (bit_not @2)) (bit_and:cs @1 @2)) - (bit_xor (bit_and (bit_xor @0 @1) @2) @0)) + (bit_ior:c + (convert? (bit_and:cs @0 (bit_not (convert2? @2)))) + (convert1? (bit_and:cs @1 (convert3? @2)))) + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)) + && tree_nop_conversion_p (type, TREE_TYPE (@1))) + (bit_xor (bit_and + (bit_xor (convert @0) (convert @1)) (convert @2)) (convert @0)))) /* Fold A - (A & B) into ~B & A. 
*/ (simplify diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c index 5c58ff54231d88a4ebf0a91fe4fac97079c8d992..05431e591887c589a1bc1516f99db39c66c353c4 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c @@ -7,27 +7,31 @@ #define OP(x,y,z) (((x) & (z)) | ((y) & ~(z))) #endif -#define TYPE(N) int##N##_t - -#define TEMPLATE(SIZE) \ -void __attribute__ ((noinline, noclone)) \ -f_##SIZE##_##OP \ - (TYPE(SIZE) *restrict a, TYPE(SIZE) *restrict b, \ - TYPE(SIZE) *restrict c, TYPE(SIZE) *restrict d, int n) \ -{ \ - for (int i = 0; i < n; i++) \ - a[i] = OP (b[i], c[i], d[i]); \ +#define TYPE(S,N) S##int##N##_t + +#define TEMPLATE(SIGN,SIZE) \ +void __attribute__ ((noinline, noclone)) \ +f_##SIGN##_##SIZE##_##OP \ + (TYPE(SIGN,SIZE) *restrict a, TYPE(SIGN,SIZE) *restrict b, \ + TYPE(SIGN,SIZE) *restrict c, TYPE(SIGN,SIZE) *restrict d, int n) \ +{ \ + for (int i = 0; i < n; i++) \ + a[i] = OP (b[i], c[i], d[i]); \ } -TEMPLATE (8); -TEMPLATE (16); -TEMPLATE (32); -TEMPLATE (64); +TEMPLATE (,8); +TEMPLATE (,16); +TEMPLATE (,32); +TEMPLATE (,64); +TEMPLATE (u,8); +TEMPLATE (u,16); +TEMPLATE (u,32); +TEMPLATE (u,64); -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */ /* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */ /* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */ -/* { dg-final { scan-assembler-times {\tbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c index ac0d27213e84bb5c7f3d236f3cac59c71ac674ed..da6ac527e8c93e25e69a8db368fba79190b65202 100644 --- 
a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c @@ -5,11 +5,11 @@ #include "bitsel_1.c" -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */ /* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */ /* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */ /* { dg-final { scan-assembler-not {\tnot\tz[0-9]+\.[bhsd]} } } */ -/* { dg-final { scan-assembler-times {\tnbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tnbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c index 93995bb8bade89cd821ed85153d13e96bd4422a5..1036046a8119ef6aa19f7e975c90b2401cc43c0b 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c +++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c @@ -5,10 +5,10 @@ #include "bitsel_1.c" -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */ /* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */ /* { dg-final { scan-assembler-not {\tbic\tz[0-9]+\.[bhsd]} } } */ -/* { dg-final { scan-assembler-times {\tbsl1n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tbsl1n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c index 7ccec619b4d1e8de366c0b0c53879a89a00c2c49..527dcf1a42009f484b2cf3d01e7aeb7448a4d1cc 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c +++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c @@ -5,11 +5,11 @@ #include "bitsel_1.c" -/* { dg-final { 
scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */ /* { dg-final { scan-assembler-not {\torr\tz[0-9]+\.[bhsd]} } } */ /* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */ /* { dg-final { scan-assembler-not {\tnot\tz[0-9]+\.[bhsd]} } } */ -/* { dg-final { scan-assembler-times {\tbsl2n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tbsl2n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c index 551802a0c9f007273ddc68cc4ce77defe700d76e..29a023f9be705dcc67f96e0d2b97f8aef3e3ab4d 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c @@ -5,9 +5,9 @@ #include "bitsel_1.c" -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */ /* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */ -/* { dg-final { scan-assembler-times {\teor3\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ +/* { dg-final { scan-assembler-times {\teor3\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */