This patch complements one from June 12th which is still awaiting review: https://gcc.gnu.org/pipermail/gcc-patches/2020-June/547937.html
This patch optimizes popcount and parity of an argument known to have at most a single bit set, to be that single bit. Hence, popcount(x&8) is simplified to (x>>3)&1. This generalizes the existing optimization of popcount(x&1) being simplified to x&1, which is moved with this patch to avoid a duplicate pattern warning in match.pd. This patch has been tested on x86_64-pc-linux-gnu with a "make bootstrap" and "make -k check" with no new failures. If this is approved after (or at the same time) as the patch above, I'm happy to resolve the conflicts and retest before committing. 2020-07-20 Roger Sayle <ro...@nextmovesoftware.com> gcc/ChangeLog * match.pd (popcount(x) -> x>>C): New simplification. gcc/testsuite * gcc.dg/fold-popcount-5.c: New test. * gcc.dg/fold-parity-5.c: Likewise. Ok for mainline? Thanks in advance, Roger -- Roger Sayle NextMove Software Cambridge, UK
diff --git a/gcc/match.pd b/gcc/match.pd index c6ae7a7..0b3b626 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -5966,11 +5966,6 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) /* POPCOUNT simplifications. */ (for popcount (BUILT_IN_POPCOUNT BUILT_IN_POPCOUNTL BUILT_IN_POPCOUNTLL BUILT_IN_POPCOUNTIMAX) - /* popcount(X&1) is nop_expr(X&1). */ - (simplify - (popcount @0) - (if (tree_nonzero_bits (@0) == 1) - (convert @0))) /* popcount(X) + popcount(Y) is popcount(X|Y) when X&Y must be zero. */ (simplify (plus (popcount:s @0) (popcount:s @1)) @@ -5983,6 +5978,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (cmp (popcount @0) integer_zerop) (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))) +/* popcount(X&C1) is (X>>C2)&1 when C1 == 1<<C2. Same for parity(X&C1). */ +(for pfun (BUILT_IN_POPCOUNT BUILT_IN_PARITY + BUILT_IN_POPCOUNTL BUILT_IN_PARITYL + BUILT_IN_POPCOUNTLL BUILT_IN_PARITYLL + BUILT_IN_POPCOUNTIMAX BUILT_IN_PARITYIMAX) + (simplify + (pfun @0) + (with { wide_int nz = tree_nonzero_bits (@0); } + (switch + (if (nz == 1) + (convert @0)) + (if (wi::popcount (nz) == 1) + (with { tree utype = unsigned_type_for (TREE_TYPE (@0)); + int bits = wi::ctz (nz); } + (convert (rshift:utype (convert:utype @0) + { build_int_cst (utype, bits); })))))))) + #if GIMPLE /* 64- and 32-bits branchless implementations of popcount are detected: diff --git a/gcc/testsuite/gcc.dg/fold-popcount-5.c b/gcc/testsuite/gcc.dg/fold-popcount-5.c new file mode 100644 index 0000000..943726f --- /dev/null +++ b/gcc/testsuite/gcc.dg/fold-popcount-5.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +int test_and4(unsigned int a) +{ + return __builtin_popcount(a&4); +} + +int test_and4l(unsigned long b) +{ + return __builtin_popcountl(b&4); +} + +int test_and4ll(unsigned long long c) +{ + return __builtin_popcountll(c&4); +} + +int test_shift(unsigned int d) +{ + int bits = 8*sizeof(unsigned int)-1; + return __builtin_popcount(d<<31); +} + +int test_shiftl(unsigned long e) +{ + int bits = 8*sizeof(unsigned long)-1; + return __builtin_popcountl(e<<bits); +} + +int test_shiftll(unsigned long long f) +{ + int bits = 8*sizeof(unsigned long long)-1; + return __builtin_popcountll(f<<bits); +} + +/* { dg-final { scan-tree-dump-times "popcount" 0 "optimized" } } */ + diff --git a/gcc/testsuite/gcc.dg/fold-parity-5.c b/gcc/testsuite/gcc.dg/fold-parity-5.c new file mode 100644 index 0000000..69d3a6a --- /dev/null +++ b/gcc/testsuite/gcc.dg/fold-parity-5.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +int test_and4(unsigned int a) +{ + return __builtin_parity(a&4); +} + +int test_and4l(unsigned long b) +{ + return __builtin_parityl(b&4); +} + +int test_and4ll(unsigned long long c) +{ + return __builtin_parityll(c&4); +} + +int test_shift(unsigned int d) +{ + int bits = 8*sizeof(unsigned int)-1; + return __builtin_parity(d<<31); +} + +int test_shiftl(unsigned long e) +{ + int bits = 8*sizeof(unsigned long)-1; + return __builtin_parityl(e<<bits); +} + +int test_shiftll(unsigned long long f) +{ + int bits = 8*sizeof(unsigned long long)-1; + return __builtin_parityll(f<<bits); +} + +/* { dg-final { scan-tree-dump-times "parity" 0 "optimized" } } */ +