On Tue, Aug 13, 2019 at 8:50 AM Wilco Dijkstra <wilco.dijks...@arm.com> wrote:
>
> Add simplifications for popcount (x) > 1 to (x & (x-1)) != 0 and
> popcount (x) == 1 into (x-1) <u (x & -x).  These trigger only for
> single-use cases and support an optional convert.  A microbenchmark
> shows a speedup of 2-2.5x on both x64 and AArch64.
>
> Bootstrap OK, OK for commit?

I think this should be in expand stage where there could be comparison
of the cost of the RTLs.
The only reason why it is faster for AARCH64 is the requirement of
moving between the GPRs and the SIMD registers.

Thanks,
Andrew Pinski

>
> ChangeLog:
> 2019-08-13  Wilco Dijkstra  <wdijk...@arm.com>
>
> gcc/
>         PR middle-end/90693
>         * match.pd: Add popcount simplifications.
>
> testsuite/
>         PR middle-end/90693
>         * gcc.dg/fold-popcount-5.c: Add new test.
>
> ---
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 
> 0317bc704f771f626ab72189b3a54de00087ad5a..bf4351a330f45f3a1424d9792cefc3da6267597d
>  100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -5356,7 +5356,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>         rep (eq eq ne ne)
>      (simplify
>        (cmp (popcount @0) integer_zerop)
> -      (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))
> +      (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))
> +  /* popcount(X) == 1 -> (X-1) <u (X & -X).  */
> +  (for cmp (eq ne)
> +       rep (lt ge)
> +    (simplify
> +      (cmp (convert? (popcount:s @0)) integer_onep)
> +      (with {
> +             tree utype = unsigned_type_for (TREE_TYPE (@0));
> +             tree a0 = fold_convert (utype, @0); }
> +       (rep (plus { a0; } { build_minus_one_cst (utype); })
> +            (bit_and (negate { a0; }) { a0; })))))
> +  /* popcount(X) > 1 -> (X & (X-1)) != 0.  */
> +  (for cmp (gt le)
> +       rep (ne eq)
> +    (simplify
> +      (cmp (convert? (popcount:s @0)) integer_onep)
> +      (rep (bit_and (plus @0 { build_minus_one_cst (TREE_TYPE (@0)); }) @0)
> +          { build_zero_cst (TREE_TYPE (@0)); }))))
>
>  /* Simplify:
>
> diff --git a/gcc/testsuite/gcc.dg/fold-popcount-5.c 
> b/gcc/testsuite/gcc.dg/fold-popcount-5.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..fcf3910587caacb8e39cf437dc3971df892f405a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/fold-popcount-5.c
> @@ -0,0 +1,69 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +/* Test popcount (x) > 1 -> (x & (x-1)) != 0.  */
> +
> +int test_1 (long x)
> +{
> +  return __builtin_popcountl (x) >= 2;
> +}
> +
> +int test_2 (int x)
> +{
> +  return (unsigned) __builtin_popcount (x) <= 1u;
> +}
> +
> +int test_3 (unsigned x)
> +{
> +  return __builtin_popcount (x) > 1u;
> +}
> +
> +int test_4 (unsigned long x)
> +{
> +  return (unsigned char) __builtin_popcountl (x) > 1;
> +}
> +
> +int test_5 (unsigned long x)
> +{
> +  return (signed char) __builtin_popcountl (x) <= (signed char)1;
> +}
> +
> +int test_6 (unsigned long long x)
> +{
> +  return 2u <= __builtin_popcountll (x);
> +}
> +
> +/* Test popcount (x) == 1 -> (x-1) <u (x & -x).  */
> +
> +int test_7 (unsigned long x)
> +{
> +  return __builtin_popcountl (x) != 1;
> +}
> +
> +int test_8 (long long x)
> +{
> +  return (unsigned) __builtin_popcountll (x) == 1u;
> +}
> +
> +int test_9 (int x)
> +{
> +  return (unsigned char) __builtin_popcount (x) != 1u;
> +}
> +
> +int test_10 (unsigned x)
> +{
> +  return (unsigned char) __builtin_popcount (x) == 1;
> +}
> +
> +int test_11 (long x)
> +{
> +  return (signed char) __builtin_popcountl (x) == 1;
> +}
> +
> +int test_12 (long x)
> +{
> +  return 1u == __builtin_popcountl (x);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "popcount" 0 "optimized" } } */
> +

Reply via email to