On Tue, Nov 12, 2019 at 3:36 PM Wilco Dijkstra <wilco.dijks...@arm.com> wrote:
>
> Hi,
>
> Support common idioms for count trailing zeroes using an array lookup.
> The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic
> constant which when multiplied by a power of 2 contains a unique value
> in the top 5 or 6 bits.  This is then indexed into a table which maps it
> to the number of trailing zeroes.  When the table is valid, we emit a
> sequence using the target defined value for ctz (0):
>
> int ctz1 (unsigned x)
> {
>   static const char table[32] =
>     {
>       0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
>       31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
>     };
>
>   return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27];
> }
>
> Is optimized to:
>
>         rbit    w0, w0
>         clz     w0, w0
>         and     w0, w0, 31
>         ret
>
> Bootstrapped on AArch64. OK for commit?

Uh.  Well.  I think that the gimple-match-head.c hunk isn't something we
want.  Instead, since this optimizes a memory access, the handling should
move to tree-ssa-forwprop.c, where you _may_ use a (match ...) match.pd
pattern to do the (rshift (mult (bit_and (negate @1) @1) ...) ...)
matching.  It might be the first user of that feature, so you need to
declare the generated function in order to use it from
tree-ssa-forwprop.c.  So

(match (clz_table_index @1 @2 @3)
  (rshift (mult (bit_and (negate @1) @1) INTEGER_CST@2)
                       INTEGER_CST@3))

in match.pd and then

extern bool gimple_clz_table_index (tree, tree *, tree (*)(tree));

and use it like

 tree res_ops[3];
 if (gimple_clz_table_index (TREE_OPERAND (array_ref, 1), res_ops, NULL))
   {
     /* @1, @2 and @3 are now in res_ops[0], res_ops[1] and res_ops[2].  */
   }

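Btw, the bit_and probably needs :c so that both operand orders, i.e.
(bit_and (negate @1) @1) and (bit_and @1 (negate @1)), are matched.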

Thanks,
Richard.

> ChangeLog:
>
> 2019-11-12  Wilco Dijkstra  <wdijk...@arm.com>
>
>         PR tree-optimization/90838
>         * generic-match-head.c (optimize_count_trailing_zeroes):
>         Add stub function.
>         * gimple-match-head.c (gimple_simplify): Add support for ARRAY_REF.
>         (optimize_count_trailing_zeroes): Add new function.
>         * match.pd: Add matching for ctz idioms.
>         * testsuite/gcc.target/aarch64/pr90838.c: New test.
>
> --
>
> diff --git a/gcc/generic-match-head.c b/gcc/generic-match-head.c
> index fdc603977fc5b03a843944f75ce262f5d2256308..5a38bd233585225d60f0159c9042a16d9fdc9d80 100644
> --- a/gcc/generic-match-head.c
> +++ b/gcc/generic-match-head.c
> @@ -88,3 +88,10 @@ optimize_successive_divisions_p (tree, tree)
>  {
>    return false;
>  }
> +
> +static bool
> +optimize_count_trailing_zeroes (tree type, tree array_ref, tree input,
> +                               tree mulc, tree shift, tree &zero_val)
> +{
> +  return false;
> +}
> diff --git a/gcc/gimple-match-head.c b/gcc/gimple-match-head.c
> index 53278168a59f5ac10ce6760f04fd42589a0792e7..2d3b305f8ea54e4ca31c64994af30b34bb7eff09 100644
> --- a/gcc/gimple-match-head.c
> +++ b/gcc/gimple-match-head.c
> @@ -909,6 +909,24 @@ gimple_simplify (gimple *stmt, gimple_match_op *res_op, gimple_seq *seq,
>                 res_op->set_op (TREE_CODE (op0), type, valueized);
>                 return true;
>               }
> +           else if (code == ARRAY_REF)
> +             {
> +               tree rhs1 = gimple_assign_rhs1 (stmt);
> +               tree op1 = TREE_OPERAND (rhs1, 1);
> +               tree op2 = TREE_OPERAND (rhs1, 2);
> +               tree op3 = TREE_OPERAND (rhs1, 3);
> +               tree op0 = TREE_OPERAND (rhs1, 0);
> +               bool valueized = false;
> +
> +               op0 = do_valueize (op0, top_valueize, valueized);
> +               op1 = do_valueize (op1, top_valueize, valueized);
> +
> +               if (op2 && op3)
> +                 res_op->set_op (code, type, op0, op1, op2, op3);
> +               else
> +                 res_op->set_op (code, type, op0, op1);
> +               return gimple_resimplify4 (seq, res_op, valueize) || valueized;
> +             }
>             break;
>           case GIMPLE_UNARY_RHS:
>             {
> @@ -1222,3 +1240,57 @@ optimize_successive_divisions_p (tree divisor, tree inner_div)
>      }
>    return true;
>  }
> +
> +/* Recognize count trailing zeroes idiom.
> +   The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic
> +   constant which when multiplied by a power of 2 contains a unique value
> +   in the top 5 or 6 bits.  This is then indexed into a table which maps it
> +   to the number of trailing zeroes.  Array[0] is returned so the caller can
> +   emit an appropriate sequence depending on whether ctz (0) is defined on
> +   the target.  */
> +static bool
> +optimize_count_trailing_zeroes (tree type, tree array, tree x, tree mulc,
> +                               tree tshift, tree &zero_val)
> +{
> +  gcc_assert (TREE_CODE (mulc) == INTEGER_CST);
> +  gcc_assert (TREE_CODE (tshift) == INTEGER_CST);
> +
> +  tree input_type = TREE_TYPE (x);
> +
> +  if (!direct_internal_fn_supported_p (IFN_CTZ, input_type, OPTIMIZE_FOR_BOTH))
> +    return false;
> +
> +  unsigned HOST_WIDE_INT val = tree_to_uhwi (mulc);
> +  unsigned shiftval = tree_to_uhwi (tshift);
> +  unsigned input_bits = tree_to_shwi (TYPE_SIZE (input_type));
> +
> +  /* Check the array is not wider than integer type and the input is a 32-bit
> +     or 64-bit type.  The shift should extract the top 5..7 bits.  */
> +  if (TYPE_PRECISION (type) > 32)
> +    return false;
> +  if (input_bits != 32 && input_bits != 64)
> +    return false;
> +  if (shiftval < input_bits - 7 || shiftval > input_bits - 5)
> +    return false;
> +
> +  tree t = build4 (ARRAY_REF, type, array, size_int (0), NULL_TREE, NULL_TREE);
> +  t = fold_const_aggregate_ref (t);
> +  if (t == NULL)
> +    return false;
> +
> +  zero_val = build_int_cst (integer_type_node, tree_to_shwi (t));
> +
> +  for (unsigned i = 0; i < input_bits; i++, val <<= 1)
> +    {
> +      if (input_bits == 32)
> +       val &= 0xffffffff;
> +      t = build4 (ARRAY_REF, type, array, size_int ((int)(val >> shiftval)),
> +                 NULL_TREE, NULL_TREE);
> +      t = fold_const_aggregate_ref (t);
> +      if (t == NULL || tree_to_shwi (t) != i)
> +       return false;
> +    }
> +
> +  return true;
> +}
> +
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 6edf54b80012d87dbe7330f5ee638cdba2f9c099..bbe935e1e2af35e8e953a776eb3ecfb83414b047 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -6060,3 +6060,33 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  (simplify
>   (vec_perm vec_same_elem_p@0 @0 @1)
>   @0)
> +
> +/* Recognize count trailing zeroes idiom.
> +   The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic
> +   constant which when multiplied by a power of 2 contains a unique value
> +   in the top 5 or 6 bits.  This is then indexed into a table which maps it
> +   to the number of trailing zeroes.  If valid, emit an optimal sequence
> +   depending on the result for zero.
> +*/
> +(simplify
> + (ARRAY_REF @0 (rshift (mult (bit_and (negate @1) @1) INTEGER_CST@2)
> +                       INTEGER_CST@3) @4 @5)
> + (with
> +  { tree zero_val;
> +    HOST_WIDE_INT val;
> +    HOST_WIDE_INT type_size = tree_to_shwi (TYPE_SIZE (TREE_TYPE (@1)));
> +    bool zero_ok = CTZ_DEFINED_VALUE_AT_ZERO (TYPE_MODE (TREE_TYPE (@1)), val);
> +  }
> +  (if (optimize_count_trailing_zeroes (type, @0, @1, @2, @3, zero_val))
> +   (switch
> +    (if (zero_ok && tree_to_shwi (zero_val) == val)
> +     (convert (BUILT_IN_CTZ:integer_type_node @1)))
> +
> +    /* Emit ctz (x) & 31 if ctz (0) is 32 but we need to return 0.  */
> +    (if (zero_ok && val == type_size && integer_zerop (zero_val))
> +     (convert (bit_and (BUILT_IN_CTZ:integer_type_node @1)
> +       { build_int_cst (integer_type_node, type_size - 1); })))
> +
> +    /* Emit (x ? ctz (x) : zero_val).  */
> +    (if (true)
> +     (convert (cond @1 (BUILT_IN_CTZ:integer_type_node @1) { zero_val; } )))))))
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr90838.c b/gcc/testsuite/gcc.target/aarch64/pr90838.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..bff3144c0d1b3984016e5a404e986eae785c73ed
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr90838.c
> @@ -0,0 +1,64 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +int ctz1 (unsigned x)
> +{
> +  static const char table[32] =
> +    {
> +      0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
> +      31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
> +    };
> +
> +  return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27];
> +}
> +
> +int ctz2 (unsigned x)
> +{
> +  const int u = 0;
> +  static short table[64] =
> +    {
> +      32, 0, 1,12, 2, 6, u,13, 3, u, 7, u, u, u, u,14,
> +      10, 4, u, u, 8, u, u,25, u, u, u, u, u,21,27,15,
> +      31,11, 5, u, u, u, u, u, 9, u, u,24, u, u,20,26,
> +      30, u, u, u, u,23, u,19,29, u,22,18,28,17,16, u
> +    };
> +
> +  x = (x & -x) * 0x0450FBAF;
> +  return table[x >> 26];
> +}
> +
> +int ctz3 (unsigned x)
> +{
> +  static int table[32] =
> +    {
> +      0, 1, 2,24, 3,19, 6,25, 22, 4,20,10,16, 7,12,26,
> +      31,23,18, 5,21, 9,15,11,30,17, 8,14,29,13,28,27
> +    };
> +
> +  if (x == 0) return 32;
> +  x = (x & -x) * 0x04D7651F;
> +  return table[x >> 27];
> +}
> +
> +static const unsigned long long magic = 0x03f08c5392f756cdULL;
> +
> +static const char table[64] = {
> +     0,  1, 12,  2, 13, 22, 17,  3,
> +    14, 33, 23, 36, 18, 58, 28,  4,
> +    62, 15, 34, 26, 24, 48, 50, 37,
> +    19, 55, 59, 52, 29, 44, 39,  5,
> +    63, 11, 21, 16, 32, 35, 57, 27,
> +    61, 25, 47, 49, 54, 51, 43, 38,
> +    10, 20, 31, 56, 60, 46, 53, 42,
> +     9, 30, 45, 41,  8, 40,  7,  6,
> +};
> +
> +int ctz4 (unsigned long x)
> +{
> +  unsigned long lsb = x & -x;
> +  return table[(lsb * magic) >> 58];
> +}
> +
> +/* { dg-final { scan-assembler-times "clz\t" 4 } } */
> +/* { dg-final { scan-assembler-times "and\t" 2 } } */
> +/* { dg-final { scan-assembler-not "cmp\t.*0" } } */

Reply via email to