On Tue, Nov 12, 2019 at 3:36 PM Wilco Dijkstra <wilco.dijks...@arm.com> wrote: > > Hi, > > Support common idioms for count trailing zeroes using an array lookup. > The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic > constant which when multiplied by a power of 2 contains a unique value > in the top 5 or 6 bits. This is then indexed into a table which maps it > to the number of trailing zeroes. When the table is valid, we emit a > sequence using the target defined value for ctz (0): > > int ctz1 (unsigned x) > { > static const char table[32] = > { > 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, > 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 > }; > > return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27]; > } > > Is optimized to: > > rbit w0, w0 > clz w0, w0 > and w0, w0, 31 > ret > > Bootstrapped on AArch64. OK for commit?
Uh. Well. I think that the gimple-match-head.c hunk isn't something we want. Instead, since this optimizes a memory access, the handling should move to tree-ssa-forwprop.c where you _may_ use a (match ...) match.pd pattern to do the (rshift (mult (bit_and (negate @1) @1) ...) ...) matching. It might be the first to use that feature, so you will need to declare the function to use it from tree-ssa-forwprop.c. So (match (clz_table_index @1 @2 @3) (rshift (mult (bit_and (negate @1) @1) INTEGER_CST@2) INTEGER_CST@3)) in match.pd and then extern bool gimple_clz_table_index (tree, tree *, tree (*)(tree)); and use it like tree res_ops[3]; if (gimple_clz_table_index (TREE_OPERAND (array_ref, 1), res_ops, NULL)) { @1 @2 and @3 are now in res_ops } Btw, the bit_and probably needs :c so that both operand orders of the commutative bit_and are matched. Thanks, Richard. > ChangeLog: > > 2019-11-12 Wilco Dijkstra <wdijk...@arm.com> > > PR tree-optimization/90838 > * generic-match-head.c (optimize_count_trailing_zeroes): > Add stub function. > * gimple-match-head.c (gimple_simplify): Add support for ARRAY_REF. > (optimize_count_trailing_zeroes): Add new function. > * match.pd: Add matching for ctz idioms. > * testsuite/gcc.target/aarch64/pr90838.c: New test. 
> > -- > > diff --git a/gcc/generic-match-head.c b/gcc/generic-match-head.c > index > fdc603977fc5b03a843944f75ce262f5d2256308..5a38bd233585225d60f0159c9042a16d9fdc9d80 > 100644 > --- a/gcc/generic-match-head.c > +++ b/gcc/generic-match-head.c > @@ -88,3 +88,10 @@ optimize_successive_divisions_p (tree, tree) > { > return false; > } > + > +static bool > +optimize_count_trailing_zeroes (tree type, tree array_ref, tree input, > + tree mulc, tree shift, tree &zero_val) > +{ > + return false; > +} > diff --git a/gcc/gimple-match-head.c b/gcc/gimple-match-head.c > index > 53278168a59f5ac10ce6760f04fd42589a0792e7..2d3b305f8ea54e4ca31c64994af30b34bb7eff09 > 100644 > --- a/gcc/gimple-match-head.c > +++ b/gcc/gimple-match-head.c > @@ -909,6 +909,24 @@ gimple_simplify (gimple *stmt, gimple_match_op *res_op, > gimple_seq *seq, > res_op->set_op (TREE_CODE (op0), type, valueized); > return true; > } > + else if (code == ARRAY_REF) > + { > + tree rhs1 = gimple_assign_rhs1 (stmt); > + tree op1 = TREE_OPERAND (rhs1, 1); > + tree op2 = TREE_OPERAND (rhs1, 2); > + tree op3 = TREE_OPERAND (rhs1, 3); > + tree op0 = TREE_OPERAND (rhs1, 0); > + bool valueized = false; > + > + op0 = do_valueize (op0, top_valueize, valueized); > + op1 = do_valueize (op1, top_valueize, valueized); > + > + if (op2 && op3) > + res_op->set_op (code, type, op0, op1, op2, op3); > + else > + res_op->set_op (code, type, op0, op1); > + return gimple_resimplify4 (seq, res_op, valueize) || > valueized; > + } > break; > case GIMPLE_UNARY_RHS: > { > @@ -1222,3 +1240,57 @@ optimize_successive_divisions_p (tree divisor, tree > inner_div) > } > return true; > } > + > +/* Recognize count trailing zeroes idiom. > + The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic > + constant which when multiplied by a power of 2 contains a unique value > + in the top 5 or 6 bits. This is then indexed into a table which maps it > + to the number of trailing zeroes. 
Array[0] is returned so the caller can > + emit an appropriate sequence depending on whether ctz (0) is defined on > + the target. */ > +static bool > +optimize_count_trailing_zeroes (tree type, tree array, tree x, tree mulc, > + tree tshift, tree &zero_val) > +{ > + gcc_assert (TREE_CODE (mulc) == INTEGER_CST); > + gcc_assert (TREE_CODE (tshift) == INTEGER_CST); > + > + tree input_type = TREE_TYPE (x); > + > + if (!direct_internal_fn_supported_p (IFN_CTZ, input_type, > OPTIMIZE_FOR_BOTH)) > + return false; > + > + unsigned HOST_WIDE_INT val = tree_to_uhwi (mulc); > + unsigned shiftval = tree_to_uhwi (tshift); > + unsigned input_bits = tree_to_shwi (TYPE_SIZE (input_type)); > + > + /* Check the array is not wider than integer type and the input is a 32-bit > + or 64-bit type. The shift should extract the top 5..7 bits. */ > + if (TYPE_PRECISION (type) > 32) > + return false; > + if (input_bits != 32 && input_bits != 64) > + return false; > + if (shiftval < input_bits - 7 || shiftval > input_bits - 5) > + return false; > + > + tree t = build4 (ARRAY_REF, type, array, size_int (0), NULL_TREE, > NULL_TREE); > + t = fold_const_aggregate_ref (t); > + if (t == NULL) > + return false; > + > + zero_val = build_int_cst (integer_type_node, tree_to_shwi (t)); > + > + for (unsigned i = 0; i < input_bits; i++, val <<= 1) > + { > + if (input_bits == 32) > + val &= 0xffffffff; > + t = build4 (ARRAY_REF, type, array, size_int ((int)(val >> shiftval)), > + NULL_TREE, NULL_TREE); > + t = fold_const_aggregate_ref (t); > + if (t == NULL || tree_to_shwi (t) != i) > + return false; > + } > + > + return true; > +} > + > diff --git a/gcc/match.pd b/gcc/match.pd > index > 6edf54b80012d87dbe7330f5ee638cdba2f9c099..bbe935e1e2af35e8e953a776eb3ecfb83414b047 > 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -6060,3 +6060,33 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > (simplify > (vec_perm vec_same_elem_p@0 @0 @1) > @0) > + > +/* Recognize count trailing zeroes idiom. 
> + The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic > + constant which when multiplied by a power of 2 contains a unique value > + in the top 5 or 6 bits. This is then indexed into a table which maps it > + to the number of trailing zeroes. If valid, emit an optimal sequence > + depending on the result for zero. > +*/ > +(simplify > + (ARRAY_REF @0 (rshift (mult (bit_and (negate @1) @1) INTEGER_CST@2) > + INTEGER_CST@3) @4 @5) > + (with > + { tree zero_val; > + HOST_WIDE_INT val; > + HOST_WIDE_INT type_size = tree_to_shwi (TYPE_SIZE (TREE_TYPE (@1))); > + bool zero_ok = CTZ_DEFINED_VALUE_AT_ZERO (TYPE_MODE (TREE_TYPE (@1)), > val); > + } > + (if (optimize_count_trailing_zeroes (type, @0, @1, @2, @3, zero_val)) > + (switch > + (if (zero_ok && tree_to_shwi (zero_val) == val) > + (convert (BUILT_IN_CTZ:integer_type_node @1))) > + > + /* Emit ctz (x) & 31 if ctz (0) is 32 but we need to return 0. */ > + (if (zero_ok && val == type_size && integer_zerop (zero_val)) > + (convert (bit_and (BUILT_IN_CTZ:integer_type_node @1) > + { build_int_cst (integer_type_node, type_size - 1); }))) > + > + /* Emit (x ? ctz (x) : zero_val). 
*/ > + (if (true) > + (convert (cond @1 (BUILT_IN_CTZ:integer_type_node @1) { zero_val; } > ))))))) > diff --git a/gcc/testsuite/gcc.target/aarch64/pr90838.c > b/gcc/testsuite/gcc.target/aarch64/pr90838.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..bff3144c0d1b3984016e5a404e986eae785c73ed > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/pr90838.c > @@ -0,0 +1,64 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > + > +int ctz1 (unsigned x) > +{ > + static const char table[32] = > + { > + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, > + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 > + }; > + > + return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27]; > +} > + > +int ctz2 (unsigned x) > +{ > + const int u = 0; > + static short table[64] = > + { > + 32, 0, 1,12, 2, 6, u,13, 3, u, 7, u, u, u, u,14, > + 10, 4, u, u, 8, u, u,25, u, u, u, u, u,21,27,15, > + 31,11, 5, u, u, u, u, u, 9, u, u,24, u, u,20,26, > + 30, u, u, u, u,23, u,19,29, u,22,18,28,17,16, u > + }; > + > + x = (x & -x) * 0x0450FBAF; > + return table[x >> 26]; > +} > + > +int ctz3 (unsigned x) > +{ > + static int table[32] = > + { > + 0, 1, 2,24, 3,19, 6,25, 22, 4,20,10,16, 7,12,26, > + 31,23,18, 5,21, 9,15,11,30,17, 8,14,29,13,28,27 > + }; > + > + if (x == 0) return 32; > + x = (x & -x) * 0x04D7651F; > + return table[x >> 27]; > +} > + > +static const unsigned long long magic = 0x03f08c5392f756cdULL; > + > +static const char table[64] = { > + 0, 1, 12, 2, 13, 22, 17, 3, > + 14, 33, 23, 36, 18, 58, 28, 4, > + 62, 15, 34, 26, 24, 48, 50, 37, > + 19, 55, 59, 52, 29, 44, 39, 5, > + 63, 11, 21, 16, 32, 35, 57, 27, > + 61, 25, 47, 49, 54, 51, 43, 38, > + 10, 20, 31, 56, 60, 46, 53, 42, > + 9, 30, 45, 41, 8, 40, 7, 6, > +}; > + > +int ctz4 (unsigned long x) > +{ > + unsigned long lsb = x & -x; > + return table[(lsb * magic) >> 58]; > +} > + > +/* { dg-final { scan-assembler-times "clz\t" 4 } } */ > +/* { dg-final { scan-assembler-times 
"and\t" 2 } } */ > +/* { dg-final { scan-assembler-not "cmp\t.*0" } } */