Ping. Hi Richard,
> Uh. Well. I think that the gimple-match-head.c hunk isn't something we > want. Instead, > since this optimizes a memory access, the handling should move > to tree-ssa-forwprop.c where you _may_ use a (match ...) > match.pd pattern to do the (rshift (mult (bit_and (negate @1) @1) > matching. It might be the first to use that feature, you need to > declare the function to use it from tree-ssa-forwprop.c. So OK, I've moved it to fwprop, and it works just fine there while still using match.pd to do the idiom matching. Here is the updated version: [PATCH v2] PR90838: Support ctz idioms v2: Use fwprop pass rather than match.pd Support common idioms for count trailing zeroes using an array lookup. The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic constant which when multiplied by a power of 2 contains a unique value in the top 5 or 6 bits. This is then indexed into a table which maps it to the number of trailing zeroes. When the table is valid, we emit a sequence using the target-defined value for ctz (0): int ctz1 (unsigned x) { static const char table[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27]; } Is optimized to: rbit w0, w0 clz w0, w0 and w0, w0, 31 ret Bootstrapped on AArch64. OK for commit? ChangeLog: 2019-11-15 Wilco Dijkstra <wdijk...@arm.com> PR tree-optimization/90838 * tree-ssa-forwprop.c (optimize_count_trailing_zeroes): Add new function. (simplify_count_trailing_zeroes): Add new function. (pass_forwprop::execute): Try ctz simplification. * match.pd: Add matching for ctz idioms. * testsuite/gcc.target/aarch64/pr90838.c: New test.
--- diff --git a/gcc/match.pd b/gcc/match.pd index 6edf54b80012d87dbe7330f5ee638cdba2f9c099..479e9076f0d4deccda54425e93ee4567b85409aa 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -6060,3 +6060,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (simplify (vec_perm vec_same_elem_p@0 @0 @1) @0) + +/* Match count trailing zeroes for simplify_count_trailing_zeroes in fwprop. + The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic + constant which when multiplied by a power of 2 contains a unique value + in the top 5 or 6 bits. This is then indexed into a table which maps it + to the number of trailing zeroes. */ +(match (ctz_table_index @1 @2 @3) + (rshift (mult (bit_and (negate @1) @1) INTEGER_CST@2) INTEGER_CST@3)) diff --git a/gcc/testsuite/gcc.target/aarch64/pr90838.c b/gcc/testsuite/gcc.target/aarch64/pr90838.c new file mode 100644 index 0000000000000000000000000000000000000000..bff3144c0d1b3984016e5a404e986eae785c73ed --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr90838.c @@ -0,0 +1,64 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +int ctz1 (unsigned x) +{ + static const char table[32] = + { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 + }; + + return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27]; +} + +int ctz2 (unsigned x) +{ + const int u = 0; + static short table[64] = + { + 32, 0, 1,12, 2, 6, u,13, 3, u, 7, u, u, u, u,14, + 10, 4, u, u, 8, u, u,25, u, u, u, u, u,21,27,15, + 31,11, 5, u, u, u, u, u, 9, u, u,24, u, u,20,26, + 30, u, u, u, u,23, u,19,29, u,22,18,28,17,16, u + }; + + x = (x & -x) * 0x0450FBAF; + return table[x >> 26]; +} + +int ctz3 (unsigned x) +{ + static int table[32] = + { + 0, 1, 2,24, 3,19, 6,25, 22, 4,20,10,16, 7,12,26, + 31,23,18, 5,21, 9,15,11,30,17, 8,14,29,13,28,27 + }; + + if (x == 0) return 32; + x = (x & -x) * 0x04D7651F; + return table[x >> 27]; +} + +static const unsigned long long magic = 0x03f08c5392f756cdULL; + 
+static const char table[64] = { + 0, 1, 12, 2, 13, 22, 17, 3, + 14, 33, 23, 36, 18, 58, 28, 4, + 62, 15, 34, 26, 24, 48, 50, 37, + 19, 55, 59, 52, 29, 44, 39, 5, + 63, 11, 21, 16, 32, 35, 57, 27, + 61, 25, 47, 49, 54, 51, 43, 38, + 10, 20, 31, 56, 60, 46, 53, 42, + 9, 30, 45, 41, 8, 40, 7, 6, +}; + +int ctz4 (unsigned long x) +{ + unsigned long lsb = x & -x; + return table[(lsb * magic) >> 58]; +} + +/* { dg-final { scan-assembler-times "clz\t" 4 } } */ +/* { dg-final { scan-assembler-times "and\t" 2 } } */ +/* { dg-final { scan-assembler-not "cmp\t.*0" } } */ diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c index fe55ca958b49b986f79a9a710d92b5d906959105..a632d54712be55f8070c9816e3c3702d4a493182 100644 --- a/gcc/tree-ssa-forwprop.c +++ b/gcc/tree-ssa-forwprop.c @@ -48,6 +48,7 @@ along with GCC; see the file COPYING3. If not see #include "optabs-tree.h" #include "tree-vector-builder.h" #include "vec-perm-indices.h" +#include "internal-fn.h" /* This pass propagates the RHS of assignment statements into use sites of the LHS of the assignment. It's basically a specialized @@ -1778,6 +1779,126 @@ simplify_rotate (gimple_stmt_iterator *gsi) return true; } + +/* Recognize count trailing zeroes idiom. + The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic + constant which when multiplied by a power of 2 contains a unique value + in the top 5 or 6 bits. This is then indexed into a table which maps it + to the number of trailing zeroes. Array[0] is returned so the caller can + emit an appropriate sequence depending on whether ctz (0) is defined on + the target. 
*/ +static bool +optimize_count_trailing_zeroes (tree type, tree array, tree x, tree mulc, + tree tshift, tree &zero_val) +{ + gcc_assert (TREE_CODE (mulc) == INTEGER_CST); + gcc_assert (TREE_CODE (tshift) == INTEGER_CST); + + tree input_type = TREE_TYPE (x); + + if (!direct_internal_fn_supported_p (IFN_CTZ, input_type, OPTIMIZE_FOR_BOTH)) + return false; + + unsigned HOST_WIDE_INT val = tree_to_uhwi (mulc); + unsigned shiftval = tree_to_uhwi (tshift); + unsigned input_bits = tree_to_shwi (TYPE_SIZE (input_type)); + + /* Check the array is not wider than integer type and the input is a 32-bit + or 64-bit type. The shift should extract the top 5..7 bits. */ + if (TYPE_PRECISION (type) > 32) + return false; + if (input_bits != 32 && input_bits != 64) + return false; + if (shiftval < input_bits - 7 || shiftval > input_bits - 5) + return false; + + tree t = build4 (ARRAY_REF, type, array, size_int (0), NULL_TREE, NULL_TREE); + t = fold_const_aggregate_ref (t); + if (t == NULL) + return false; + + zero_val = build_int_cst (integer_type_node, tree_to_shwi (t)); + + for (unsigned i = 0; i < input_bits; i++, val <<= 1) + { + if (input_bits == 32) + val &= 0xffffffff; + t = build4 (ARRAY_REF, type, array, size_int ((int)(val >> shiftval)), + NULL_TREE, NULL_TREE); + t = fold_const_aggregate_ref (t); + if (t == NULL || tree_to_shwi (t) != i) + return false; + } + + return true; +} + +/* Match.pd function to match the ctz expression. 
*/ +extern bool gimple_ctz_table_index (tree, tree *, tree (*)(tree)); + +static bool +simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi) +{ + gimple *stmt = gsi_stmt (*gsi); + tree array_ref = gimple_assign_rhs1 (stmt); + tree res_ops[3]; + tree zero_val; + + gcc_checking_assert (TREE_CODE (array_ref) == ARRAY_REF); + + if (!gimple_ctz_table_index (TREE_OPERAND (array_ref, 1), &res_ops[0], NULL)) + return false; + + if (optimize_count_trailing_zeroes (TREE_TYPE (array_ref), + TREE_OPERAND (array_ref, 0), res_ops[0], + res_ops[1], res_ops[2], zero_val)) + { + tree lhs = gimple_assign_lhs (stmt); + tree type = TREE_TYPE (res_ops[0]); + HOST_WIDE_INT val; + HOST_WIDE_INT type_size = tree_to_shwi (TYPE_SIZE (type)); + tree lhs = gimple_assign_lhs (stmt); + bool zero_ok = CTZ_DEFINED_VALUE_AT_ZERO (TYPE_MODE (type), val); + bool need_convert = !useless_type_conversion_p (TREE_TYPE (lhs), + integer_type_node); + + gcall *call = gimple_build_call_internal (IFN_CTZ, 1, res_ops[0]); + gimple_set_location (call, gimple_location (stmt)); + gimple_set_lhs (call, make_ssa_name (integer_type_node)); + + gimple *g = call; + tree prev_lhs = gimple_call_lhs (call); + + /* Emit ctz (x) & 31 if ctz (0) is 32 but we need to return 0. */ + if (zero_ok && val == type_size && integer_zerop (zero_val)) + { + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (make_ssa_name (integer_type_node), + BIT_AND_EXPR, prev_lhs, + build_int_cst (integer_type_node, + type_size - 1)); + gimple_set_location (g, gimple_location (stmt)); + prev_lhs = gimple_assign_lhs (g); + } + else if (!zero_ok || tree_to_shwi (zero_val) != val) + return false; + + if (need_convert) + { + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, NOP_EXPR, prev_lhs); + } + else + gimple_set_lhs (g, lhs); + + gsi_replace (gsi, g, false); + return true; + } + + return false; +} + + /* Combine an element access with a shuffle. 
Returns true if there were any changes made, else it returns false. */ @@ -2759,6 +2880,8 @@ pass_forwprop::execute (function *fun) else if (code == CONSTRUCTOR && TREE_CODE (TREE_TYPE (rhs1)) == VECTOR_TYPE) changed = simplify_vector_constructor (&gsi); + else if (code == ARRAY_REF) + changed = simplify_count_trailing_zeroes (&gsi); break; }