On Fri, Apr 10, 2026 at 3:44 PM Philipp Tomsich <[email protected]> wrote: > > Recognize a variant CLZ idiom where the OR-cascade is followed by > (value - (value >> 1)) to isolate the MSB as a power of two (2^k), > then a DeBruijn multiply-and-shift maps 2^k back to k: > > value |= value >> 1; > ... > value |= value >> 32; > result = table[((value - (value >> 1)) * MAGIC) >> 58]; > > After the OR-cascade value equals 2^(k+1) - 1, so (value - (value >> 1)) > equals 2^k. The multiply-and-shift is therefore a CTZ-style DeBruijn > lookup, and the table satisfies table[(magic << k) >> shift] == k. > > Add match.pd pattern clz_msb_iso_table_index matching the full chain. > In simplify_count_zeroes, validate the table with the existing CTZ > checkfn (which implements the direct-form check) while still emitting > IFN_CLZ code -- both forms store MSB positions, so the existing CLZ > emission path including zero_val pre-compensation works unchanged. > > Also relax the element-type check from "precision <= 32" to "integral > and precision <= 64" so tables declared as unsigned long (64-bit on > LP64) are handled. Table values are bit positions in [0, input_bits - 1] > and fit in any integer type with at least 6 bits of precision. > > Only a 64-bit variant is added: all known uses of this idiom > (Stockfish, zstd, cpython, the PR122569 comment 3 reproducer) are > 64-bit. A 32-bit variant is a straightforward copy if a real-world > case appears.
I wonder if we can factor out matching the shift cascading? Basically match (b * magic) >> 58 and (value - (value >> 1)) and then match b / value to the cascading? > gcc/ChangeLog: > > PR tree-optimization/122569 > * match.pd (clz_msb_iso_table_index): New match pattern. > * tree-ssa-forwprop.cc (gimple_clz_msb_iso_table_index): Declare. > (simplify_count_zeroes): Recognize the new pattern; route its > table validation through the CTZ checkfn. Relax the element > type check to accept integer types up to 64 bits. > > gcc/testsuite/ChangeLog: > > PR tree-optimization/122569 > * gcc.dg/tree-ssa/pr122569-3.c: New test. > Signed-off-by: Philipp Tomsich <[email protected]> > --- > Ref: vrull/gcc#450 > > gcc/match.pd | 35 ++++++++++++++++++++ > gcc/testsuite/gcc.dg/tree-ssa/pr122569-3.c | 38 ++++++++++++++++++++++ > gcc/tree-ssa-forwprop.cc | 32 +++++++++++++++--- > 3 files changed, 101 insertions(+), 4 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr122569-3.c > > diff --git a/gcc/match.pd b/gcc/match.pd > index 7b652afb43de..e64cddee9188 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -11968,6 +11968,41 @@ and, > && compare_tree_int (@c16, 16) == 0 > && compare_tree_int (@c32, 32) == 0))) > > +/* Match count leading zeros for simplify_count_zeroes in forwprop, > + variant that isolates the MSB as a power of two via (s - (s >> 1)) > + after the OR-cascade has set all bits from 0 to the original MSB. > + For such a value s = 2^(k+1) - 1, (s - (s >> 1)) equals 2^k, so the > + subsequent DeBruijn multiply-and-shift is a CTZ-style lookup on the > + isolated MSB. The table therefore has to satisfy the direct CTZ > + DeBruijn property (validated by the CTZ checkfn in > + simplify_count_zeroes). PR tree-optimization/122569. */ > +(match (clz_msb_iso_table_index @1 @2 @3) > + (rshift (mult > + (minus > + (bit_ior@f (rshift > + (bit_ior@e (rshift > + (bit_ior@d (rshift > + (bit_ior@c (rshift > + (bit_ior@b (rshift > + (bit_ior@a (rshift @1 INTEGER_CST@c1) @1) > + INTEGER_CST@c2) @a) > + INTEGER_CST@c4) @b) > + INTEGER_CST@c8) @c) > + INTEGER_CST@c16) @d) > + INTEGER_CST@c32) @e) > + (rshift @f INTEGER_CST@sub1)) > + INTEGER_CST@2) INTEGER_CST@3) > + (if (INTEGRAL_TYPE_P (type) > + && TYPE_UNSIGNED (type) > + && TYPE_PRECISION (type) == 64 > + && compare_tree_int (@c1, 1) == 0 > + && compare_tree_int (@c2, 2) == 0 > + && compare_tree_int (@c4, 4) == 0 > + && compare_tree_int (@c8, 8) == 0 > + && compare_tree_int (@c16, 16) == 0 > + && compare_tree_int (@c32, 32) == 0 > + && compare_tree_int (@sub1, 1) == 0))) > + > /* Floatint point/integer comparison and integer->integer > or floating point -> float point conversion. */ > (match (cond_expr_convert_p @0 @2 @3 @6) > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr122569-3.c > b/gcc/testsuite/gcc.dg/tree-ssa/pr122569-3.c > new file mode 100644 > index 000000000000..8cbffbe07465 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr122569-3.c > @@ -0,0 +1,38 @@ > +/* PR tree-optimization/122569 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-forwprop1-details" } */ > + > +/* Test that the forwprop DeBruijn CLZ matcher recognizes the variant > + idiom that isolates the MSB as a power of two via (s - (s >> 1)) > + after the OR-cascade. This pattern uses a CTZ-style DeBruijn magic > + applied to 2^MSB, not to the all-bits-below value directly. > + Reported on PR 122569 as a second reproducer. > + > + The table element type here is unsigned long (64-bit on LP64 targets) > + rather than int, which exercises the relaxed element-type check. */ > + > +typedef unsigned long long uint64_t; > + > +void > +get_msb_index (unsigned long *result, uint64_t value) > +{ > + static const unsigned long deBruijnTable64[64] = { > + 63, 0, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54, > + 33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, > + 34, 11, 43, 14, 22, 4, 62, 57, 46, 52, 38, 26, 32, > + 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31, > + 35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5 > + }; > + > + value |= value >> 1; > + value |= value >> 2; > + value |= value >> 4; > + value |= value >> 8; > + value |= value >> 16; > + value |= value >> 32; > + > + *result = deBruijnTable64[((value - (value >> 1)) > + * (uint64_t) 0x07EDD5E59A4E28C2ULL) >> 58]; > +} > + > +/* { dg-final { scan-tree-dump "__builtin_clz|\\.CLZ" "forwprop1" } } */ > diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc > index 26d1430b5252..2490e1e9dfa5 100644 > --- a/gcc/tree-ssa-forwprop.cc > +++ b/gcc/tree-ssa-forwprop.cc > @@ -3380,6 +3380,7 @@ check_table (tree ctor, tree type, HOST_WIDE_INT > &zero_val, unsigned bits, > /* Match.pd function to match the ctz expression. */ > extern bool gimple_ctz_table_index (tree, tree *, tree (*)(tree)); > extern bool gimple_clz_table_index (tree, tree *, tree (*)(tree)); > +extern bool gimple_clz_msb_iso_table_index (tree, tree *, tree (*)(tree)); > > /* Recognize count leading and trailing zeroes idioms. > The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic > @@ -3399,6 +3400,12 @@ simplify_count_zeroes (gimple_stmt_iterator *gsi) > gcc_checking_assert (TREE_CODE (array_ref) == ARRAY_REF); > > internal_fn fn = IFN_LAST; > + /* When true, the matched idiom is a CLZ using DeBruijn CTZ on the > + isolated MSB -- see clz_msb_iso_table_index in match.pd. The > + table stores MSB positions and must satisfy the direct CTZ > + DeBruijn property, so we validate it with the CTZ checkfn even > + though we emit IFN_CLZ code. */ > + bool clz_via_ctz = false; > /* For CTZ we recognize ((x & -x) * C) >> SHIFT where the array data > represents the number of trailing zeros. */ > if (gimple_ctz_table_index (TREE_OPERAND (array_ref, 1), &res_ops[0], > NULL)) > @@ -3414,6 +3421,17 @@ simplify_count_zeroes (gimple_stmt_iterator *gsi) > else if (gimple_clz_table_index (TREE_OPERAND (array_ref, 1), &res_ops[0], > NULL)) > fn = IFN_CLZ; > + /* Variant CLZ idiom: after the OR-cascade sets all bits from 0 to > + the original MSB, (value - (value >> 1)) isolates the MSB as a > + power of two (2^k), and the subsequent DeBruijn multiply-and- > + shift is a CTZ-style lookup on 2^k. The table stores MSB > + positions directly. */ > + else if (gimple_clz_msb_iso_table_index (TREE_OPERAND (array_ref, 1), > + &res_ops[0], NULL)) > + { > + fn = IFN_CLZ; > + clz_via_ctz = true; > + } > else > return false; > > @@ -3423,9 +3441,13 @@ simplify_count_zeroes (gimple_stmt_iterator *gsi) > tree input_type = TREE_TYPE (res_ops[0]); > unsigned input_bits = tree_to_shwi (TYPE_SIZE (input_type)); > > - /* Check the array element type is not wider than 32 bits and the input is > - an unsigned 32-bit or 64-bit type. */ > - if (TYPE_PRECISION (type) > 32 || !TYPE_UNSIGNED (input_type)) > + /* Check the array element type is integral and not wider than 64 bits, > + and the input is an unsigned 32-bit or 64-bit type. The table values > + are bit positions in [0, input_bits - 1], so any integer element type > + with at least 6 bits of precision suffices; the cap is just to keep > + the transformation simple. */ > + if (!INTEGRAL_TYPE_P (type) || TYPE_PRECISION (type) > 64 > + || !TYPE_UNSIGNED (input_type)) > return false; > if (input_bits != 32 && input_bits != 64) > return false; > @@ -3447,7 +3469,9 @@ simplify_count_zeroes (gimple_stmt_iterator *gsi) > if (!ctor) > return false; > unsigned HOST_WIDE_INT mulval = tree_to_uhwi (res_ops[1]); > - if (fn == IFN_CTZ) > + /* CTZ and the MSB-isolation CLZ variant both use the direct CTZ > + DeBruijn check (table[(magic << data) >> shift] == data). */ > + if (fn == IFN_CTZ || clz_via_ctz) > { > auto checkfn = [&](unsigned data, unsigned i) -> bool > { > -- > 2.34.1 > > base-commit: c684613dea0b00d222ebbae8439ea8fd8c1f1865 > branch: ptomsich/pr122569
