On Fri, Apr 10, 2026 at 3:44 PM Philipp Tomsich
<[email protected]> wrote:
>
> Recognize a variant CLZ idiom where the OR-cascade is followed by
> (value - (value >> 1)) to isolate the MSB as a power of two (2^k),
> then a DeBruijn multiply-and-shift maps 2^k back to k:
>
>   value |= value >> 1;
>   ...
>   value |= value >> 32;
>   result = table[((value - (value >> 1)) * MAGIC) >> 58];
>
> After the OR-cascade value equals 2^(k+1) - 1, so (value - (value >> 1))
> equals 2^k.  The multiply-and-shift is therefore a CTZ-style DeBruijn
> lookup, and the table satisfies table[(magic << k) >> shift] == k.
>
> Add match.pd pattern clz_msb_iso_table_index matching the full chain.
> In simplify_count_zeroes, validate the table with the existing CTZ
> checkfn (which implements the direct-form check) while still emitting
> IFN_CLZ code -- both forms store MSB positions, so the existing CLZ
> emission path including zero_val pre-compensation works unchanged.
>
> Also relax the element-type check from "precision <= 32" to "integral
> and precision <= 64" so tables declared as unsigned long (64-bit on
> LP64) are handled.  Table values are bit positions in [0, input_bits - 1]
> and fit in any integer type with at least 6 bits of precision.
>
> Only a 64-bit variant is added: all known uses of this idiom
> (Stockfish, zstd, cpython, the PR122569 comment 3 reproducer) are
> 64-bit.  A 32-bit variant is a straightforward copy if a real-world
> case appears.

I wonder if we can factor out matching the shift cascading?  Basically
match (b * magic) >> 58 and (value - (value >> 1)) and then match
b / value to the cascading?

> gcc/ChangeLog:
>
>         PR tree-optimization/122569
>         * match.pd (clz_msb_iso_table_index): New match pattern.
>         * tree-ssa-forwprop.cc (gimple_clz_msb_iso_table_index): Declare.
>         (simplify_count_zeroes): Recognize the new pattern; route its
>         table validation through the CTZ checkfn.  Relax the element
>         type check to accept integer types up to 64 bits.
>
> gcc/testsuite/ChangeLog:
>
>         PR tree-optimization/122569
>         * gcc.dg/tree-ssa/pr122569-3.c: New test.
> Signed-off-by: Philipp Tomsich <[email protected]>
> ---
> Ref: vrull/gcc#450
>
>  gcc/match.pd                               | 35 ++++++++++++++++++++
>  gcc/testsuite/gcc.dg/tree-ssa/pr122569-3.c | 38 ++++++++++++++++++++++
>  gcc/tree-ssa-forwprop.cc                   | 32 +++++++++++++++---
>  3 files changed, 101 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr122569-3.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 7b652afb43de..e64cddee9188 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -11968,6 +11968,41 @@ and,
>         && compare_tree_int (@c16, 16) == 0
>         && compare_tree_int (@c32, 32) == 0)))
>
> +/* Match count leading zeros for simplify_count_zeroes in forwprop,
> +   variant that isolates the MSB as a power of two via (s - (s >> 1))
> +   after the OR-cascade has set all bits from 0 to the original MSB.
> +   For such a value s = 2^(k+1) - 1, (s - (s >> 1)) equals 2^k, so the
> +   subsequent DeBruijn multiply-and-shift is a CTZ-style lookup on the
> +   isolated MSB.  The table therefore has to satisfy the direct CTZ
> +   DeBruijn property (validated by the CTZ checkfn in
> +   simplify_count_zeroes).  PR tree-optimization/122569.  */
> +(match (clz_msb_iso_table_index @1 @2 @3)
> +  (rshift (mult
> +   (minus
> +    (bit_ior@f (rshift
> +     (bit_ior@e (rshift
> +       (bit_ior@d (rshift
> +         (bit_ior@c (rshift
> +           (bit_ior@b (rshift
> +            (bit_ior@a (rshift @1 INTEGER_CST@c1) @1)
> +            INTEGER_CST@c2) @a)
> +          INTEGER_CST@c4) @b)
> +         INTEGER_CST@c8) @c)
> +       INTEGER_CST@c16) @d)
> +     INTEGER_CST@c32) @e)
> +    (rshift @f INTEGER_CST@sub1))
> +   INTEGER_CST@2) INTEGER_CST@3)
> +  (if (INTEGRAL_TYPE_P (type)
> +       && TYPE_UNSIGNED (type)
> +       && TYPE_PRECISION (type) == 64
> +       && compare_tree_int (@c1, 1) == 0
> +       && compare_tree_int (@c2, 2) == 0
> +       && compare_tree_int (@c4, 4) == 0
> +       && compare_tree_int (@c8, 8) == 0
> +       && compare_tree_int (@c16, 16) == 0
> +       && compare_tree_int (@c32, 32) == 0
> +       && compare_tree_int (@sub1, 1) == 0)))
> +
>  /* Floatint point/integer comparison and integer->integer
>     or floating point -> float point conversion.  */
>  (match (cond_expr_convert_p @0 @2 @3 @6)
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr122569-3.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/pr122569-3.c
> new file mode 100644
> index 000000000000..8cbffbe07465
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr122569-3.c
> @@ -0,0 +1,38 @@
> +/* PR tree-optimization/122569 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-forwprop1-details" } */
> +
> +/* Test that the forwprop DeBruijn CLZ matcher recognizes the variant
> +   idiom that isolates the MSB as a power of two via (s - (s >> 1))
> +   after the OR-cascade.  This pattern uses a CTZ-style DeBruijn magic
> +   applied to 2^MSB, not to the all-bits-below value directly.
> +   Reported on PR 122569 as a second reproducer.
> +
> +   The table element type here is unsigned long (64-bit on LP64 targets)
> +   rather than int, which exercises the relaxed element-type check.  */
> +
> +typedef unsigned long long uint64_t;
> +
> +void
> +get_msb_index (unsigned long *result, uint64_t value)
> +{
> +  static const unsigned long deBruijnTable64[64] = {
> +    63, 0,  58, 1,  59, 47, 53, 2,  60, 39, 48, 27, 54,
> +    33, 42, 3,  61, 51, 37, 40, 49, 18, 28, 20, 55, 30,
> +    34, 11, 43, 14, 22, 4,  62, 57, 46, 52, 38, 26, 32,
> +    41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31,
> +    35, 16, 9,  12, 44, 24, 15, 8,  23, 7,  6,  5
> +  };
> +
> +  value |= value >> 1;
> +  value |= value >> 2;
> +  value |= value >> 4;
> +  value |= value >> 8;
> +  value |= value >> 16;
> +  value |= value >> 32;
> +
> +  *result = deBruijnTable64[((value - (value >> 1))
> +                            * (uint64_t) 0x07EDD5E59A4E28C2ULL) >> 58];
> +}
> +
> +/* { dg-final { scan-tree-dump "__builtin_clz|\\.CLZ" "forwprop1" } } */
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 26d1430b5252..2490e1e9dfa5 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -3380,6 +3380,7 @@ check_table (tree ctor, tree type, HOST_WIDE_INT 
> &zero_val, unsigned bits,
>  /* Match.pd function to match the ctz expression.  */
>  extern bool gimple_ctz_table_index (tree, tree *, tree (*)(tree));
>  extern bool gimple_clz_table_index (tree, tree *, tree (*)(tree));
> +extern bool gimple_clz_msb_iso_table_index (tree, tree *, tree (*)(tree));
>
>  /* Recognize count leading and trailing zeroes idioms.
>     The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic
> @@ -3399,6 +3400,12 @@ simplify_count_zeroes (gimple_stmt_iterator *gsi)
>    gcc_checking_assert (TREE_CODE (array_ref) == ARRAY_REF);
>
>    internal_fn fn = IFN_LAST;
> +  /* When true, the matched idiom is a CLZ using DeBruijn CTZ on the
> +     isolated MSB -- see clz_msb_iso_table_index in match.pd.  The
> +     table stores MSB positions and must satisfy the direct CTZ
> +     DeBruijn property, so we validate it with the CTZ checkfn even
> +     though we emit IFN_CLZ code.  */
> +  bool clz_via_ctz = false;
>    /* For CTZ we recognize ((x & -x) * C) >> SHIFT where the array data
>       represents the number of trailing zeros.  */
>    if (gimple_ctz_table_index (TREE_OPERAND (array_ref, 1), &res_ops[0], 
> NULL))
> @@ -3414,6 +3421,17 @@ simplify_count_zeroes (gimple_stmt_iterator *gsi)
>    else if (gimple_clz_table_index (TREE_OPERAND (array_ref, 1), &res_ops[0],
>                                    NULL))
>      fn = IFN_CLZ;
> +  /* Variant CLZ idiom: after the OR-cascade sets all bits from 0 to
> +     the original MSB, (value - (value >> 1)) isolates the MSB as a
> +     power of two (2^k), and the subsequent DeBruijn multiply-and-
> +     shift is a CTZ-style lookup on 2^k.  The table stores MSB
> +     positions directly.  */
> +  else if (gimple_clz_msb_iso_table_index (TREE_OPERAND (array_ref, 1),
> +                                          &res_ops[0], NULL))
> +    {
> +      fn = IFN_CLZ;
> +      clz_via_ctz = true;
> +    }
>    else
>      return false;
>
> @@ -3423,9 +3441,13 @@ simplify_count_zeroes (gimple_stmt_iterator *gsi)
>    tree input_type = TREE_TYPE (res_ops[0]);
>    unsigned input_bits = tree_to_shwi (TYPE_SIZE (input_type));
>
> -  /* Check the array element type is not wider than 32 bits and the input is
> -     an unsigned 32-bit or 64-bit type.  */
> -  if (TYPE_PRECISION (type) > 32 || !TYPE_UNSIGNED (input_type))
> +  /* Check the array element type is integral and not wider than 64 bits,
> +     and the input is an unsigned 32-bit or 64-bit type.  The table values
> +     are bit positions in [0, input_bits - 1], so any integer element type
> +     with at least 6 bits of precision suffices; the cap is just to keep
> +     the transformation simple.  */
> +  if (!INTEGRAL_TYPE_P (type) || TYPE_PRECISION (type) > 64
> +      || !TYPE_UNSIGNED (input_type))
>      return false;
>    if (input_bits != 32 && input_bits != 64)
>      return false;
> @@ -3447,7 +3469,9 @@ simplify_count_zeroes (gimple_stmt_iterator *gsi)
>    if (!ctor)
>      return false;
>    unsigned HOST_WIDE_INT mulval = tree_to_uhwi (res_ops[1]);
> -  if (fn == IFN_CTZ)
> +  /* CTZ and the MSB-isolation CLZ variant both use the direct CTZ
> +     DeBruijn check (table[(magic << data) >> shift] == data).  */
> +  if (fn == IFN_CTZ || clz_via_ctz)
>      {
>        auto checkfn = [&](unsigned data, unsigned i) -> bool
>         {
> --
> 2.34.1
>
> base-commit: c684613dea0b00d222ebbae8439ea8fd8c1f1865
> branch: ptomsich/pr122569

Reply via email to