Hi!

As mentioned in the PR, the a r<< (bitsize-b) to a r>> b and similar match.pd
optimization, which was introduced in GCC 15, can introduce UB which wasn't
there before: in particular, if b is equal at runtime to bitsize, then
a r<< 0 is turned into a r>> bitsize.
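To make it concrete, here is a standalone sketch of the affected idiom (just
an illustration with made-up function names, mirroring the new testcase below;
not part of the patch):

static inline unsigned
rotl32 (unsigned x, int t)
{
  /* The usual OR-of-shifts rotate-left idiom, recognized as x r<< t.  */
  return (x << t) | (x >> (-t & 31));
}

unsigned
f (unsigned x, int t)
{
  /* Recognized as x r<< (32 - t).  For t == 32 the rotate count is 0,
     which is well defined, but the GCC 15 canonicalization turned it
     into x r>> t, i.e. a rotate by the full bit width.  */
  return rotl32 (x, 32 - t);
}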
The following patch fixes it by optimizing it early only if VRP tells us the
count isn't equal to the bitsize, and late into a r>> (b & (bitsize - 1)) if
bitsize is a power of two and the subtraction has a single use; on various
targets the masking then goes away because their rotate instructions do the
masking already.
The latter can't be done too early though, because then the expr_not_equal_to
case is basically useless: we would always introduce the masking and couldn't
find out anymore that there was originally no masking.  Even a
cfun->after_inlining check would be too early, as there is a forwprop pass
before vrp, so the patch introduces a new PROP for vrp2 finish, after which
there is another forwprop pass.  Or do you have ideas about what other
condition could be used for the late matching only?

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2025-01-08  Jakub Jelinek  <ja...@redhat.com>
	    Andrew Pinski  <quic_apin...@quicinc.com>

	PR tree-optimization/117927
	* tree-pass.h (PROP_vrp): Define.
	* tree-vrp.cc (pass_vrp::execute): Set PROP_vrp in curr_properties
	at the end if final_p.
	* match.pd (a rrotate (32-b) -> a lrotate b): Only optimize either
	if @2 is known not to be equal to prec or if after vrp2 the
	subtraction has a single use and prec is a power of two; in that
	case transform it into orotate by masked count.

	* gcc.dg/tree-ssa/pr117927.c: New test.

--- gcc/tree-pass.h.jj	2025-01-07 20:10:42.617965365 +0100
+++ gcc/tree-pass.h	2025-01-07 21:34:13.859794503 +0100
@@ -230,6 +230,7 @@ protected:
 #define PROP_assumptions_done	(1 << 19)	/* Assume function kept
 						   around.  */
 #define PROP_gimple_lbitint	(1 << 20)       /* lowered large _BitInt */
+#define PROP_vrp		(1 << 21)       /* vrp2 pass finished.  */
 
 #define PROP_gimple \
   (PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh | PROP_gimple_lomp)
--- gcc/tree-vrp.cc.jj	2025-01-07 20:10:42.617965365 +0100
+++ gcc/tree-vrp.cc	2025-01-07 21:35:24.308814807 +0100
@@ -1081,8 +1081,8 @@ private:
   gimple *m_last_bb_stmt;
 };
 
-/* Main entry point for a VRP pass using just ranger.  This can be called 
-   from anywhere to perform a VRP pass, including from EVRP.  */
+/* Main entry point for a VRP pass using just ranger.  This can be called
+   from anywhere to perform a VRP pass, including from EVRP.  */
 
 unsigned int
 execute_ranger_vrp (struct function *fun, bool final_p)
@@ -1334,6 +1334,7 @@ public:
   {
     // Check for fast vrp.
     bool use_fvrp = (&data == &pass_data_fast_vrp);
+    unsigned int todo;
     if (!use_fvrp && last_basic_block_for_fn (fun) > param_vrp_block_limit)
       {
 	use_fvrp = true;
@@ -1344,8 +1345,12 @@ public:
 			 param_vrp_block_limit);
       }
     if (use_fvrp)
-      return execute_fast_vrp (fun, final_p);
-    return execute_ranger_vrp (fun, final_p);
+      todo = execute_fast_vrp (fun, final_p);
+    else
+      todo = execute_ranger_vrp (fun, final_p);
+    if (final_p)
+      fun->curr_properties |= PROP_vrp;
+    return todo;
   }
 
 private:
--- gcc/match.pd.jj	2025-01-07 20:10:42.615965392 +0100
+++ gcc/match.pd	2025-01-07 21:36:27.991929199 +0100
@@ -4967,14 +4967,31 @@ (define_operator_list SYNC_FETCH_AND_AND
 		      build_int_cst (TREE_TYPE (@1),
 				     element_precision (type)), @1); }))
 
-/* a rrotate (32-b) -> a lrotate b */
-/* a lrotate (32-b) -> a rrotate b */
+/* a rrotate (bitsize-b) -> a lrotate b */
+/* a lrotate (bitsize-b) -> a rrotate b */
+/* Only do the above when it is known that b != bitsize.
+   Otherwise canonicalize to a orotate (b & mask) when the subtraction
+   has single use and prec is a power of two.  */
 (for rotate (lrotate rrotate)
      orotate (rrotate lrotate)
  (simplify
-  (rotate @0 (minus INTEGER_CST@1 @2))
-   (if (element_precision (TREE_TYPE (@0)) == wi::to_wide (@1))
-    (orotate @0 @2))))
+  (rotate @0 (minus@3 INTEGER_CST@1 @2))
+  (with { auto prec = element_precision (TREE_TYPE (@0)); }
+   (if (prec == wi::to_wide (@1))
+    (switch
+     (if (expr_not_equal_to (@2, wi::uhwi (prec, TYPE_PRECISION (TREE_TYPE (@2)))))
+      (orotate @0 @2))
+     (if (single_use (@3)
+	  && pow2p_hwi (prec)
+	  /* Defer this case until VRP2 could be run and expr_not_equal_to
+	     had a chance to match.  Otherwise we'd do pretty much always
+	     just the second case.  */
+	  && cfun
+	  && ((cfun->curr_properties & PROP_vrp) != 0
+	      || !flag_tree_vrp
+	      || optimize_debug))
+      (orotate @0
+	       (bit_and @2 { build_int_cst (TREE_TYPE (@2), prec - 1); }))))))))
 
 /* Turn (a OP c1) OP c2 into a OP (c1+c2).  */
 (for op (lrotate rrotate rshift lshift)
--- gcc/testsuite/gcc.dg/tree-ssa/pr117927.c.jj	2025-01-07 20:48:59.827825909 +0100
+++ gcc/testsuite/gcc.dg/tree-ssa/pr117927.c	2025-01-07 20:48:59.827825909 +0100
@@ -0,0 +1,97 @@
+/* PR tree-optimization/117927 */
+/* { dg-do compile { target { int32 && longlong64 } } } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times " r<< " 4 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " r>> " 4 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " & 31;" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " & 63;" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32 - " "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64 - " "optimized" } } */
+
+static inline
+unsigned lrotate32 (unsigned x, int t)
+{
+  unsigned tl = x << t;
+  unsigned th = x >> (-t & 31);
+  return tl | th;
+}
+
+static inline
+unsigned rrotate32 (unsigned x, int t)
+{
+  unsigned tl = x >> t;
+  unsigned th = x << (-t & 31);
+  return tl | th;
+}
+
+static inline
+unsigned long long lrotate64 (unsigned long long x, int t)
+{
+  unsigned long long tl = x << t;
+  unsigned long long th = x >> (-t & 63);
+  return tl | th;
+}
+
+static inline
+unsigned long long rrotate64 (unsigned long long x, int t)
+{
+  unsigned long long tl = x >> t;
+  unsigned long long th = x << (-t & 63);
+  return tl | th;
+}
+
+unsigned
+f1 (unsigned x, int t)
+{
+  return lrotate32 (x, 32 - t);
+}
+
+unsigned long long
+f2 (unsigned long long x, int t)
+{
+  return lrotate64 (x, 64 - t);
+}
+
+unsigned
+f3 (unsigned x, int t)
+{
+  if (t == 32)
+    __builtin_unreachable ();
+  return lrotate32 (x, 32 - t);
+}
+
+unsigned long long
+f4 (unsigned long long x, int t)
+{
+  if (t == 64)
+    __builtin_unreachable ();
+  return lrotate64 (x, 64 - t);
+}
+
+unsigned
+f5 (unsigned x, int t)
+{
+  return rrotate32 (x, 32 - t);
+}
+
+unsigned long long
+f6 (unsigned long long x, int t)
+{
+  return rrotate64 (x, 64 - t);
+}
+
+unsigned
+f7 (unsigned x, int t)
+{
+  if (t == 32)
+    __builtin_unreachable ();
+  return rrotate32 (x, 32 - t);
+}
+
+unsigned long long
+f8 (unsigned long long x, int t)
+{
+  if (t == 64)
+    __builtin_unreachable ();
+  return rrotate64 (x, 64 - t);
+}

	Jakub
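P.S. For reference, a sketch of what the late canonicalization amounts to at
the source level for e.g. the f5 case in the testcase (again just an
illustration with a made-up function name, not part of the patch):
x r>> (32 - t) becomes x r<< (t & 31).

unsigned
rr32_masked (unsigned x, int t)
{
  /* Rotate left by the masked count; for t in [1, 31] this equals the
     original rotate right by (32 - t), and for t == 32 the masked count
     is 0, matching the original rotate by 0.  */
  int c = t & 31;
  return (x << c) | (x >> (-c & 31));
}

On targets whose rotate instructions truncate the count to the bit width the
& 31 goes away later on; the " & 31;" and " & 63;" scans in the testcase only
check that the masked rotates show up in the optimized GIMPLE dump.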