Hi! This is my proposal for fixing this PR, just a heuristics when optimizing successive divides might not be a good idea (first testcase), and includes Marc's testcases which showed cases where optimizing successive divides is a good idea even if the inner divide is not a single use.
Unfortunately match.pd doesn't allow to capture the outermost expression, so I can't narrow it even more by checking if the outer divide is in the same bb as the modulo. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2018-12-06 Jakub Jelinek <ja...@redhat.com> PR tree-optimization/85726 * generic-match-head.c (optimize_successive_divisions_p): New function. * gimple-match-head.c (optimize_successive_divisions_p): Likewise. * match.pd: Don't combine successive divisions if they aren't exact and optimize_successive_divisions_p is false. * gcc.dg/tree-ssa/pr85726-1.c: New test. * gcc.dg/tree-ssa/pr85726-2.c: New test. * gcc.dg/tree-ssa/pr85726-3.c: New test. * gcc.dg/tree-ssa/pr85726-4.c: New test. --- gcc/generic-match-head.c.jj 2018-03-28 21:14:28.124743854 +0200 +++ gcc/generic-match-head.c 2018-12-05 16:07:43.710801347 +0100 @@ -77,3 +77,12 @@ canonicalize_math_after_vectorization_p { return false; } + +/* Return true if successive divisions can be optimized. + Defer to GIMPLE opts. */ + +static inline bool +optimize_successive_divisions_p (tree, tree) +{ + return false; +} --- gcc/gimple-match-head.c.jj 2018-10-10 10:50:55.812109572 +0200 +++ gcc/gimple-match-head.c 2018-12-05 16:39:50.358122589 +0100 @@ -1163,3 +1163,27 @@ optimize_pow_to_exp (tree arg0, tree arg return false; return true; } + +/* Return true if a division INNER_DIV / DIVISOR where INNER_DIV + is another division can be optimized. Don't optimize if INNER_DIV + is used in a TRUNC_MOD_EXPR with DIVISOR as second operand. */ + +static bool +optimize_successive_divisions_p (tree divisor, tree inner_div) +{ + if (!gimple_in_ssa_p (cfun) || TREE_CODE (inner_div) != SSA_NAME) + return false; + + imm_use_iterator imm_iter; + use_operand_p use_p; + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, inner_div) + { + gimple *use_stmt = USE_STMT (use_p); + if (!is_gimple_assign (use_stmt) + || gimple_assign_rhs_code (use_stmt) != TRUNC_MOD_EXPR + || !operand_equal_p (gimple_assign_rhs2 (use_stmt), divisor, 0)) + continue; + return false; + } + return true; +} --- gcc/match.pd.jj 2018-11-30 21:36:22.273762329 +0100 +++ gcc/match.pd 2018-12-05 16:47:27.798596450 +0100 @@ -312,17 +312,19 @@ (define_operator_list COND_TERNARY and floor_div is trickier and combining round_div even more so. */ (for div (trunc_div exact_div) (simplify - (div (div @0 INTEGER_CST@1) INTEGER_CST@2) + (div (div@3 @0 INTEGER_CST@1) INTEGER_CST@2) (with { wi::overflow_type overflow; wide_int mul = wi::mul (wi::to_wide (@1), wi::to_wide (@2), TYPE_SIGN (type), &overflow); } - (if (!overflow) - (div @0 { wide_int_to_tree (type, mul); }) - (if (TYPE_UNSIGNED (type) - || mul != wi::min_value (TYPE_PRECISION (type), SIGNED)) - { build_zero_cst (type); }))))) + (if (div == EXACT_DIV_EXPR + || optimize_successive_divisions_p (@2, @3)) + (if (!overflow) + (div @0 { wide_int_to_tree (type, mul); }) + (if (TYPE_UNSIGNED (type) + || mul != wi::min_value (TYPE_PRECISION (type), SIGNED)) + { build_zero_cst (type); })))))) /* Combine successive multiplications. Similar to above, but handling overflow is different. */ --- gcc/testsuite/gcc.dg/tree-ssa/pr85726-1.c.jj 2018-12-05 16:55:24.852680964 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr85726-1.c 2018-12-05 16:50:14.489853926 +0100 @@ -0,0 +1,19 @@ +/* PR tree-optimization/85726 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump " / 16;" "optimized" } } */ +/* { dg-final { scan-tree-dump " / 3;" "optimized" } } */ +/* { dg-final { scan-tree-dump " % 3;" "optimized" } } */ +/* { dg-final { scan-tree-dump-not " / 48;" "optimized" } } */ + +int ww, vv; + +int +foo (int y) +{ + int z = y / 16; + int w = z / 3; + int v = z % 3; + ww = w; + return v; +} --- gcc/testsuite/gcc.dg/tree-ssa/pr85726-2.c.jj 2018-12-05 16:55:27.620634707 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr85726-2.c 2018-12-05 16:51:25.636678886 +0100 @@ -0,0 +1,15 @@ +/* PR tree-optimization/85726 */ +/* { dg-do compile { target int32 } } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump " / 3145728;" "optimized" } } */ +/* { dg-final { scan-tree-dump "y = 0;" "optimized" } } */ + +int x, y; + +void +foo (int n) +{ + int c = 3 << 20; + x = n / c; + y = x / c; +} --- gcc/testsuite/gcc.dg/tree-ssa/pr85726-3.c.jj 2018-12-05 16:55:30.948579089 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr85726-3.c 2018-12-05 16:52:57.350146115 +0100 @@ -0,0 +1,15 @@ +/* PR tree-optimization/85726 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump-times " / 3;" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times " / 15;" 1 "optimized" } } */ + +int x, y, z; + +void +foo (int n) +{ + x = n / 3; + y = x / 5; + z = n / 15; +} --- gcc/testsuite/gcc.dg/tree-ssa/pr85726-4.c.jj 2018-12-05 16:55:34.588518255 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr85726-4.c 2018-12-05 16:55:04.789016282 +0100 @@ -0,0 +1,15 @@ +/* PR tree-optimization/85726 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump-times " / 4;" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times " / 16;" 1 "optimized" } } */ + +int x, y, z; + +void +foo (int n) +{ + x = n / 4; + y = x / 4; + z = y * 16 | 15; +} Jakub