The following adjusts the heuristic when we perform PHI insertion
during GIMPLE PRE from requiring at least one edge that is supposed
to be optimized for speed to also doing insertion when the expression
is available on all edges (but possibly with different value) and
we'd at most have one copy from a constant.  The first ensures
we optimize two computations on all paths to one plus a possible
copy due to the PHI, the second makes sure we do not need to insert
many possibly large copies from constants, disregarding the
cumulative size cost of the register copies when they are not
coalesced.

The case in the testcase is

  <bb 5>
  _14 = h;
  if (_14 == 0B)
    goto <bb 7>;
  else
    goto <bb 6>;

  <bb 6>
  h = 0B;

  <bb 7>
  h.6_12 = h;

and we want to optimize that to

  <bb 7>
  # h.6_12 = PHI <_14(5), 0B(6)>

If we want to consider the cost of the register copies I think the
only simplistic enough way would be to restrict the special-case to
two incoming edges - we'd assume one register copy is coalesced
leaving one copy from a register or from a constant.

As with every optimization the downstream effects are probably
bigger than what we can locally estimate.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

Any comments?

Thanks,
Richard.

        PR tree-optimization/110963
        * tree-ssa-pre.cc (do_pre_regular_insertion): Also insert
        a PHI node when the expression is available on all edges
        and we insert at most one copy from a constant.

        * gcc.dg/tree-ssa/ssa-pre-34.c: New testcase.
---
 gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-34.c | 56 ++++++++++++++++++++++
 gcc/tree-ssa-pre.cc                        | 11 +++++
 2 files changed, 67 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-34.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-34.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-34.c
new file mode 100644
index 00000000000..9ac37c44336
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-34.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-Os -fdump-tree-pre-stats -fdump-tree-optimized" } */
+
+void foo(void);
+static int c = 76, f, g;
+static int *h, *j, *k = &g;
+static int **i = &h;
+static short a;
+static signed char(l)(signed char b) {
+    if (!(((b) >= 77) && ((b) <= 77))) {
+        __builtin_unreachable();
+    }
+    return 0;
+}
+static short(m)(short d, short e) { return d + e; }
+static short n(signed char) {
+    j = *i;
+    if (j == 0)
+        ;
+    else
+        *i = 0;
+    *k = 0;
+    return 0;
+}
+static signed char o() {
+    l(0);
+    return 0;
+}
+static signed char p(int ad) {
+    a = m(!0, ad);
+    l(a);
+    if (f) {
+        *i &&n(o());
+        *i = 0;
+    } else
+        n(0);
+    if (h == &f || h == 0)
+        ;
+    else
+        foo();
+    return 0;
+}
+int main() {
+    p(c);
+    c = 8;
+}
+
+/* Even with main being cold we should optimize the redundant load of h
+   which is available on all incoming edges (but none considered worth
+   optimizing for speed) when doing that doesn't needlessly increase
+   code size.  */
+
+/* { dg-final { scan-tree-dump "Insertions: 1" "pre" } } */
+/* { dg-final { scan-tree-dump "HOIST inserted: 1" "pre" } } */
+/* { dg-final { scan-tree-dump "Eliminated: 3" "pre" } } */
+/* { dg-final { scan-tree-dump-not "foo" "optimized" } } */
diff --git a/gcc/tree-ssa-pre.cc b/gcc/tree-ssa-pre.cc
index 0f2e458395c..07fb165b2a8 100644
--- a/gcc/tree-ssa-pre.cc
+++ b/gcc/tree-ssa-pre.cc
@@ -3314,6 +3314,8 @@ do_pre_regular_insertion (basic_block block, basic_block 
dom,
          bool by_some = false;
          bool cant_insert = false;
          bool all_same = true;
+         unsigned num_inserts = 0;
+         unsigned num_const = 0;
          pre_expr first_s = NULL;
          edge pred;
          basic_block bprime;
@@ -3370,11 +3372,14 @@ do_pre_regular_insertion (basic_block block, 
basic_block dom,
                {
                  avail[pred->dest_idx] = eprime;
                  all_same = false;
+                 num_inserts++;
                }
              else
                {
                  avail[pred->dest_idx] = edoubleprime;
                  by_some = true;
+                 if (edoubleprime->kind == CONSTANT)
+                   num_const++;
                  /* We want to perform insertions to remove a redundancy on
                     a path in the CFG we want to optimize for speed.  */
                  if (optimize_edge_for_speed_p (pred))
@@ -3391,6 +3396,12 @@ do_pre_regular_insertion (basic_block block, basic_block 
dom,
             partially redundant.  */
          if (!cant_insert && !all_same && by_some)
            {
+             /* If the expression is redundant on all edges and we need
+                to at most insert one copy from a constant do the PHI
+                insertion even when not optimizing a path that's to be
+                optimized for speed.  */
+             if (num_inserts == 0 && num_const <= 1)
+               do_insertion = true;
              if (!do_insertion)
                {
                  if (dump_file && (dump_flags & TDF_DETAILS))
-- 
2.35.3

Reply via email to