https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94274

--- Comment #5 from z.zhanghaijian at huawei dot com <z.zhanghaijian at huawei 
dot com> ---
Created attachment 48659
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=48659&action=edit
Fold phi whose incoming args are defined from binary operations

I tried to make a patch to do this optimization(in attachment):

replaces

bb0:
  if (cond) goto bb1; else goto bb2;
bb1:
  x1 = a + b;
  goto <bb3>
bb2:
  x2 = a + c;
bb3:
  x = PHI <x1 (bb1), x2 (bb2), ...>;

with

bb0:
  if (cond) goto bb1; else goto bb2;
bb1:
bb2:
bb3:
  x3 = PHI <b (bb1), c (bb2), ...>;
  x = a + x3;

This patch will check all the phi nodes in bb3. We do the optimization only if
these phis can all be converted, This can avoid most scenes that the blocks is
not empty after the optimization, but there are still some scenes that the
block is not empty.

For example 1:

int f1(int cond, int a, int b, int c, int d, int e, int f, int x, int y, int z,
int w, int m, int n)
{

  if (cond) {
    x = e + f;
    b = x >> w;
    c = m + 12;
    a = b + z;
  }
  else {
    d = y >> w;
    c = n + 12;
    a = d + z;
  }
  a = a + 18;
  return c + a;

}

Tree dump before optimization:

  <bb 3> [local count: 536870913]:
  x_13 = e_11(D) + f_12(D);
  b_14 = x_13 >> w_5(D);
  c_16 = m_15(D) + 12;
  a_17 = z_9(D) + b_14;
  goto <bb 5>; [100.00%]

  <bb 4> [local count: 536870913]:
  d_6 = y_4(D) >> w_5(D);
  c_8 = n_7(D) + 12;
  a_10 = d_6 + z_9(D);

  <bb 5> [local count: 1073741824]:
  # a_1 = PHI <a_17(3), a_10(4)>
  # c_2 = PHI <c_16(3), c_8(4)>
  a_18 = a_1 + 18;
  _19 = c_2 + a_18;
  return _19;

Tree dump after optimization:

  <bb 3> [local count: 536870913]:
  x_13 = e_11(D) + f_12(D);
  goto <bb 5>; [100.00%]

  <bb 4> [local count: 536870913]:

  <bb 5> [local count: 1073741824]:
  # _21 = PHI <x_13(3), y_4(D)(4)>
  # _23 = PHI <m_15(D)(3), n_7(D)(4)>
  c_2 = _23 + 12;
  _22 = _21 >> w_5(D);
  a_1 = z_9(D) + _22;
  a_18 = a_1 + 18;
  _19 = c_2 + a_18;
  return _19;

Assemble before optimization:

.LFB0:
        .cfi_startproc
        ldr     w1, [sp, 8]
        ldr     w2, [sp, 16]
        cbz     w0, .L2
        add     w5, w5, w6
        ldr     w0, [sp, 24]
        asr     w5, w5, w2
        add     w1, w5, w1
        add     w1, w1, 18
        add     w0, w0, 12
        add     w0, w1, w0
        ret
        .p2align 2,,3
.L2:
        ldr     w0, [sp]
        asr     w2, w0, w2
        ldr     w0, [sp, 32]
        add     w1, w2, w1
        add     w1, w1, 18
        add     w0, w0, 12
        add     w0, w1, w0
        ret
        .cfi_endproc

Assemble after optimization:

.LFB0:
        .cfi_startproc
        ldr     w1, [sp]
        ldr     w3, [sp, 8]
        ldr     w4, [sp, 16]
        ldr     w2, [sp, 32]
        cbz     w0, .L2
        ldr     w2, [sp, 24]
        add     w1, w5, w6
.L2:
        asr     w0, w1, w4
        add     w0, w0, w3
        add     w0, w0, w2
        add     w0, w0, 30
        ret

Because the statement x_13 = e_11 (D) + f_12 (D) in bb3 does not have a phi
node in bb5, so bb3 cannot be emptied. I have not found a good way to solve it.
Any suggestions?

Without considering the register pressure, the performance of example 1 is
profitable. And this patch is effective for 500.perlbench_r in comment 3.

Reply via email to