https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113903

            Bug ID: 113903
           Summary: sched1 should schedule across EBBS
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

The following testcase:

#define N 306
#define NEEDLE 136
int table[N];

int foo (int i, unsigned short parse_tables_n)
{
  parse_tables_n >>= 9;
  parse_tables_n += 11;
  while (i < N && parse_tables_n--)
    table[i++] = 0;
  return table[NEEDLE];
}

compiled at -O3 shows an issue we've started getting with the support for early
break vectorization.

sched1 doesn't seem to be able to schedule across EBBs (extended basic blocks),
which is logical since we never really needed it to before.

However the above code generates:

.L10:
        st1w    z28.s, p7, [x1, #1, mul vl]
        st1w    z28.s, p7, [x1]
        add     x1, x1, x5
        cmp     w0, w2
        bcc     .L17
.L8:
        cmpne   p15.h, p7/z, z31.h, #0
        mov     z29.d, z31.d
        not     p15.b, p14/z, p15.b
        mov     z27.d, z30.d
        add     w2, w2, w4
        dech    z31.h
        ptest   p14, p15.b
        incw    z30.s, all, mul #2
        b.none  .L10
        umov    w1, v29.h[0]
        umov    w20, v27.s[0]
        and     w3, w1, 65535
        b       .L6

AArch64 codegen inefficiencies aside (which I will tackle myself), this shows
that we're copying the old values of the induction variables in every loop
iteration to keep them for the reductions in case we exit.

However, the new values are not live in .L8, and so the operations can be moved
to .L10:

.L10:
        incw    z30.s, all, mul #2
        dech    z31.h
        st1w    z28.s, p7, [x1, #1, mul vl]
        st1w    z28.s, p7, [x1]
        add     x1, x1, x5
        cmp     w0, w2
        bcc     .L17
.L8:
        cmpne   p15.h, p7/z, z31.h, #0
        not     p15.b, p14/z, p15.b
        add     w2, w2, w4
        ptest   p14, p15.b
        b.none  .L10
        umov    w1, v31.h[0]
        umov    w20, v30.s[0]
        and     w3, w1, 65535
        b       .L6

thus decreasing the live ranges.  The optimal codegen for this sequence is:

.L10:
        dech    z31.h
        incw    z30.s, all, mul #2
        st1w    z28.s, p7, [x1, #1, mul vl]
        st1w    z28.s, p7, [x1]
        add     x1, x1, x5
        cmp     w0, w2
        bcc     .L17
.L8:
        cmpeq   p15.h, p7/z, z31.h, #0
        add     w2, w2, w4
        b.none  .L10
        umov    w1, v31.h[0]
        umov    w20, v30.s[0]
        b       .L6

Reply via email to