https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125201

            Bug ID: 125201
           Summary: [17 Regression] 20% slowdown in TSVC s116 since
                    r17-140-gf8d911e6ae3fc1
           Product: gcc
           Version: 17.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: dhruvc at gcc dot gnu.org
                CC: rguenth at gcc dot gnu.org
  Target Milestone: ---

It looks like the loop body was not being SLP-vectorized before and now is,
but the tbl instruction this emits appears to badly stall the CPU backend.

On compiler explorer: https://godbolt.org/z/xK43M5K3M

==============================

Flags: -O3 -mcpu=grace
-----

Source:
------

// TSVC kernel s116: repeatedly sweeps array a, replacing each element with
// the product of itself and its right-hand neighbour, with the sweep
// hand-unrolled by five. Stores a timestamp in func_args->t1 before the
// loops and in func_args->t2 after them, and returns the value of
// calc_checksum(__func__).
real_t s116(struct args_t * func_args)
{

//    linear dependence testing

    initialise_arrays(__func__);
    gettimeofday(&func_args->t1, NULL);

    // The outer loop repeats the whole sweep iterations*10 times.
    for (int nl = 0; nl < iterations*10; nl++) {
        // i advances by 5; the bound i < LEN_1D - 5 keeps the a[i + 5] read
        // in the last statement below index LEN_1D.
        for (int i = 0; i < LEN_1D - 5; i += 5) {
            // Each statement reads a[k + 1] before the following statement
            // overwrites it, so every product uses the neighbour's old value.
            a[i] = a[i + 1] * a[i];
            a[i + 1] = a[i + 2] * a[i + 1];
            a[i + 2] = a[i + 3] * a[i + 2];
            a[i + 3] = a[i + 4] * a[i + 3];
            a[i + 4] = a[i + 5] * a[i + 4];
        }
        // NOTE(review): dummy() is defined elsewhere; presumably an opaque
        // call that stops the compiler from collapsing the repeated sweeps
        // -- confirm against the TSVC sources.
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }

    gettimeofday(&func_args->t2, NULL);
    return calc_checksum(__func__);
}

==============================

In the SLP dumps (194t.slp1):

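It looks like there is a new VEC_PERM_EXPR and a vector multiply being
generated.  The cost model now rates the vector code cheaper than the
scalar code (12 vs. 16), where it previously rated it costlier (28 vs. 16).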

Good:
----

src/tsvc.c:315:18: note: Cost model analysis for part in loop 2:
  Vector cost: 28
  Scalar cost: 16

GIMPLE loop body:
----------------

  <bb 3> [local count: 1063004408]:
  # i_46 = PHI <_18(8), 0(5)>
  # ivtmp_44 = PHI <ivtmp_43(8), 6399(5)>
  # a_I_lsm0.15_24 = PHI <_19(8), _41(5)>
  _2 = i_46 + 1;
  _3 = a[_2];
  _5 = _3 * a_I_lsm0.15_24;
  a[i_46] = _5;
  _6 = i_46 + 2;
  _7 = a[_6];
  _9 = _3 * _7;
  a[_2] = _9;
  _10 = i_46 + 3;
  _11 = a[_10];
  _13 = _7 * _11;
  a[_6] = _13;
  _14 = i_46 + 4;
  _15 = a[_14];
  _17 = _11 * _15;
  a[_10] = _17;
  _18 = i_46 + 5;
  _19 = a[_18];
  _21 = _15 * _19;
  a[_14] = _21;
  ivtmp_43 = ivtmp_44 - 1;
  if (ivtmp_43 != 0)
    goto <bb 8>; [98.99%]
  else
    goto <bb 4>; [1.01%]

Bad:
---

src/tsvc.c:315:18: note: Cost model analysis for part in loop 2:
  Vector cost: 12
  Scalar cost: 16

GIMPLE loop body:
----------------

  <bb 3> [local count: 1063004408]:
  # i_46 = PHI <_18(8), 0(5)>
  # ivtmp_44 = PHI <ivtmp_43(8), 6399(5)>
  # a_I_lsm0.15_24 = PHI <_19(8), _41(5)>
  _2 = i_46 + 1;
  vectp.19_51 = &a[_2];
  vect__3.20_4 = MEM <vector(4) float> [(float *)vectp.19_51];
  vectp.19_25 = vectp.19_51 + 16;
  vectp.19_12 = vectp.19_51 + 4;
  vect__3.22_45 = VEC_PERM_EXPR <vect__3.20_4, vect__3.20_4, { 0, 0, 1, 2 }>;
  _3 = a[_2];
  _5 = _3 * a_I_lsm0.15_24;
  _6 = i_46 + 2;
  _7 = a[_6];
  _9 = _3 * _7;
  _10 = i_46 + 3;
  _11 = a[_10];
  _13 = _7 * _11;
  _14 = i_46 + 4;
  _15 = a[_14];
  _16 = {a_I_lsm0.15_24, _7, _11, _15};
  vect__5.23_23 = vect__3.22_45 * _16;
  _17 = _11 * _15;
  vectp.25_20 = &a[i_46];
  MEM <vector(4) float> [(float *)vectp.25_20] = vect__5.23_23;
  _18 = i_46 + 5;
  _19 = a[_18];
  _21 = _15 * _19;
  a[_14] = _21;
  ivtmp_43 = ivtmp_44 - 1;
  if (ivtmp_43 != 0)
    goto <bb 8>; [98.99%]
  else
    goto <bb 4>; [1.01%]

==============================

This causes generation of a tbl instruction and a loop-carried dependency,
which seem to be the cause of the slowdown:

Good:
----

.L2:
        ldr     s31, [x26]
        mov     x0, x26
        .p2align 5,,15
.L3:
        ldp     s1, s0, [x0, 4]
        ldp     s30, s29, [x0, 12]
        add     x0, x0, 20
        fmul    s2, s1, s31
        ldr     s31, [x0]
        fmul    s1, s1, s0
        fmul    s0, s0, s30
        fmul    s30, s30, s29
        fmul    s29, s29, s31
        stp     s2, s1, [x0, -20]
        stp     s0, s30, [x0, -12]
        str     s29, [x0, -4]
        cmp     x0, x28
        bne     .L3
        movi    v0.2s, #0
        mov     x7, x25
        mov     x6, x24
        mov     x5, x23
        mov     x4, x22
        mov     x3, x21
        mov     x2, x20
        mov     x1, x19
        mov     x0, x26
        bl      dummy
        subs    w27, w27, #1
        bne     .L2

Bad:
---

.L2:
        ldr     s27, [x26]
        mov     x0, x26
        .p2align 5,,15
.L3:
        ldp     s28, s31, [x0, 8]
        ldr     q29, [x0, 4]
        uzp1    v31.2s, v27.2s, v31.2s
        ldp     s30, s27, [x0, 16]
        add     x0, x0, 20
        tbl     v29.16b, {v29.16b}, v26.16b   <--- loop-carried dep from q26
        uzp1    v28.2s, v28.2s, v30.2s
        fmul    s30, s30, s27
        zip1    v31.4s, v31.4s, v28.4s
        str     s30, [x0, -4]
        fmul    v31.4s, v31.4s, v29.4s
        str     q31, [x0, -20]
        cmp     x0, x28
        bne     .L3
        movi    v0.2s, #0
        mov     x7, x25
        mov     x6, x24
        mov     x5, x23
        mov     x4, x22
        mov     x3, x21
        mov     x2, x20
        mov     x1, x19
        mov     x0, x26
        bl      dummy
        adrp    x0, .LANCHOR0+16
        subs    w27, w27, #1
        ldr     q26, [x0, #:lo12:.LANCHOR0+16] ---> loop-carried dep to tbl
        bne     .L2

On compiler explorer: https://godbolt.org/z/WvW5ErqYq

==============================

This seems to lead to a huge increase in the number of backend stalls:

Good:
----

>> perf stat -e stalled-cycles-backend -- ./good/tsvc/bin/S116/tsvc.exe
Loop    Time(sec)       Checksum
 s116        4.064      32000.000000

 Performance counter stats for './good/tsvc/bin/S116/tsvc.exe':

       174,173,615      stalled-cycles-backend

Bad:
---

>> perf stat -e stalled-cycles-backend -- ./bad/tsvc/bin/S116/tsvc.exe
Loop    Time(sec)       Checksum
 s116        4.898      32000.000000

 Performance counter stats for './bad/tsvc/bin/S116/tsvc.exe':

     1,661,928,005      stalled-cycles-backend

==============================

Reply via email to