https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122750

            Bug ID: 122750
           Summary: missing optimization in induction variable usage
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

This loop

/* Reproducer: returns the sum, for i in [0, len), of (i*10) * buf[i].
   The scaling by 10 is written in the loop body and applied to the loop
   counter i itself; there is no separate variable holding i*10.  */
int foo2 (char *buf, int len) {
    int x = 0;
    for (int i =0; i < len; i++) {
        /* The cast binds to (i*10) only; that product is then multiplied
           by buf[i] and accumulated into x.  */
        x += (int) (i*10) * buf[i];
    }
    return x;
}


with -O3 -march=armv8-a+sve generates

.L3:
        ld1b    z28.s, p7/z, [x0, x2]
        add     x2, x2, x3
        mul     z28.s, p6/m, z28.s, z29.s
        incw    z29.s
        mul     z28.s, z28.s, #10
        add     z30.s, p7/m, z30.s, z28.s
        whilelo p7.s, w2, w1
        b.any   .L3
        uaddv   d31, p6, z30.s
        fmov    w0, s31
        ret

because we've reassociated (i*10)*buf[i] into i * (10*buf[i]), so the
induction variable becomes a linear update that is then multiplied by
10 * buf[i].

However, if the induction variable itself computed i*10 instead, e.g.:

/* Comparison variant: accumulates y * buf[i] for i in [0, len), where the
   scaled value is carried in its own loop variable y instead of being
   computed in the loop body.  */
int foo1 (char *buf, int len) {
    int x = 0;
    /* y starts at 0 and is reassigned to i * 10 after each i++, so y equals
       i * 10 every time the body runs.  */
    for (int i =0, y = 0; i < len; i++, y = i * 10) {
        x += (int) y * buf[i];
    }
    return x;
}

we can generate much better code:

.L9:
        ld1b    z28.s, p7/z, [x0, x2]
        add     x2, x2, x3
        mla     z30.s, p7/m, z28.s, z29.s
        incw    z29.s, all, mul #10
        whilelo p7.s, w2, w1
        b.any   .L9

Note that this is true even for Advanced SIMD:

.L4:
        ldr     q31, [x2], 16
        add     v21.4s, v27.4s, v17.4s
        add     v22.4s, v27.4s, v18.4s
        zip1    v29.16b, v31.16b, v28.16b
        zip2    v31.16b, v31.16b, v28.16b
        add     v23.4s, v27.4s, v19.4s
        zip1    v24.8h, v29.8h, v28.8h
        zip2    v29.8h, v29.8h, v28.8h
        zip1    v25.8h, v31.8h, v28.8h
        zip2    v31.8h, v31.8h, v28.8h
        mul     v24.4s, v24.4s, v27.4s
        mul     v29.4s, v29.4s, v21.4s
        mul     v25.4s, v25.4s, v22.4s
        mul     v31.4s, v31.4s, v23.4s
        mla     v30.4s, v24.4s, v26.4s
        add     v27.4s, v27.4s, v20.4s
        mla     v30.4s, v29.4s, v26.4s
        mla     v30.4s, v25.4s, v26.4s
        mla     v30.4s, v31.4s, v26.4s
        cmp     x2, x3
        bne     .L4

vs

.L12:
        ldr     q30, [x2], 16
        add     v23.4s, v28.4s, v18.4s
        add     v25.4s, v28.4s, v19.4s
        zip1    v27.16b, v30.16b, v29.16b
        zip2    v30.16b, v30.16b, v29.16b
        add     v26.4s, v28.4s, v20.4s
        zip1    v22.8h, v27.8h, v29.8h
        zip2    v27.8h, v27.8h, v29.8h
        zip1    v24.8h, v30.8h, v29.8h
        zip2    v30.8h, v30.8h, v29.8h
        mla     v31.4s, v22.4s, v28.4s
        add     v28.4s, v28.4s, v21.4s
        mla     v31.4s, v27.4s, v23.4s
        mla     v31.4s, v24.4s, v25.4s
        mla     v31.4s, v30.4s, v26.4s
        cmp     x2, x3
        bne     .L12

Reply via email to