https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122069

            Bug ID: 122069
           Summary: Missed use of UDOT for char->int reduction
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
                CC: tnfchris at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64

Testcase:
/* Returns `i` negated when it compares below zero, otherwise `i` unchanged;
 * the result is converted back to plain `char`.
 *
 * NOTE(review): whether `i < 0` can ever be true depends on the signedness of
 * plain `char` for the target. The AArch64 ABI presumably makes plain `char`
 * unsigned by default, in which case the negation arm is dead code and this
 * function is effectively the identity -- confirm against the ABI / -fsigned-char. */
inline char char_abs(char i) {
  return (i < 0 ? -i : i);
}

/* Reduction under test: sums char_abs(x[i] - y[i]) over the first 8000
 * elements of `x` and `y` into an `int` and returns the total.
 *
 * The subtraction is performed in `int` (integer promotion of the two
 * `unsigned char` operands) and the difference is then narrowed to plain
 * `char` when it is passed to char_abs, so only the low 8 bits of each
 * difference contribute to the sum. */
int foo_int(unsigned char *x, unsigned char *y) {
  int sum = 0;
  for (int i = 0; i < 8000; i++)
     sum += char_abs(x[i] - y[i]);
  return sum;
}

With the options -O3 -mcpu=grace -mllvm -force-vector-interleave=1,
Clang for AArch64 can produce:
// Clang output: one 16-byte vector per iteration, a single 4 x 32-bit
// accumulator (v0), and a dot product against an all-ones byte vector (v1)
// to widen and sum the byte differences in one instruction.
// x0 / x1 carry the two pointer arguments; the int result is returned in w0.
foo_int(unsigned char*, unsigned char*):
        // v1 = sixteen bytes of 1 (the dot-product multiplier),
        // w8 = number of elements still to process, v0 = zeroed accumulator.
        movi    v1.16b, #1
        mov     w8, #8000
        movi    v0.2d, #0000000000000000
.LBB0_1:
        // Load 16 bytes from each input and post-increment both pointers by 16.
        ldr     q2, [x0], #16
        ldr     q3, [x1], #16
        // 16 elements consumed; the flags set here drive the b.ne below.
        subs    x8, x8, #16
        // Byte-wise (wrapping) difference of the two inputs. No abs/neg
        // instruction follows, so the difference is consumed as unsigned bytes.
        sub     v2.16b, v2.16b, v3.16b
        // Unsigned dot product of the differences with the all-ones vector,
        // accumulated into the 32-bit lanes of register 0.
        // NOTE(review): the operands here are spelled as SVE Z registers
        // (z0/z2/z1) while every other instruction in this listing uses the
        // NEON v/q names -- confirm this line is verbatim compiler output.
        udot    z0.s, z2.b, z1.b
        b.ne    .LBB0_1
        // Horizontal add of the four 32-bit lanes, then move the scalar to w0.
        addv    s0, v0.4s
        fmov    w0, s0
        ret

GCC seems to miss this use of UDOT and generates:
// GCC output: 64 bytes per input per iteration (4 vectors), sixteen separate
// 4 x 32-bit accumulators (v7, v16-v30), and a two-step widening
// (bytes -> halfwords via zip with zero, halfwords -> words via uaddw/uaddw2)
// instead of a dot-product instruction.
// x0 / x1 carry the two pointer arguments; the int result is returned in w0.
foo_int(unsigned char*, unsigned char*):
        // v30 = 0; it is copied below into every other accumulator and into
        // v31, which stays zero and serves as the zip partner in the loop.
        movi    v30.4s, 0
        // x2 = x1 + 8000: end address of the second input, used as loop bound.
        mov     x2, 8000
        // v15 is used as a loop temporary; its low 64 bits (d15) are
        // callee-saved, so they are spilled here and reloaded after the loop.
        str     d15, [sp, -16]!
        add     x2, x1, x2
        mov     v31.16b, v30.16b
        mov     v29.16b, v30.16b
        mov     v28.16b, v30.16b
        mov     v27.16b, v30.16b
        mov     v26.16b, v30.16b
        mov     v25.16b, v30.16b
        mov     v24.16b, v30.16b
        mov     v23.16b, v30.16b
        mov     v22.16b, v30.16b
        mov     v21.16b, v30.16b
        mov     v20.16b, v30.16b
        mov     v19.16b, v30.16b
        mov     v18.16b, v30.16b
        mov     v17.16b, v30.16b
        mov     v16.16b, v30.16b
        mov     v7.16b, v30.16b
.L2:
        // Load four 16-byte vectors from each input and advance both pointers
        // by 64 bytes.
        ldp     q6, q4, [x0]
        ldp     q2, q0, [x0, 32]
        add     x0, x0, 64
        ldp     q5, q3, [x1]
        ldp     q1, q15, [x1, 32]
        add     x1, x1, 64
        // Byte-wise (wrapping) differences, first input minus second input.
        sub     v5.16b, v6.16b, v5.16b
        sub     v3.16b, v4.16b, v3.16b
        sub     v1.16b, v2.16b, v1.16b
        sub     v15.16b, v0.16b, v15.16b
        // Interleave each difference vector with the zero vector v31:
        // zip1 takes the low eight bytes, zip2 the high eight, so every
        // resulting 16-bit lane holds one zero-extended byte difference.
        zip1    v6.16b, v5.16b, v31.16b
        zip1    v4.16b, v3.16b, v31.16b
        zip1    v2.16b, v1.16b, v31.16b
        zip1    v0.16b, v15.16b, v31.16b
        zip2    v5.16b, v5.16b, v31.16b
        zip2    v3.16b, v3.16b, v31.16b
        zip2    v1.16b, v1.16b, v31.16b
        zip2    v15.16b, v15.16b, v31.16b
        // Widen the halfwords to 32 bits and add them into the accumulators:
        // uaddw consumes the low four halfwords of its source, uaddw2 the
        // high four. Each of the 16 instructions has its own accumulator.
        uaddw   v7.4s, v7.4s, v6.4h
        uaddw2  v16.4s, v16.4s, v6.8h
        uaddw   v17.4s, v17.4s, v5.4h
        uaddw2  v18.4s, v18.4s, v5.8h
        uaddw   v19.4s, v19.4s, v4.4h
        uaddw2  v20.4s, v20.4s, v4.8h
        uaddw   v21.4s, v21.4s, v3.4h
        uaddw2  v22.4s, v22.4s, v3.8h
        uaddw   v23.4s, v23.4s, v2.4h
        uaddw2  v24.4s, v24.4s, v2.8h
        uaddw   v25.4s, v25.4s, v1.4h
        uaddw2  v26.4s, v26.4s, v1.8h
        uaddw   v27.4s, v27.4s, v0.4h
        uaddw2  v28.4s, v28.4s, v0.8h
        uaddw   v29.4s, v29.4s, v15.4h
        uaddw2  v30.4s, v30.4s, v15.8h
        // Loop until the second-input pointer reaches the end address in x2.
        cmp     x1, x2
        bne     .L2
        // Epilogue: fold the sixteen accumulators into v7 (with the d15
        // reload scheduled in between), then sum the four lanes of v7 and
        // move the scalar result to w0.
        add     v7.4s, v7.4s, v16.4s
        ldr     d15, [sp], 16
        add     v7.4s, v7.4s, v17.4s
        add     v7.4s, v7.4s, v18.4s
        add     v7.4s, v7.4s, v19.4s
        add     v7.4s, v7.4s, v20.4s
        add     v7.4s, v7.4s, v21.4s
        add     v7.4s, v7.4s, v22.4s
        add     v7.4s, v7.4s, v23.4s
        add     v7.4s, v7.4s, v24.4s
        add     v7.4s, v7.4s, v25.4s
        add     v7.4s, v7.4s, v26.4s
        add     v7.4s, v7.4s, v27.4s
        add     v7.4s, v7.4s, v28.4s
        add     v7.4s, v7.4s, v29.4s
        add     v7.4s, v7.4s, v30.4s
        addv    s31, v7.4s
        fmov    w0, s31
        ret

https://godbolt.org/z/Eq66dvoeK

Reply via email to