https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88834

            Bug ID: 88834
           Summary: [SVE] Poor addressing mode choices for LD2 and ST2
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: rsandifo at gcc dot gnu.org
  Target Milestone: ---

Compiling this code with -O3 -march=armv8-a+sve:

void
f (int *restrict x, int *restrict y, int *restrict z, int n)
{
  for (int i = 0; i < n; i += 2)
    {
      x[i] = y[i] + z[i];
      x[i + 1] = y[i + 1] - z[i + 1];
    }
}

gives:

f:
.LFB0:
        .cfi_startproc
        cmp     w3, 0
        ble     .L1
        sub     w4, w3, #1
        cntw    x3
        ptrue   p1.s, all
        lsr     w4, w4, 1
        add     w4, w4, 1
        whilelo p0.s, xzr, x4
        .p2align 3,,7
.L3:
        ld2w    {z4.s - z5.s}, p0/z, [x1]
        ld2w    {z2.s - z3.s}, p0/z, [x2]
        add     z0.s, z4.s, z2.s
        sub     z1.s, z5.s, z3.s
        st2w    {z0.s - z1.s}, p0, [x0]
        incb    x1, all, mul #2
        whilelo p0.s, x3, x4
        incb    x0, all, mul #2
        incb    x2, all, mul #2
        incw    x3
        ptest   p1, p0.b
        bne     .L3
.L1:
        ret
        .cfi_endproc

Rather than have one INCB per address, we should have a single induction
variable (IV) that tracks the index, something like:

        ld2w    {z4.s - z5.s}, p0/z, [x1, x4, lsl #2]
        ld2w    {z2.s - z3.s}, p0/z, [x2, x4, lsl #2]
        add     z0.s, z4.s, z2.s
        sub     z1.s, z5.s, z3.s
        st2w    {z0.s - z1.s}, p0, [x0, x4, lsl #2]
        incw    x4, all, mul #2     // or inch

I think this will need work in both the AArch64 target code and the ivopts pass.

Reply via email to