https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89606

            Bug ID: 89606
           Summary: Extra mov after structure load instructions on aarch64
           Product: gcc
           Version: 8.3.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: yyc1992 at gmail dot com
  Target Milestone: ---

Code to reproduce:

```
#include <arm_neon.h>

#ifdef __aarch64__
float64x2x2_t f(const double *p1, const double *p2)
{
    float64x2x2_t v = vld2q_f64(p1);
    return vld2q_lane_f64(p2, v, 1);
}

float32x2x2_t f2(const float *p1, const float *p2)
{
    float32x2x2_t v = vld2_f32(p1);
    return vld2_lane_f32(p2, v, 1);
}
#endif

void f3(float32x2x2_t *p, const float *p1, const float *p2)
{
    float32x2x2_t v = vld2_f32(p1);
    *p = vld2_lane_f32(p2, v, 1);
}
```

GCC produces the following (aarch64, at any of -O1/-O2/-O3/-Ofast/-Os):

```
f:
        ld2     {v4.2d - v5.2d}, [x0]
        mov     v0.16b, v4.16b
        mov     v1.16b, v5.16b
        ld2     {v0.d - v1.d}[1], [x1]
        ret
f2:
        ld2     {v0.2s - v1.2s}, [x0]
        mov     v2.8b, v0.8b
        mov     v3.8b, v1.8b
        ld2     {v2.s - v3.s}[1], [x1]
        mov     v1.8b, v3.8b
        mov     v0.8b, v2.8b
        ret
f3:
        ld2     {v2.2s - v3.2s}, [x1]
        mov     v0.8b, v2.8b
        mov     v1.8b, v3.8b
        ld2     {v0.s - v1.s}[1], [x2]
        stp     d0, d1, [x0]
        ret
```

For all three functions, none of the `mov`s seem necessary. Even if there were
some performance penalty for reusing the registers (which I highly doubt), at
least the `-Os` version should not contain those `mov`s.

Clang produces what I expect in this case:

```
f:
        ld2     { v0.2d, v1.2d }, [x0]
        ld2     { v0.d, v1.d }[1], [x1]
        ret
f2:
        ld2     { v0.2s, v1.2s }, [x0]
        ld2     { v0.s, v1.s }[1], [x1]
        ret
f3:
        ld2     { v0.2s, v1.2s }, [x1]
        ld2     { v0.s, v1.s }[1], [x2]
        stp     d0, d1, [x0]
        ret
```

GCC targeting AArch32 doesn't have this issue either (only `f3` is shown, since
`f` and `f2` are guarded by `#ifdef __aarch64__`):

```
f3:
        vld2.32 {d16-d17}, [r1]
        vld2.32 {d16[1], d17[1]}, [r2]
        vst1.64 {d16-d17}, [r0:64]
        bx      lr
```

so this seems to be AArch64-specific.

Reply via email to