https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100638

            Bug ID: 100638
           Summary: FP16 vector compare missed optimization on AArch64
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64-*

The following testcase

```
#include <arm_neon.h>

void foo(float16_t *x, uint16x8_t *out) {
    float16x8x2_t xk = vld2q_f16(x);
    float16x8_t xk_re = xk.val[0];
    float16x8_t xk_im = xk.val[1];
    uint16x8_t theta_rx = xk_re < 0;
    uint16x8_t theta_ix = xk_im < 0;
    out[0] = theta_rx;
    out[1] = theta_ix;
}
```
on AArch64 with `-Ofast -march=armv8.2-a+fp16` generates

```
foo:
        ld2     {v0.8h - v1.8h}, [x0]
        mov     w2, -1
        fcvt    s2, h0
        fcvt    s18, h1
        dup     h3, v0.h[1]
        dup     h24, v0.h[2]
        dup     h23, v0.h[3]
        fcmpe   s2, #0.0
        fcvt    s3, h3
        fcvt    s24, h24
        dup     h22, v0.h[4]
        csel    w0, w2, wzr, mi
        fcvt    s23, h23
        fcmpe   s3, #0.0
        dup     v2.4h, w0
        dup     h17, v1.h[1]
        dup     h16, v1.h[2]
        csel    w0, w2, wzr, mi
        fcmpe   s24, #0.0
        dup     h7, v1.h[3]
        dup     h6, v1.h[4]
        dup     h5, v1.h[5]
        dup     h4, v1.h[6]
        dup     h3, v1.h[7]
        mov     v1.16b, v2.16b
        dup     h21, v0.h[5]
        fcvt    s22, h22
        fcvt    s17, h17
        dup     h20, v0.h[6]
        ins     v1.h[1], w0
        csel    w0, w2, wzr, mi
        fcmpe   s23, #0.0
        fcvt    s21, h21
        dup     h19, v0.h[7]
        fcvt    s20, h20
        fcvt    s16, h16
        ins     v1.h[2], w0
        fcvt    s7, h7
        csel    w0, w2, wzr, mi
        fcmpe   s22, #0.0
        fcvt    s19, h19
        fcvt    s6, h6
        fcvt    s5, h5
        fcvt    s4, h4
        ins     v1.h[3], w0
        fcvt    s3, h3
        csel    w0, w2, wzr, mi
        fcmpe   s21, #0.0
        ins     v1.h[4], w0
        csel    w0, w2, wzr, mi
        fcmpe   s20, #0.0
        ins     v1.h[5], w0
        csel    w0, w2, wzr, mi
        fcmpe   s19, #0.0
        ins     v1.h[6], w0
        csel    w0, w2, wzr, mi
        fcmpe   s18, #0.0
        ins     v1.h[7], w0
        csel    w0, w2, wzr, mi
        fcmpe   s17, #0.0
        dup     v0.4h, w0
        csel    w0, w2, wzr, mi
        fcmpe   s16, #0.0
        ins     v0.h[1], w0
        csel    w0, w2, wzr, mi
        fcmpe   s7, #0.0
        ins     v0.h[2], w0
        csel    w0, w2, wzr, mi
        fcmpe   s6, #0.0
        ins     v0.h[3], w0
        csel    w0, w2, wzr, mi
        fcmpe   s5, #0.0
        ins     v0.h[4], w0
        csel    w0, w2, wzr, mi
        fcmpe   s4, #0.0
        ins     v0.h[5], w0
        csel    w0, w2, wzr, mi
        fcmpe   s3, #0.0
        ins     v0.h[6], w0
        csel    w2, w2, wzr, mi
        ins     v0.h[7], w2
        stp     q1, q0, [x1]
        ret
```

instead of simply

```
foo:
        ld2     { v0.8h, v1.8h }, [x0]
        fcmlt   v2.8h, v0.8h, #0.0
        fcmlt   v0.8h, v1.8h, #0.0
        stp     q2, q0, [x1]
        ret
```

This is happening because veclower doesn't find a pattern for the FP16 vector
compare and therefore lowers it to operations on scalars.

However, even the lowered operations are inefficient:

```
        fcvt    s23, h23
        fcmpe   s23, #0.0
```

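This indicates that the backend doesn't know how to do this operation on fp16
directly: each half-precision value is first converted to single precision
(`fcvt`) before being compared (`fcmpe`).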

Reply via email to