https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122528

--- Comment #2 from Manuel López-Ibáñez <manu at gcc dot gnu.org> ---
A smaller testcase showing the problem:

/*
 * Reduced testcase: classifies the relation between the first three elements
 * of a and b.
 *
 * Returns -1 when a[i] <= b[i] holds for i = 0, 1 and 2; otherwise returns 1
 * when the second condition below holds; otherwise returns 0.
 *
 * Only a[0..2] and b[0..2] are read. The source performs six floating-point
 * comparisons in total (three `<` and three `==`); every other value is
 * derived from those six results.
 *
 * NOTE(review): `bool` is used without a visible <stdbool.h> include; this
 * needs a C23 (or GNU23) default, or the include added -- confirm against
 * the compiler options used for the report.
 */
int cmp(const double * restrict a, const double * restrict b)
{
    /* The six primitive comparisons. */
    bool a_lt_b_0 = a[0] < b[0];
    bool a_lt_b_1 = a[1] < b[1];
    bool a_lt_b_2 = a[2] < b[2];
    bool a_eq_b_0 = a[0] == b[0];
    bool a_eq_b_1 = a[1] == b[1];
    bool a_eq_b_2 = a[2] == b[2];
    /* "Less than or equal", built from the results above with the
       non-short-circuit `|` rather than a separate `<=` comparison. */
    bool a_leq_b_0 = a_lt_b_0 | a_eq_b_0;
    bool a_leq_b_1 = a_lt_b_1 | a_eq_b_1;
    bool a_leq_b_2 = a_lt_b_2 | a_eq_b_2;

    /* All three components satisfy a[i] <= b[i]. */
    if (a_leq_b_0 && a_leq_b_1 && a_leq_b_2)
        return -1;

    /* `&&` binds tighter than `||`, so this condition parses as
         a_lt_b_0 || ((a_eq_b_0 && a_lt_b_1) || (a_eq_b_1 && a_leq_b_2)).
       NOTE(review): if a lexicographic comparison was intended the grouping
       would differ -- confirm; kept verbatim because this is the testcase
       as reported. */
    if (a_lt_b_0 || (a_eq_b_0 && a_lt_b_1 || (a_eq_b_1 && a_leq_b_2)))
        return 1;

    return 0;
}

Compiled with -O3 -march=x86-64-v3 -ffast-math


The above generates 8 vcomisd, but there are only 6 comparisons (and it should
be possible to implement the function with just 3).

"cmp":
        # Annotated compiler output; the instructions are unchanged.
        # Assumes the System V AMD64 calling convention (rdi = a, rsi = b)
        # -- TODO confirm. Under that assumption the loads below give:
        #   xmm1 = a[0], xmm0 = b[0], xmm3 = a[1], xmm2 = b[1],
        #   xmm4 = a[2], xmm5 = b[2].
        # The eight vcomisd instructions cover only three distinct operand
        # pairs: (a[0], b[0]), (a[1], b[1]) and (a[2], b[2]).
        vmovsd  xmm3, QWORD PTR [rdi+8]
        vmovsd  xmm2, QWORD PTR [rsi+8]
        vmovsd  xmm1, QWORD PTR [rdi]
        vmovsd  xmm0, QWORD PTR [rsi]
        # vcomisd 1 of 8: b[1] vs a[1].
        vcomisd xmm2, xmm3
        vmovsd  xmm4, QWORD PTR [rdi+16]
        vmovsd  xmm5, QWORD PTR [rsi+16]
        # dl = (b[1] > a[1]); the two loads above leave the flags untouched.
        seta    dl
        jb      .L2
        # vcomisd 2 of 8: b[0] vs a[0].
        vcomisd xmm0, xmm1
        jb      .L2
        # vcomisd 3 of 8: b[2] vs a[2].
        vcomisd xmm5, xmm4
        mov     eax, -1
        # Returns -1 through .L1 when b[2] is not below a[2].
        jnb     .L1
        # vcomisd 4 of 8: b[0] vs a[0] again (same operands as 2 of 8).
        vcomisd xmm0, xmm1
        seta    al
        or      eax, edx
        # Only the low byte (al | dl) survives as the return value.
        movzx   eax, al
        ret
.L2:
        # vcomisd 5 of 8: b[0] vs a[0] again (same operands as 2 of 8).
        vcomisd xmm0, xmm1
        mov     eax, 1
        # Returns 1 through .L1 when b[0] > a[0].
        ja      .L1
        # vcomisd 6 of 8: a[0] vs b[0] (same pair, operands swapped).
        vcomisd xmm1, xmm0
        jne     .L8
        test    dl, dl
        je      .L8
.L1:
        ret
.L8:
        # vcomisd 7 of 8: a[1] vs b[1] (same pair as 1 of 8, operands swapped).
        vcomisd xmm3, xmm2
        sete    dl
        xor     eax, eax
        # vcomisd 8 of 8: b[2] vs a[2] again (same operands as 3 of 8).
        vcomisd xmm5, xmm4
        setnb   al
        and     eax, edx
        ret

https://godbolt.org/z/s3v5dcvn6

Reply via email to