[Bug tree-optimization/88713] Vectorized code slow vs. flang

rguenther at suse dot de Wed, 23 Jan 2019 08:51:08 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88713


--- Comment #47 from rguenther at suse dot de <rguenther at suse dot de> ---
On January 23, 2019 5:13:12 PM GMT+01:00, "hjl.tools at gmail dot com"
<gcc-bugzi...@gcc.gnu.org> wrote:
>https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88713
>
>--- Comment #46 from H.J. Lu <hjl.tools at gmail dot com> ---
>We generate sqrtss for scalar sqrtf:
>
>[hjl@gnu-skx-1 pr88713]$ cat s.i
>extern float sqrtf(float x);
>
>float
>rsqrt(float r)
>{
>  return sqrtf (r);
>}
>[hjl@gnu-skx-1 pr88713]$ gcc -Ofast -S s.i
>[hjl@gnu-skx-1 pr88713]$ cat s.s
>        .file   "s.i"
>        .text
>        .p2align 4,,15
>        .globl  rsqrt
>        .type   rsqrt, @function
>rsqrt:
>.LFB0:
>        .cfi_startproc
>        sqrtss  %xmm0, %xmm0
>        ret
>        .cfi_endproc
>.LFE0:
>        .size   rsqrt, .-rsqrt
>        .ident  "GCC: (GNU) 8.2.1 20190109 (Red Hat 8.2.1-7)"
>        .section        .note.GNU-stack,"",@progbits
>[hjl@gnu-skx-1 pr88713]$ 
>
>But why don't we generate sqrtps for vector sqrtf?

That's the default for -mrecip. Back when we benchmarked it, scalar recip
made something miscompare.

>
>[hjl@gnu-skx-1 pr88713]$ cat y.i
>extern float sqrtf(float x);
>
>void
>rsqrt(float* restrict r, float* restrict a){
>    for (int i = 0; i < 16; i++){
>        r[i] = sqrtf(a[i]);
>    }
>}
>[hjl@gnu-skx-1 pr88713]$ gcc -S -Ofast y.i 
>[hjl@gnu-skx-1 pr88713]$ cat y.s
>        .file   "y.i"
>        .text
>        .p2align 4,,15
>        .globl  rsqrt
>        .type   rsqrt, @function
>rsqrt:
>.LFB0:
>        .cfi_startproc
>        movups  (%rsi), %xmm1
>        pxor    %xmm2, %xmm2
>        movaps  .LC0(%rip), %xmm4
>        movaps  %xmm2, %xmm3
>        rsqrtps %xmm1, %xmm0
>        cmpneqps        %xmm1, %xmm3
>        movaps  %xmm1, %xmm5
>        andps   %xmm3, %xmm0
>        movaps  .LC1(%rip), %xmm3
>        mulps   %xmm0, %xmm5
>        mulps   %xmm5, %xmm0
>        mulps   %xmm3, %xmm5
>        movaps  %xmm0, %xmm1
>        movups  16(%rsi), %xmm0
>        addps   %xmm4, %xmm1
>        mulps   %xmm5, %xmm1
>        movaps  %xmm2, %xmm5
>        cmpneqps        %xmm0, %xmm5
>        movups  %xmm1, (%rdi)
>        rsqrtps %xmm0, %xmm1
>        andps   %xmm5, %xmm1
>        movaps  %xmm2, %xmm5
>        mulps   %xmm1, %xmm0
>        mulps   %xmm0, %xmm1
>        mulps   %xmm3, %xmm0
>        addps   %xmm4, %xmm1
>        mulps   %xmm0, %xmm1
>        movups  32(%rsi), %xmm0
>        cmpneqps        %xmm0, %xmm5
>        movups  %xmm1, 16(%rdi)
>        rsqrtps %xmm0, %xmm1
>        andps   %xmm5, %xmm1
>        mulps   %xmm1, %xmm0
>        mulps   %xmm0, %xmm1
>        mulps   %xmm3, %xmm0
>        addps   %xmm4, %xmm1
>        mulps   %xmm0, %xmm1
>        movups  %xmm1, 32(%rdi)
>        movups  48(%rsi), %xmm1
>        rsqrtps %xmm1, %xmm0
>        cmpneqps        %xmm1, %xmm2
>        andps   %xmm2, %xmm0
>        mulps   %xmm0, %xmm1
>        mulps   %xmm1, %xmm0
>        mulps   %xmm3, %xmm1
>        addps   %xmm4, %xmm0
>        mulps   %xmm1, %xmm0
>        movups  %xmm0, 48(%rdi)
>        ret
>        .cfi_endproc
>.LFE0:
>        .size   rsqrt, .-rsqrt
>        .section        .rodata.cst16,"aM",@progbits,16
>        .align 16
>.LC0:
>        .long   3225419776
>        .long   3225419776
>        .long   3225419776
>        .long   3225419776
>        .align 16
>.LC1:
>        .long   3204448256
>        .long   3204448256
>        .long   3204448256
>        .long   3204448256
>        .ident  "GCC: (GNU) 8.2.1 20190109 (Red Hat 8.2.1-7)"
>        .section        .note.GNU-stack,"",@progbits
>[hjl@gnu-skx-1 pr88713]$

[Bug tree-optimization/88713] Vectorized code slow vs. flang

Reply via email to