https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88713
--- Comment #47 from rguenther at suse dot de <rguenther at suse dot de> --- On January 23, 2019 5:13:12 PM GMT+01:00, "hjl.tools at gmail dot com" <gcc-bugzi...@gcc.gnu.org> wrote: >https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88713 > >--- Comment #46 from H.J. Lu <hjl.tools at gmail dot com> --- >We generate sqrtps for scalar sqrtf: > >[hjl@gnu-skx-1 pr88713]$ cat s.i >extern float sqrtf(float x); > >float >rsqrt(float r) >{ > return sqrtf (r); >} >[hjl@gnu-skx-1 pr88713]$ gcc -Ofast -S s.i >[hjl@gnu-skx-1 pr88713]$ cat s.s > .file "s.i" > .text > .p2align 4,,15 > .globl rsqrt > .type rsqrt, @function >rsqrt: >.LFB0: > .cfi_startproc > sqrtss %xmm0, %xmm0 > ret > .cfi_endproc >.LFE0: > .size rsqrt, .-rsqrt > .ident "GCC: (GNU) 8.2.1 20190109 (Red Hat 8.2.1-7)" > .section .note.GNU-stack,"",@progbits >[hjl@gnu-skx-1 pr88713]$ > >But why don't we generate sqrtps for vector sqrtf? That's the default for -mrecip; back when we benchmarked it, scalar recip miscompared something. > >[hjl@gnu-skx-1 pr88713]$ cat y.i >extern float sqrtf(float x); > >void >rsqrt(float* restrict r, float* restrict a){ > for (int i = 0; i < 16; i++){ > r[i] = sqrtf(a[i]); > } >} >[hjl@gnu-skx-1 pr88713]$ gcc -S -Ofast y.i >[hjl@gnu-skx-1 pr88713]$ cat y.s > .file "y.i" > .text > .p2align 4,,15 > .globl rsqrt > .type rsqrt, @function >rsqrt: >.LFB0: > .cfi_startproc > movups (%rsi), %xmm1 > pxor %xmm2, %xmm2 > movaps .LC0(%rip), %xmm4 > movaps %xmm2, %xmm3 > rsqrtps %xmm1, %xmm0 > cmpneqps %xmm1, %xmm3 > movaps %xmm1, %xmm5 > andps %xmm3, %xmm0 > movaps .LC1(%rip), %xmm3 > mulps %xmm0, %xmm5 > mulps %xmm5, %xmm0 > mulps %xmm3, %xmm5 > movaps %xmm0, %xmm1 > movups 16(%rsi), %xmm0 > addps %xmm4, %xmm1 > mulps %xmm5, %xmm1 > movaps %xmm2, %xmm5 > cmpneqps %xmm0, %xmm5 > movups %xmm1, (%rdi) > rsqrtps %xmm0, %xmm1 > andps %xmm5, %xmm1 > movaps %xmm2, %xmm5 > mulps %xmm1, %xmm0 > mulps %xmm0, %xmm1 > mulps %xmm3, %xmm0 > addps %xmm4, %xmm1 > mulps %xmm0, %xmm1 > movups 32(%rsi), %xmm0 > cmpneqps
%xmm0, %xmm5 > movups %xmm1, 16(%rdi) > rsqrtps %xmm0, %xmm1 > andps %xmm5, %xmm1 > mulps %xmm1, %xmm0 > mulps %xmm0, %xmm1 > mulps %xmm3, %xmm0 > addps %xmm4, %xmm1 > mulps %xmm0, %xmm1 > movups %xmm1, 32(%rdi) > movups 48(%rsi), %xmm1 > rsqrtps %xmm1, %xmm0 > cmpneqps %xmm1, %xmm2 > andps %xmm2, %xmm0 > mulps %xmm0, %xmm1 > mulps %xmm1, %xmm0 > mulps %xmm3, %xmm1 > addps %xmm4, %xmm0 > mulps %xmm1, %xmm0 > movups %xmm0, 48(%rdi) > ret > .cfi_endproc >.LFE0: > .size rsqrt, .-rsqrt > .section .rodata.cst16,"aM",@progbits,16 > .align 16 >.LC0: > .long 3225419776 > .long 3225419776 > .long 3225419776 > .long 3225419776 > .align 16 >.LC1: > .long 3204448256 > .long 3204448256 > .long 3204448256 > .long 3204448256 > .ident "GCC: (GNU) 8.2.1 20190109 (Red Hat 8.2.1-7)" > .section .note.GNU-stack,"",@progbits >[hjl@gnu-skx-1 pr88713]$