https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103797
Jan Hubicka <hubicka at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|WAITING |NEW
--- Comment #7 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
OK, here is a completely fake testcase that does similar operations:
#include <math.h>
struct test {float x; float y; float z;} test; /* three float fields; `test` is the global instance that t() reads and writes */
float f; /* global scalar that t() multiplies by and then divides by */
void
t() /* for each field of `test`: multiply by f, take the square root, divide by f, store back */
{
float x = test.x; /* copy the three fields into locals */
float y = test.y;
float z = test.z;
x = x * f; /* step 1: the same multiply by f on all three values */
y = y * f;
z = z * f;
x = sqrt (x); /* step 2: sqrt from <math.h> (the double function); result is assigned back to float */
y = sqrt (y);
z = sqrt (z);
x = x / f; /* step 3: the same divide by f on all three values */
y = y / f;
z = z / f;
test.x=x; /* write the three results back to the global */
test.y=y;
test.z=z;
}
We seem to fail to vectorize it with:
t.c:20:9: missed: op not supported by target.
t.c:17:5: missed: not vectorized: relevant stmt not supported: x_15 = x_24 /
f.0_1;
clang seems to use divps happily, so I am not sure why it is not supported.
Even funnier, with -Ofast it is compiled into a multiplication by the
reciprocal:
t:
.LFB0:
.cfi_startproc
movss f(%rip), %xmm4 # xmm4 = f
movss .LC0(%rip), %xmm2 # xmm2 = constant .LC0 (presumably 1.0f; its definition is not shown)
movss test(%rip), %xmm0 # xmm0 = float at test+0 (test.x)
movss test+4(%rip), %xmm3 # xmm3 = float at test+4 (test.y)
divss %xmm4, %xmm2 # xmm2 = .LC0 / f: the only division, done once
movss test+8(%rip), %xmm1 # xmm1 = float at test+8 (test.z)
mulss %xmm4, %xmm0 # three scalar multiplies by f
mulss %xmm4, %xmm3
mulss %xmm4, %xmm1
sqrtss %xmm0, %xmm0 # three scalar square roots
sqrtss %xmm3, %xmm3
sqrtss %xmm1, %xmm1
mulss %xmm2, %xmm0 # three scalar multiplies by the quotient in xmm2, replacing the source's divides
mulss %xmm2, %xmm3
mulss %xmm2, %xmm1
unpcklps %xmm3, %xmm0 # interleave low lanes: xmm0 low 64 bits = {x result, y result}
movlps %xmm0, test(%rip) # store those 64 bits to test+0 .. test+7
movss %xmm1, test+8(%rip) # store the z result to test+8
ret
and rewriting it that way by hand:
#include <math.h>
struct test {float x; float y; float z;} test; /* three float fields; `test` is the global instance that t() reads and writes */
float f; /* global scalar that t() multiplies by and takes the reciprocal of */
void
t() /* same computation as a divide-by-f version, but with the divide written as a multiply by m = 1/f */
{
float x = test.x; /* copy the three fields into locals */
float y = test.y;
float z = test.z;
float m = 1/f; /* reciprocal of f, computed once and reused below */
x = x * f; /* step 1: the same multiply by f on all three values */
y = y * f;
z = z * f;
x = sqrt (x); /* step 2: sqrt from <math.h> (the double function); result is assigned back to float */
y = sqrt (y);
z = sqrt (z);
x = x * m; /* step 3: multiply by the reciprocal instead of dividing by f */
y = y * m;
z = z * m;
test.x=x; /* write the three results back to the global */
test.y=y;
test.z=z;
}
gets the expected result:
t:
.LFB0:
.cfi_startproc
movss f(%rip), %xmm0 # xmm0 = f (scalar load; upper lanes zeroed)
movq test(%rip), %xmm1 # xmm1 low 64 bits = floats at test+0 and test+4 (test.x, test.y)
movaps %xmm0, %xmm2 # xmm2 = copy of f
shufps $0xe0, %xmm2, %xmm2 # imm 0xe0 picks lanes {0,0,2,3}: f duplicated into lanes 0 and 1
mulps %xmm1, %xmm2 # packed multiply: lanes 0,1 = {x*f, y*f}
movss .LC0(%rip), %xmm1 # xmm1 = constant .LC0 (presumably 1.0f; its definition is not shown)
divss %xmm0, %xmm1 # xmm1 = .LC0 / f, i.e. m
mulss test+8(%rip), %xmm0 # xmm0 = f * test.z (third value handled as a scalar)
sqrtps %xmm2, %xmm2 # packed square root of the x and y lanes
sqrtss %xmm0, %xmm0 # scalar square root of the z value
movaps %xmm1, %xmm3 # xmm3 = copy of m
shufps $0xe0, %xmm3, %xmm3 # m duplicated into lanes 0 and 1
mulss %xmm0, %xmm1 # xmm1 = m * sqrt(z*f): scalar z result
mulps %xmm3, %xmm2 # packed multiply of the x and y lanes by m
movss %xmm1, test+8(%rip) # store the z result to test+8
movlps %xmm2, test(%rip) # store the x and y results (low 64 bits) to test+0 .. test+7
ret
.cfi_endproc
Even with this, however, I do not see SLP analyzing the divide in the original
code at all.