[Bug tree-optimization/63945] Missing vectorization optimization

2019-10-16 Thread witold.baryluk+gcc at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63945

Witold Baryluk  changed:

   What|Removed |Added

 CC||witold.baryluk+gcc at gmail dot com

--- Comment #1 from Witold Baryluk  ---
It does vectorize for me on gcc 9.2.1:

-march=skylake-avx512

aa.cpp:34:29: optimized: loop vectorized using 32 byte vectors
aa.cpp:25:27: optimized: loop vectorized using 32 byte vectors


  if (val<100.)
1279:   c5 fb 10 0b vmovsd (%rbx),%xmm1
127d:   c5 fb 10 05 8b 0d 00    vmovsd 0xd8b(%rip),%xmm0        # 2010 <_IO_stdin_used+0x10>
1284:   00 
1285:   c5 f9 2f c1 vcomisd %xmm1,%xmm0
1289:   76 2b   jbe12b6 <_ZN4TEST4testEv+0xc6>
128b:   c4 e2 7d 19 c9  vbroadcastsd %xmm1,%ymm1
1290:   31 c0   xor%eax,%eax
1292:   66 0f 1f 44 00 00   nopw   0x0(%rax,%rax,1)
  c[i] = val*a[i]+b[i];
1298:   c4 c1 7d 10 04 04   vmovupd (%r12,%rax,1),%ymm0
129e:   c4 c2 f5 a8 44 05 00    vfmadd213pd 0x0(%r13,%rax,1),%ymm1,%ymm0
12a5:   c5 fd 11 04 07  vmovupd %ymm0,(%rdi,%rax,1)
for (unsigned int i=0; i
::operator delete(__p);
12b6:   c5 f8 77vzeroupper 


Similarly:

-march=knm

aa.cpp:34:29: optimized: loop vectorized using 64 byte vectors
aa.cpp:25:27: optimized: loop vectorized using 64 byte vectors

  if (val<100.)
15bc:   31 c0   xor%eax,%eax
15be:   66 90   xchg   %ax,%ax
  c[i] = val*a[i]+b[i];
15c0:   62 f1 fd 48 28 04 01vmovapd (%rcx,%rax,1),%zmm0
15c7:   62 f2 ed 48 a8 04 06vfmadd213pd (%rsi,%rax,1),%zmm2,%zmm0
15ce:   62 d1 fd 48 11 04 01vmovupd %zmm0,(%r9,%rax,1)
for (unsigned int i=0; i

(plus a lot of handling for an unaligned stack).

-march=znver2

aa.cpp:34:29: optimized: loop vectorized using 32 byte vectors
aa.cpp:25:27: optimized: loop vectorized using 32 byte vectors

  if (val<100.)
1279:   c5 fb 10 0b vmovsd (%rbx),%xmm1
127d:   c5 fb 10 05 8b 0d 00    vmovsd 0xd8b(%rip),%xmm0        # 2010 <_IO_stdin_used+0x10>
1284:   00 
1285:   c5 f9 2f c1 vcomisd %xmm1,%xmm0
1289:   76 33   jbe12be <_ZN4TEST4testEv+0xce>
128b:   c4 e2 7d 19 c9  vbroadcastsd %xmm1,%ymm1
1290:   31 c0   xor%eax,%eax
1292:   66 66 2e 0f 1f 84 00data16 nopw %cs:0x0(%rax,%rax,1)
1299:   00 00 00 00 
129d:   0f 1f 00nopl   (%rax)
  c[i] = val*a[i]+b[i];
12a0:   c4 c1 7d 10 04 04   vmovupd (%r12,%rax,1),%ymm0
12a6:   c4 c2 f5 a8 44 05 00    vfmadd213pd 0x0(%r13,%rax,1),%ymm1,%ymm0
12ad:   c5 fd 11 04 07  vmovupd %ymm0,(%rdi,%rax,1)
for (unsigned int i=0; i

-march=core2

aa.cpp:34:29: optimized: loop vectorized using 16 byte vectors
aa.cpp:25:27: optimized: loop vectorized using 16 byte vectors

  if (val<100.)
1276:   f2 0f 10 13 movsd  (%rbx),%xmm2
127a:   f2 0f 10 05 8e 0d 00    movsd  0xd8e(%rip),%xmm0        # 2010 <_IO_stdin_used+0x10>
1281:   00 
1282:   66 0f 2f c2 comisd %xmm2,%xmm0
1286:   76 40   jbe12c8 <_ZN4TEST4testEv+0xd8>
1288:   31 c0   xor%eax,%eax
128a:   66 0f 14 d2 unpcklpd %xmm2,%xmm2
128e:   66 90   xchg   %ax,%ax
  c[i] = val*a[i]+b[i];
1290:   f3 0f 7e 44 05 00   movq   0x0(%rbp,%rax,1),%xmm0
1296:   f3 41 0f 7e 0c 04   movq   (%r12,%rax,1),%xmm1
129c:   66 0f 16 44 05 08   movhpd 0x8(%rbp,%rax,1),%xmm0
12a2:   66 0f 59 c2 mulpd  %xmm2,%xmm0
12a6:   66 41 0f 16 4c 04 08movhpd 0x8(%r12,%rax,1),%xmm1
12ad:   66 0f 58 c1 addpd  %xmm1,%xmm0
12b1:   66 0f 13 04 07  movlpd %xmm0,(%rdi,%rax,1)
12b6:   66 0f 17 44 07 08   movhpd %xmm0,0x8(%rdi,%rax,1)
for (unsigned int i=0; i



It all looks pretty optimally vectorized to me.

The code can be made even better if you ensure proper alignment of the std::vector
arrays, which might not be properly aligned at the moment.

[Bug tree-optimization/63945] Missing vectorization optimization

2016-09-12 Thread pinskia at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63945

Andrew Pinski  changed:

   What|Removed |Added

   Keywords||missed-optimization
   Severity|normal  |enhancement