http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55723



vincenzo Innocente <vincenzo.innocente at cern dot ch> changed:



           What    |Removed                     |Added

----------------------------------------------------------------------------

            Summary|SLP vectorization vs loop:  |loop vectorization

                   |SLP more efficient: loop    |inefficient in presence of

                   |vectorization inefficient   |multiple identical

                   |in presence of multiple     |conditions

                   |"blends"                    |



--- Comment #2 from vincenzo Innocente <vincenzo.innocente at cern dot ch> 
2012-12-20 15:39:13 UTC ---

It seems that in the presence of identical conditions the vectorizer prefers to

compute two "full" branches

and do just one blend.

This is not always the most efficient choice, as the benchmark in comment 1

demonstrates.



Another simple example:

for bar, the generated code contains two rsqrtps and one blend;

for foo, it contains one rsqrtps and two blends.



#include<cmath>

// Input array: bar() and foo() only read from it.
float a[1024];

// Output array: bar() and foo() store their per-element result here.
float b[1024];





// Reproducer, variant 1: both `if` statements test the SAME condition
// (a[i] > 3.14f).
// For every index 0..1023: z starts as a[i] and is reduced by 1 when
// a[i] > 3.14f; b[i] is set to 1/sqrt(z) and is then reduced by 1 under that
// same condition.
// The source form is deliberately kept as-is: the report is about the code
// the vectorizer generates for exactly this shape of loop.
void bar(){

  for (int i=0;i!=1024;++i) {

    auto z = a[i];

    // First conditional update, applied to the sqrt argument.
    if (a[i] > 3.14f) z-=1.f;

    b[i] = 1.f/std::sqrt(z);

    // Second conditional update, identical predicate to the first one.
    if (a[i] > 3.14f) b[i]-=1.f;

  }

}



// Reproducer, variant 2: same loop as bar(), except that the two `if`
// statements test DIFFERENT thresholds (3.14f first, 1.f second).
// For every index 0..1023: z starts as a[i] and is reduced by 1 when
// a[i] > 3.14f; b[i] is set to 1/sqrt(z) and is then reduced by 1 when
// a[i] > 1.f.
// The source form is deliberately kept as-is: the report is about the code
// the vectorizer generates for exactly this shape of loop.
void foo(){

  for (int i=0;i!=1024;++i) {

    auto z = a[i];

    // First conditional update, applied to the sqrt argument.
    if (a[i] > 3.14f) z-=1.f;

    b[i] = 1.f/std::sqrt(z);

    // Second conditional update: note the threshold is 1.f here, not 3.14f.
    if (a[i] > 1.f) b[i]-=1.f;

  }

}



c++ -std=c++11 -Ofast -march=corei7 -S twoif.cc -ftree-vectorizer-verbose=1 

-ftree-loop-if-convert-stores; cat twoif.s | c++filt





bar():

LFB221:

    movaps    LC0(%rip), %xmm6

    leaq    signed char(%rip), %rax

    movaps    LC1(%rip), %xmm5

    leaq    bool(%rip), %rdx

    movaps    LC2(%rip), %xmm4

    leaq    4096+signed char(%rip), %rcx

    movaps    LC3(%rip), %xmm7

    .align 4,0x90

L3:

    movaps    (%rax), %xmm0

    addq    $16, %rax

    addq    $16, %rdx

    rsqrtps    %xmm0, %xmm3

    movaps    %xmm0, %xmm2

    subps    %xmm6, %xmm2

    rsqrtps    %xmm2, %xmm1

    mulps    %xmm1, %xmm2

    mulps    %xmm1, %xmm2

    mulps    %xmm4, %xmm1

    addps    %xmm5, %xmm2

    mulps    %xmm1, %xmm2

    movaps    %xmm3, %xmm1

    mulps    %xmm0, %xmm1

    subps    %xmm6, %xmm2

    mulps    %xmm3, %xmm1

    mulps    %xmm4, %xmm3

    addps    %xmm5, %xmm1

    mulps    %xmm3, %xmm1

    movaps    %xmm7, %xmm3

    cmpltps    %xmm0, %xmm3

    movaps    %xmm3, %xmm0

    blendvps    %xmm0, %xmm2, %xmm1

    movaps    %xmm1, -16(%rdx)

    cmpq    %rcx, %rax

    jne    L3

    rep; ret

LFE221:

    .align 4,0x90

    .globl foo()

foo():

LFB222:

    movaps    LC3(%rip), %xmm7

    leaq    signed char(%rip), %rax

    movaps    LC0(%rip), %xmm3

    leaq    bool(%rip), %rdx

    movaps    LC1(%rip), %xmm6

    leaq    4096+signed char(%rip), %rcx

    movaps    LC2(%rip), %xmm5

    .align 4,0x90

L7:

    movaps    (%rax), %xmm2

    movaps    %xmm7, %xmm0

    addq    $16, %rax

    addq    $16, %rdx

    movaps    %xmm2, %xmm1

    cmpltps    %xmm2, %xmm0

    movaps    %xmm2, %xmm4

    subps    %xmm3, %xmm1

    blendvps    %xmm0, %xmm1, %xmm4

    rsqrtps    %xmm4, %xmm0

    movaps    %xmm4, %xmm1

    mulps    %xmm0, %xmm1

    mulps    %xmm0, %xmm1

    mulps    %xmm5, %xmm0

    addps    %xmm6, %xmm1

    mulps    %xmm0, %xmm1

    movaps    %xmm3, %xmm0

    cmpltps    %xmm2, %xmm0

    movaps    %xmm1, %xmm4

    subps    %xmm3, %xmm4

    blendvps    %xmm0, %xmm4, %xmm1

    movaps    %xmm1, -16(%rdx)

    cmpq    %rcx, %rax

    jne    L7

    rep; ret

Reply via email to