Under some conditions (generally when you upset the inlining heuristic, e.g. by
force-inlining something), SSE intrinsics don't get inlined and some truly
horrible code ensues; the fix, tinkering with params, isn't much prettier.
This has happened to me with various 4.x versions, on x86 or x86-64.

silly testcase:
#include <xmmintrin.h>



/*
 * Test-case helper: applies four SSE operations to the pair (a, b) --
 * max, min, multiply and divide -- ORs the four result vectors together
 * and returns the movemask of the combined vector converted to bool
 * (true when the mask is non-zero).
 *
 * The always_inline attribute is deliberate: force-inlining is the trigger
 * this report names for upsetting the inlining heuristic, so it must stay
 * for the test case to remain meaningful.
 */
static __attribute__ ((always_inline)) bool bloatit(const __m128 a, const __m128
b) {

        const __m128

                v0 = _mm_max_ps(a,b),

                v1 = _mm_min_ps(a,b),

                v2 = _mm_mul_ps(a,b),

                v3 = _mm_div_ps(a,b),

                /* bitwise OR of all four intermediate vectors */
                g0 = _mm_or_ps(_mm_or_ps(_mm_or_ps(v0,v1), v2), v3);

        

        /* the int mask is implicitly converted to the bool return type */
        return _mm_movemask_ps(g0);

}



/*
 * Externally visible function of the test case: evaluates bloatit() on seven
 * pairs drawn from the six vector arguments and combines the results with
 * bitwise '&'. Because '&' (not '&&') is used there is no short-circuiting,
 * so all seven bloatit() calls are evaluated.
 *
 * This is the function whose disassembly in the report shows the SSE
 * intrinsics being called out of line instead of being inlined.
 */
bool finalblow(const __m128 a, const __m128 b, const __m128 c, const __m128 d,
const __m128 e, const __m128 f) {

        return bloatit(a,b) & bloatit(c,d) & bloatit(e,f) & bloatit(a,c) & 
bloatit(b,d)
& bloatit(c,e) & bloatit(d,f);

}


/* Does nothing and calls none of the functions above; presumably present
   only so the test case builds as a complete program. */
int main() { return 0; }


At -O3, on x86-64-linux, g++-4120050417 emits the intrinsics as standalone out-of-line functions:
0000000000400540 <_mm_mul_ps(float __vector, float __vector)>:
  400540:       mulps  %xmm1,%xmm0
  400543:       retq
...
0000000000400550 <_mm_div_ps(float __vector, float __vector)>:
  400550:       divps  %xmm1,%xmm0
  400553:       retq
...
0000000000400560 <_mm_min_ps(float __vector, float __vector)>:
  400560:       minps  %xmm1,%xmm0
  400563:       retq
...
0000000000400570 <_mm_max_ps(float __vector, float __vector)>:
  400570:       maxps  %xmm1,%xmm0
  400573:       retq
...
0000000000400580 <_mm_or_ps(float __vector, float __vector)>:
  400580:       orps   %xmm1,%xmm0
  400583:       retq
...
0000000000400590 <_mm_movemask_ps(float __vector)>:
  400590:       movmskps %xmm0,%eax
  400593:       retq

... only to conclude with this wonder, where every intrinsic is an actual call:
00000000004005b0 <finalblow(float __vector, float __vector, float __vector,
float __vector, float __vector, float __vector)>:
  4005b0:       push   %rbx
  4005b1:       xor    %ebx,%ebx
  4005b3:       sub    $0x1b0,%rsp
  4005ba:       movaps %xmm2,0x180(%rsp)
  4005c2:       movaps %xmm3,0x170(%rsp)
  4005ca:       movaps %xmm4,0x160(%rsp)
  4005d2:       movaps %xmm5,0x150(%rsp)
  4005da:       movaps %xmm1,0x190(%rsp)
  4005e2:       movaps %xmm0,0x1a0(%rsp)
  4005ea:       callq  400550 <_mm_div_ps(float __vector, float __vector)>
  4005ef:       movaps %xmm0,0x140(%rsp)
  4005f7:       movaps 0x190(%rsp),%xmm1
  4005ff:       movaps 0x1a0(%rsp),%xmm0
  400607:       callq  400540 <_mm_mul_ps(float __vector, float __vector)>
  40060c:       movaps 0x190(%rsp),%xmm1
  400614:       movaps %xmm0,0x130(%rsp)
  40061c:       movaps 0x1a0(%rsp),%xmm0
  400624:       callq  400560 <_mm_min_ps(float __vector, float __vector)>
  400629:       movaps 0x190(%rsp),%xmm1
  400631:       movaps %xmm0,0x120(%rsp)
  400639:       movaps 0x1a0(%rsp),%xmm0
  400641:       callq  400570 <_mm_max_ps(float __vector, float __vector)>
  400646:       movaps 0x120(%rsp),%xmm1
  40064e:       callq  400580 <_mm_or_ps(float __vector, float __vector)>
  400653:       movaps 0x130(%rsp),%xmm1
  40065b:       callq  400580 <_mm_or_ps(float __vector, float __vector)>
  400660:       movaps 0x140(%rsp),%xmm1
  400668:       callq  400580 <_mm_or_ps(float __vector, float __vector)>
  40066d:       callq  400590 <_mm_movemask_ps(float __vector)>
  400672:       movaps 0x170(%rsp),%xmm1
etc...


As said earlier, that's just one way to make that happen.
It would be a real plus if those intrinsics could be unconditionally inlined.

-- 
           Summary: SSE intrinsics not inlined, sometimes.
           Product: gcc
           Version: 4.1.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P2
         Component: other
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: tbptbp at gmail dot com
                CC: gcc-bugs at gcc dot gnu dot org
  GCC host triplet: x86*


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21195

Reply via email to