On Saturday 23 February 2008 Uros Bizjak wrote:
> Hello!
>
> >   f7:   0f 7f 5c 24 f0          movq   %mm3,-0x10(%rsp)
> >   fc:   0f 7f 54 24 f8          movq   %mm2,-0x8(%rsp)
> >  101:   48 8b 5c 24 f8          mov    -0x8(%rsp),%rbx
> >  106:   48 89 5c 38 40          mov    %rbx,0x40(%rax,%rdi,1)
> >  10b:   48 8b 5c 24 f0          mov    -0x10(%rsp),%rbx
> >  110:   48 89 5c 38 48          mov    %rbx,0x48(%rax,%rdi,1)
> >
> > As you can see, in the intrinsic version gcc moves the mmx registers to
> > the stack, reloads from the stack and writes to the destination. Why?
> >
> > I don't know whether earlier gcc 4.2 versions produced such stupid code.
> > Compiling as 32-bit shows similar stupidity, though gcc reloads into an
> > mmx register...
>
> This is a variant of "Strange code for MMX register moves" [1] or its
> dupe "mmx and movd/movq on x86_64" [2]. Since touching a %mm register
> switches the x87 register stack to MMX mode, we penalize mmx moves
> severely in order to prevent gcc from ever allocating %mm for DImode
> moves unless really necessary.

[...]
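For context, the intrinsic version under discussion is essentially the __m64
analogue of the SSE2 loop below. A minimal sketch of that kind of code (a
hypothetical reconstruction, not the exact original):

#include <stdint.h>
#include <mmintrin.h>

/* Hypothetical __m64 analogue of the SSE2 loop below, shown only to
 * illustrate the kind of 64-bit (DImode) moves through %mm registers
 * that the quoted penalty is meant to discourage. */
void diff_pixels_mmx(char *block, const uint8_t *s1, const uint8_t *s2,
                     long stride)
{
        long offset = -128;
        block += 64;
        __m64 mm7 = _mm_setzero_si64();
        do {
                __m64 mm0 = *(const __m64 *)s1;
                __m64 mm2 = *(const __m64 *)s2;
                __m64 mm1 = mm0;
                __m64 mm3 = mm2;
                /* zero-extend bytes to 16-bit words */
                mm0 = _mm_unpacklo_pi8(mm0, mm7);
                mm1 = _mm_unpackhi_pi8(mm1, mm7);
                mm2 = _mm_unpacklo_pi8(mm2, mm7);
                mm3 = _mm_unpackhi_pi8(mm3, mm7);
                mm0 = _mm_sub_pi16(mm0, mm2);
                mm1 = _mm_sub_pi16(mm1, mm3);
                *(__m64 *)(block + offset) = mm0;
                *(__m64 *)(block + offset + 8) = mm1;
                s1 += stride;
                s2 += stride;
                offset += 16;
        } while (offset < 0);
        /* touching %mm switched the x87 stack to MMX mode, so emms is
         * required before any later x87 floating-point code */
        _mm_empty();
}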

Just as a side note: the equivalent SSE2 code looks fine, but I have a
question regarding the store instruction used:

#include <stdint.h>
#include <emmintrin.h>

void diff_pixels_mmx4(char *block, const uint8_t *s1, const uint8_t *s2,
                      long stride)
{
        long offset = -128;
        block += 64;
        __m128i mm7 = _mm_setzero_si128();
        do {
                /* loads and stores assume 16-byte alignment */
                __m128i mm0 = *(const __m128i *)s1;
                __m128i mm2 = *(const __m128i *)s2;
                __m128i mm1 = mm0;
                __m128i mm3 = mm2;
                /* zero-extend bytes to 16-bit words */
                mm0 = _mm_unpacklo_epi8(mm0, mm7);
                mm1 = _mm_unpackhi_epi8(mm1, mm7);
                mm2 = _mm_unpacklo_epi8(mm2, mm7);
                mm3 = _mm_unpackhi_epi8(mm3, mm7);
                mm0 = _mm_sub_epi16(mm0, mm2);
                mm1 = _mm_sub_epi16(mm1, mm3);
                *(__m128i *)(block + offset) = mm0;
                *(__m128i *)(block + offset + 16) = mm1;
                s1 += stride;
                s2 += stride;
                offset += 32;
        } while (offset < 0);
}

generated assembly (-O2 -march=k8):

0000000000000050 <diff_pixels_mmx4>:
  50:   66 0f ef e4             pxor   %xmm4,%xmm4
  54:   48 c7 c0 80 ff ff ff    mov    $0xffffffffffffff80,%rax
  5b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  60:   66 0f 6f 0e             movdqa (%rsi),%xmm1
  64:   48 01 ce                add    %rcx,%rsi
  67:   66 0f 6f 02             movdqa (%rdx),%xmm0
  6b:   48 01 ca                add    %rcx,%rdx
  6e:   66 0f 6f d1             movdqa %xmm1,%xmm2
  72:   66 0f 6f d8             movdqa %xmm0,%xmm3
  76:   66 0f 68 cc             punpckhbw %xmm4,%xmm1
  7a:   66 0f 60 d4             punpcklbw %xmm4,%xmm2
  7e:   66 0f 60 dc             punpcklbw %xmm4,%xmm3
  82:   66 0f 68 c4             punpckhbw %xmm4,%xmm0
  86:   66 0f f9 d3             psubw  %xmm3,%xmm2
  8a:   0f 29 54 38 40          movaps %xmm2,0x40(%rax,%rdi,1)
  8f:   66 0f f9 c8             psubw  %xmm0,%xmm1
  93:   0f 29 4c 38 50          movaps %xmm1,0x50(%rax,%rdi,1)
  98:   48 83 c0 20             add    $0x20,%rax
  9c:   75 c2                   jne    60 <diff_pixels_mmx4+0x10>
  9e:   f3 c3                   repz retq

Why is movaps (SSE, floating-point data) used for the store instead of
movdqa (SSE2, integer data)? Bug or feature? It is used even when compiling
with -O0.
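For completeness, the same loop with the loads and stores spelled out as
explicit intrinsics — a sketch (the name diff_pixels_sse2 is just mine),
where _mm_store_si128 is the documented integer store, though the compiler
is of course still free to pick whichever store mnemonic it likes:

#include <stdint.h>
#include <emmintrin.h>

/* same loop with explicit load/store intrinsics; _mm_load_si128 and
 * _mm_store_si128 make the 16-byte alignment requirement explicit */
void diff_pixels_sse2(char *block, const uint8_t *s1, const uint8_t *s2,
                      long stride)
{
        long offset = -128;
        block += 64;
        const __m128i zero = _mm_setzero_si128();
        do {
                __m128i a = _mm_load_si128((const __m128i *)s1);
                __m128i b = _mm_load_si128((const __m128i *)s2);
                /* zero-extend bytes to words, subtract in 16-bit lanes */
                __m128i lo = _mm_sub_epi16(_mm_unpacklo_epi8(a, zero),
                                           _mm_unpacklo_epi8(b, zero));
                __m128i hi = _mm_sub_epi16(_mm_unpackhi_epi8(a, zero),
                                           _mm_unpackhi_epi8(b, zero));
                _mm_store_si128((__m128i *)(block + offset), lo);
                _mm_store_si128((__m128i *)(block + offset + 16), hi);
                s1 += stride;
                s2 += stride;
                offset += 32;
        } while (offset < 0);
}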

Regards,
-- 
(°=                 =°)
//\ Prakash Punnoor /\\
V_/                 \_V
