In the following trivial test case, gcc-4.1 produces very inefficient code for the loop. gcc-3.3 produces much better code.
typedef int __m64 __attribute__ ((__vector_size__ (8))); __m64 unsigned_add3( const __m64 *a, const __m64 *b, unsigned long count ) { __m64 sum; unsigned int i; for( i = 1; i < count; i++ ) { sum = (__m64) __builtin_ia32_paddq ((long long)a[i], (long long)b[i]); } return sum; } 1) Loop when compiled with gcc-4.1 -O2 -msse2 (note in particular the extra movq to memory): L4: movl 12(%ebp), %esi movq (%eax,%edx,8), %mm0 paddq (%esi,%edx,8), %mm0 incl %edx cmpl %edx, %ecx movq %mm0, -16(%ebp) movl -16(%ebp), %esi movl -12(%ebp), %edi jne L4 2) Loop using gcc-3.3 compiled with -O2 -msse2: L6: movq (%esi,%edx,8), %mm0 paddq (%eax,%edx,8), %mm0 addl $1, %edx cmpl %ecx, %edx jb L6 AFAICT, culprit is reload which generates extra load and store of %mm0: (insn 62 30 63 2 (set (mem:V2SI (plus:SI (reg/f:SI 6 bp) (const_int -16 [0xfffffffffffffff0])) [0 S8 A8]) (reg:V2SI 29 mm0)) 736 {*movv2si_internal} (nil) (nil)) (insn 63 62 32 2 (set (reg/v:V2SI 4 si [orig:61 sum ] [61]) (mem:V2SI (plus:SI (reg/f:SI 6 bp) (const_int -16 [0xfffffffffffffff0])) [0 S8 A8])) 736 {*movv2si_internal} (nil) (nil)) Here is the larger test case from which above test was extracted: #include <xmmintrin.h> __m64 unsigned_add3( const __m64 *a, const __m64 *b, __m64 *result, unsigned long count ) { __m64 carry, temp, sum, one, onesCarry, _a, _b; unsigned int i; if( count > 0 ) { _a = a[0]; _b = b[0]; one = _mm_cmpeq_pi8( _a, _a ); //-1 one = _mm_sub_si64( _mm_xor_si64( one, one ), one ); //1 sum = _mm_add_si64( _a, _b ); onesCarry = _mm_and_si64( _a, _b ); //the 1's bit is set only if the 1's bit add generates a carry onesCarry = _mm_and_si64( onesCarry, one ); //onesCarry &= 1 //Trim off the one's bit on both vA and vB to make room for a carry bit at the top after the add _a = _mm_srli_si64( _a, 1 ); //vA >>= 1 _b = _mm_srli_si64( _b, 1 ); //vB >>= 1 //Add vA to vB and add the carry bit carry = _mm_add_si64( _a, _b ); carry = _mm_add_si64( carry, onesCarry ); //right shift by 63 bits to get the carry bit 
for the high 64 bit quantity carry = _mm_srli_si64( carry, 63 ); for( i = 1; i < count; i++ ) { result[i-1] = sum; _a = a[i]; _b = b[i]; onesCarry = _mm_and_si64( _a, _b ); onesCarry = _mm_and_si64( onesCarry, one ); sum = _mm_add_si64( _a, _b ); _a = _mm_add_si64( _a, onesCarry ); onesCarry = _mm_and_si64( carry, _a ); //find low bit carry sum = _mm_add_si64( sum, carry ); //add in carry bit to low word sum carry = _mm_add_si64( _a, onesCarry ); //add in low bit carry to high result } result[i-1] = sum; } return carry; } Again, gcc-3.3 produces much better code for this loop. -- Summary: Poor loop optimization when using sse2 builtins - regression from 3.3 Product: gcc Version: 4.1.0 Status: UNCONFIRMED Severity: normal Priority: P2 Component: rtl-optimization AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: fjahanian at apple dot com CC: gcc-bugs at gcc dot gnu dot org GCC build triplet: apple-x86-darwin GCC host triplet: apple-x86-darwin GCC target triplet: apple-x86-darwin http://gcc.gnu.org/bugzilla/show_bug.cgi?id=22152