On Fri, 22 Apr 2011, Justin Ruggles wrote:
> Could someone test the patch below with a modern Intel CPU other than Atom?
> I'm getting slower results for the SSE version than the MMX version on
> Athlon64, but SSE is faster on Atom. I'm guessing it's another Athlon issue,
> but it could be something else...
penryn:
3656 +/- 3 6ch c
1475 +/- 3 6ch mmx
1516 +/- 1 6ch sse
1091 +/- 5 6ch sse patched
908 +/- 1 2ch c
468 +/- 2 2ch mmx
314 +/- 1 2ch sse
conroe:
3221 +/- 6 6ch c
1295 +/- 5 6ch mmx
1672 +/- 3 6ch sse
1010 +/- 5 6ch sse patched
818 +/- 1 2ch c
403 +/- 2 2ch mmx
296 +/- 3 2ch sse
I don't know why conroe got lower totals than penryn, when every single
instruction is equal or slower in isolation.
--Loren Merritt
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 9732197..722d56f 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -94,8 +94,15 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
+%macro BUTTERFLYPS 3
+ movaps m%3, m%1
+ unpcklps m%1, m%2
+ unpckhps m%3, m%2
+ SWAP %2, %3
+%endmacro
+
%macro FLOAT_INTERLEAVE6 1
-cglobal float_interleave6_%1, 2,7,4, dst, src, src1, src2, src3, src4, src5
+cglobal float_interleave6_%1, 2,7,7, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
%define lend r10d
mov lend, r2d
@@ -115,25 +122,29 @@ cglobal float_interleave6_%1, 2,7,4, dst, src, src1, src2, src3, src4, src5
sub src5q, srcq
.loop:
%ifidn %1, sse
- movlps m0, [srcq]
- movhps m0, [srcq+src3q]
- movlps m1, [srcq+src1q]
- movhps m1, [srcq+src4q]
- movlps m2, [srcq+src2q]
- movhps m2, [srcq+src5q]
- movlhps m3, m0
- movhlps m3, m0
- unpcklps m0, m1
- unpckhps m1, m2
- unpcklps m2, m3
- movlhps m3, m0
- movhlps m3, m0
+ movaps m0, [srcq]
+ movaps m1, [srcq+src1q]
+ movaps m2, [srcq+src2q]
+ movaps m3, [srcq+src3q]
+ movaps m4, [srcq+src4q]
+ movaps m5, [srcq+src5q]
+ BUTTERFLYPS 0, 1, 6
+ BUTTERFLYPS 2, 3, 6
+ BUTTERFLYPS 4, 5, 6
+ movaps m6, m4
+ shufps m4, m0, 0xe4
movlhps m0, m2
- shufps m2, m1, 0xee
- movlhps m1, m3
+ movhlps m6, m2
movaps [dstq ], m0
- movaps [dstq+16], m1
- movaps [dstq+32], m2
+ movaps [dstq+16], m4
+ movaps [dstq+32], m6
+ movaps m6, m5
+ shufps m5, m1, 0xe4
+ movlhps m1, m3
+ movhlps m6, m3
+ movaps [dstq+48], m1
+ movaps [dstq+64], m5
+ movaps [dstq+80], m6
%else ; mmx
movq m0, [srcq]
movq m1, [srcq+src1q]
@@ -163,9 +174,9 @@ cglobal float_interleave6_%1, 2,7,4, dst, src, src1, src2, src3, src4, src5
movq [dstq+32], m6
movq [dstq+40], m7
%endif
- add srcq, 8
- add dstq, 48
- sub lend, 2
+ add srcq, mmsize
+ add dstq, mmsize*6
+ sub lend, mmsize/4
jg .loop
%ifidn %1, mmx
emms
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel