On Fri, 22 Apr 2011, Justin Ruggles wrote:

> Could someone test the patch below with a modern Intel CPU other than Atom?
> I'm getting slower results for the SSE version than the MMX version on
> Athlon64, but SSE is faster on Atom.  I'm guessing it's another Athlon issue,
> but it could be something else...

penryn:
3656 +/- 3  6ch c
1475 +/- 3  6ch mmx
1516 +/- 1  6ch sse
1091 +/- 5  6ch sse patched
 908 +/- 1  2ch c
 468 +/- 2  2ch mmx
 314 +/- 1  2ch sse

conroe:
3221 +/- 6  6ch c
1295 +/- 5  6ch mmx
1672 +/- 3  6ch sse
1010 +/- 5  6ch sse patched
 818 +/- 1  2ch c
 403 +/- 2  2ch mmx
 296 +/- 3  2ch sse

I don't know why conroe got lower totals than penryn, when every single instruction is equal or slower in isolation.

--Loren Merritt
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 9732197..722d56f 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -94,8 +94,15 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2
 ; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
 ;-----------------------------------------------------------------------------
 
+%macro BUTTERFLYPS 3 ; %1 = a, %2 = b, %3 = scratch (register numbers)
+    movaps    m%3, m%1      ; keep a copy of a; the next line overwrites m%1
+    unpcklps  m%1, m%2      ; m%1 = a0 b0 a1 b1 (low halves interleaved)
+    unpckhps  m%3, m%2      ; m%3 = a2 b2 a3 b3 (high halves interleaved)
+    SWAP %2, %3             ; presumably the x86inc rename: high result becomes m%2, old b becomes m%3 -- confirm
+%endmacro
+
 %macro FLOAT_INTERLEAVE6 1
-cglobal float_interleave6_%1, 2,7,4, dst, src, src1, src2, src3, src4, src5
+cglobal float_interleave6_%1, 2,7,7, dst, src, src1, src2, src3, src4, src5
 %ifdef ARCH_X86_64
     %define lend r10d
     mov     lend, r2d
@@ -115,25 +122,29 @@ cglobal float_interleave6_%1, 2,7,4, dst, src, src1, src2, src3, src4, src5
     sub    src5q, srcq
 .loop:
 %ifidn %1, sse
-    movlps    m0, [srcq]
-    movhps    m0, [srcq+src3q]
-    movlps    m1, [srcq+src1q]
-    movhps    m1, [srcq+src4q]
-    movlps    m2, [srcq+src2q]
-    movhps    m2, [srcq+src5q]
-    movlhps   m3, m0
-    movhlps   m3, m0
-    unpcklps  m0, m1
-    unpckhps  m1, m2
-    unpcklps  m2, m3
-    movlhps   m3, m0
-    movhlps   m3, m0
+    movaps    m0, [srcq]
+    movaps    m1, [srcq+src1q]
+    movaps    m2, [srcq+src2q]
+    movaps    m3, [srcq+src3q]
+    movaps    m4, [srcq+src4q]
+    movaps    m5, [srcq+src5q]
+    BUTTERFLYPS 0, 1, 6
+    BUTTERFLYPS 2, 3, 6
+    BUTTERFLYPS 4, 5, 6
+    movaps    m6, m4
+    shufps    m4, m0, 0xe4
     movlhps   m0, m2
-    shufps    m2, m1, 0xee
-    movlhps   m1, m3
+    movhlps   m6, m2
     movaps [dstq   ], m0
-    movaps [dstq+16], m1
-    movaps [dstq+32], m2
+    movaps [dstq+16], m4
+    movaps [dstq+32], m6
+    movaps    m6, m5
+    shufps    m5, m1, 0xe4
+    movlhps   m1, m3
+    movhlps   m6, m3
+    movaps [dstq+48], m1
+    movaps [dstq+64], m5
+    movaps [dstq+80], m6
 %else ; mmx
     movq       m0, [srcq]
     movq       m1, [srcq+src1q]
@@ -163,9 +174,9 @@ cglobal float_interleave6_%1, 2,7,4, dst, src, src1, src2, src3, src4, src5
     movq [dstq+32], m6
     movq [dstq+40], m7
 %endif
-    add      srcq, 8
-    add      dstq, 48
-    sub      lend, 2
+    add      srcq, mmsize
+    add      dstq, mmsize*6
+    sub      lend, mmsize/4
     jg .loop
 %ifidn %1, mmx
     emms
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to