Eliminate a few instructions and simplify the code by using SBUTTERFLY. No effect on speed, though; the MMX version should be memory-bound.

--Loren Merritt
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 73b3763..f5115b3 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -20,6 +20,7 @@
 ;******************************************************************************
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 section .text align=16
 
@@ -152,24 +153,15 @@ cglobal float_interleave6_%1, 2,7,7, dst, src, src1, src2, src3, src4, src5
     movq       m3, [srcq+src3q]
     movq       m4, [srcq+src4q]
     movq       m5, [srcq+src5q]
-
-    movq       m6, m0
-    punpckldq  m6, m1
-    movq       m7, m2
-    punpckldq  m7, m3
-    movq [dstq  ], m6
-    movq [dstq+8], m7
-
-    movq       m6, m4
-    punpckldq  m6, m5
-    punpckhdq  m0, m1
-    movq [dstq+16], m6
-    movq [dstq+24], m0
-
-    punpckhdq  m2, m3
-    punpckhdq  m4, m5
-    movq [dstq+32], m2
-    movq [dstq+40], m4
+    SBUTTERFLY dq, 0, 1, 6
+    SBUTTERFLY dq, 2, 3, 6
+    SBUTTERFLY dq, 4, 5, 6
+    movq [dstq   ], m0
+    movq [dstq+ 8], m2
+    movq [dstq+16], m4
+    movq [dstq+24], m1
+    movq [dstq+32], m3
+    movq [dstq+40], m5
 %endif
     add      srcq, mmsize
     add      dstq, mmsize*6
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to