From 78dbd81719fcb063cccfa7fc8551c34620abe1bb Mon Sep 17 00:00:00 2001
From: Vitor Sessak <[email protected]>
Date: Sat, 14 May 2011 14:16:30 +0200
Subject: [PATCH 2/3] dct32: Change pass 6 permutation to allow for AVX implementation

---
 libavcodec/x86/dct32_sse.asm |  102 +++++++++++++++++++++---------------------
 1 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index c565ea5..3c9bfac 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -155,7 +155,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
 
     movaps      xmm1, [outq+16]
     BUTTERFLY3  xmm1, xmm3, xmm2, xmm5
-    movaps      [outq+16], xmm1
+    movaps      [outq+96], xmm1
 
     BUTTERFLY3  xmm4, xmm3, xmm2, xmm5
     movaps      [outq+64], xmm4
@@ -172,117 +172,117 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
     movaps      [outq+48], xmm4
 
     BUTTERFLY3  xmm6, xmm3, xmm2, xmm7
-    movaps      [outq+96], xmm6
+    movaps      [outq+16], xmm6
 
     BUTTERFLY3  xmm0, xmm3, xmm2, xmm7
     movaps      [outq+112], xmm0
 
 
     ;    pass 6, no SIMD...
-    movss       xmm3, [outq+56]
     mov         tmpd, [outd+4]
-    addss       xmm3, [outq+60]
     movss       xmm7, [outd+72]
+    addss       xmm7, [outq+76]
+    movss       xmm3, [outq+56]
+    addss       xmm3, [outq+60]
     addss       xmm4, xmm3
     movss       xmm2, [outq+52]
     addss       xmm2, xmm3
-    movss       xmm3, [outq+24]
-    addss       xmm3, [outq+28]
-    addss       xmm7, [outq+76]
+    movss       xmm3, [outq+104]
+    addss       xmm3, [outq+108]
     addss       xmm1, xmm3
     addss       xmm5, xmm4
     movss [outq+ 16], xmm1
-    movss       xmm1, [outq+20]
+    movss       xmm1, [outq+100]
     addss       xmm1, xmm3
     movss       xmm3, [outq+40]
     movss [outq+ 48], xmm1
     addss       xmm3, [outq+44]
-    movss       xmm1, [outq+20]
+    movss       xmm1, [outq+100]
     addss       xmm4, xmm3
     addss       xmm3, xmm2
-    addss       xmm1, [outq+28]
+    addss       xmm1, [outq+108]
     movss [outq+ 40], xmm3
     addss       xmm2, [outq+36]
     movss       xmm3, [outq+8]
     movss [outq+ 56], xmm2
     addss       xmm3, [outq+12]
-    movss [outq+  8], xmm5
     movss [outq+ 32], xmm3
-    movss       xmm2, [outq+52]
     movss       xmm3, [outq+80]
-    movss       xmm5, [outq+120]
+    movss [outq+  8], xmm5
     movss [outq+ 80], xmm1
-    movss [outq+ 24], xmm4
+    movss       xmm2, [outq+52]
+    movss       xmm5, [outq+120]
     addss       xmm5, [outq+124]
     movss       xmm1, [outq+64]
     addss       xmm2, [outq+60]
     addss       xmm0, xmm5
     addss       xmm5, [outq+116]
-    mov    [outq+64], tmpd
+    mov   [outq+ 64], tmpd
     addss       xmm6, xmm0
     addss       xmm1, xmm6
     mov         tmpd, [outq+12]
-    movss [outq+  4], xmm1
-    movss       xmm1, [outq+88]
     mov   [outq+ 96], tmpd
-    addss       xmm1, [outq+92]
-    movss       xmm4, [outq+104]
-    mov         tmpd, [outq+28]
-    addss       xmm4, [outq+108]
-    addss       xmm0, xmm4
-    addss       xmm3, xmm1
-    addss       xmm1, [outq+84]
-    addss       xmm4, xmm5
+    movss [outq+  4], xmm1
+    movss       xmm1, [outq+24]
+    movss [outq+ 24], xmm4
+    movss       xmm4, [outq+88]
+    addss       xmm4, [outq+92]
+    addss       xmm3, xmm4
+    addss       xmm4, [outq+84]
+    mov         tmpd, [outq+108]
+    addss       xmm1, [outq+28]
+    addss       xmm0, xmm1
+    addss       xmm1, xmm5
     addss       xmm6, xmm3
     addss       xmm3, xmm0
     addss       xmm0, xmm7
-    addss       xmm5, [outq+100]
-    addss       xmm7, xmm4
+    addss       xmm5, [outq+20]
+    addss       xmm7, xmm1
+    movss [outq+ 12], xmm6
     mov   [outq+112], tmpd
+    movss       xmm6, [outq+28]
     movss [outq+ 28], xmm0
     movss       xmm0, [outq+36]
     movss [outq+ 36], xmm7
-    addss       xmm4, xmm1
+    addss       xmm1, xmm4
     movss       xmm7, [outq+116]
     addss       xmm0, xmm2
     addss       xmm7, [outq+124]
     movss [outq+ 72], xmm0
     movss       xmm0, [outq+44]
-    movss [outq+ 12], xmm6
-    movss [outq+ 20], xmm3
     addss       xmm2, xmm0
-    movss [outq+ 44], xmm4
+    movss [outq+ 44], xmm1
     movss [outq+ 88], xmm2
     addss       xmm0, [outq+60]
     mov         tmpd, [outq+60]
     mov   [outq+120], tmpd
     movss [outq+104], xmm0
-    addss       xmm1, xmm5
+    addss       xmm4, xmm5
     addss       xmm5, [outq+68]
-    movss  [outq+52], xmm1
-    movss  [outq+60], xmm5
-    movss       xmm1, [outq+68]
-    movss       xmm5, [outq+100]
+    movss [outq+ 52], xmm4
+    movss [outq+ 60], xmm5
+    movss       xmm4, [outq+68]
+    movss       xmm5, [outq+20]
+    movss [outq+ 20], xmm3
     addss       xmm5, xmm7
-    addss       xmm7, [outq+108]
-    addss       xmm1, xmm5
+    addss       xmm7, xmm6
+    addss       xmm4, xmm5
     movss       xmm2, [outq+84]
     addss       xmm2, [outq+92]
     addss       xmm5, xmm2
-    movss [outq+ 68], xmm1
+    movss  [outq+68], xmm4
     addss       xmm2, xmm7
-    movss       xmm1, [outq+76]
+    movss       xmm4, [outq+76]
     movss [outq+ 84], xmm2
     movss [outq+ 76], xmm5
-    movss       xmm2, [outq+108]
-    addss       xmm7, xmm1
-    addss       xmm2, [outq+124]
-    addss       xmm1, xmm2
-    addss       xmm2, [outq+92]
-    movss [outq+100], xmm1
-    movss [outq+108], xmm2
-    movss       xmm2, [outq+92]
-    movss [outq+ 92], xmm7
-    addss       xmm2, [outq+124]
-    movss [outq+116], xmm2
+    addss       xmm7, xmm4
+    addss       xmm6, [outq+124]
+    addss       xmm4, xmm6
+    addss       xmm6, [outq+92]
+    movss [outq+100], xmm4
+    movss [outq+108], xmm6
+    movss       xmm6, [outq+92]
+    movss  [outq+92], xmm7
+    addss       xmm6, [outq+124]
+    movss [outq+116], xmm6
     RET
-- 
1.7.4.1

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to