From 8c83cf63e7cfd0221ef989892228572443acebcf Mon Sep 17 00:00:00 2001
From: Vitor Sessak <[email protected]>
Date: Sat, 14 May 2011 14:16:30 +0200
Subject: [PATCH 2/3] dct32: Change pass 6 permutation to allow for AVX implementation

---
 libavcodec/x86/dct32_sse.asm |  103 +++++++++++++++++++++---------------------
 1 files changed, 51 insertions(+), 52 deletions(-)

diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index 4cf71f5..ac25eb3 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -154,7 +154,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
 
     movaps      xmm1, [outq+16]
     BUTTERFLY3  xmm1, xmm3, xmm2, xmm5
-    movaps      [outq+16], xmm1
+    movaps      [outq+96], xmm1
 
     BUTTERFLY3  xmm4, xmm3, xmm2, xmm5
     movaps      [outq+64], xmm4
@@ -171,47 +171,45 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
     movaps      [outq+48], xmm4
 
     BUTTERFLY3  xmm6, xmm3, xmm2, xmm7
-    movaps      [outq+96], xmm6
+    movaps      [outq+16], xmm6
 
     BUTTERFLY3  xmm0, xmm3, xmm2, xmm7
     movaps      [outq+112], xmm0
 
-
-    ;    pass no, 6 SIMD...
-    movss xmm3, [outq+56]
+    ;    pass 6, no SIMD...
     mov   tmpq, [outd+4]
-    addss xmm3, [outq+60]
     movss xmm7, [outd+72]
+    addss xmm7, [outq+76]
+    movss xmm3, [outq+56]
+    addss xmm3, [outq+60]
     addss xmm4, xmm3
     movss xmm2, [outq+52]
     addss xmm2, xmm3
-    movss xmm3, [outq+24]
-    addss xmm3, [outq+28]
-    addss xmm7, [outq+76]
+    movss xmm3, [outq+104]
+    addss xmm3, [outq+108]
     addss xmm1, xmm3
     addss xmm5, xmm4
     movss [outq+16], xmm1
-    movss xmm1, [outq+20]
+    movss xmm1, [outq+100]
     addss xmm1, xmm3
     movss xmm3, [outq+40]
     movss [outq+48], xmm1
     addss xmm3, [outq+44]
-    movss xmm1, [outq+20]
+    movss xmm1, [outq+100]
     addss xmm4, xmm3
     addss xmm3, xmm2
-    addss xmm1, [outq+28]
+    addss xmm1, [outq+108]
     movss [outq+40], xmm3
     addss xmm2, [outq+36]
     movss xmm3, [outq+8]
     movss [outq+56], xmm2
     addss xmm3, [outq+12]
-    movss [outq+8], xmm5
     movss [outq+32], xmm3
-    movss xmm2, [outq+52]
     movss xmm3, [outq+80]
-    movss xmm5, [outq+120]
+    movss [outq+8], xmm5
     movss [outq+80], xmm1
-    movss [outq+24], xmm4
+    movss xmm2, [outq+52]
+    movss xmm5, [outq+120]
     addss xmm5, [outq+124]
     movss xmm1, [outq+64]
     addss xmm2, [outq+60]
@@ -220,68 +218,69 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
     mov  [outq+64], tmpq
     addss xmm6, xmm0
     addss xmm1, xmm6
-    mov   tmpq, [outq+12]
-    movss [outq+4], xmm1
-    movss xmm1, [outq+88]
+    mov  tmpq, [outq+12]
     mov  [outq+96], tmpq
-    addss xmm1, [outq+92]
-    movss xmm4, [outq+104]
-    mov   tmpq, [outq+28]
-    addss xmm4, [outq+108]
-    addss xmm0, xmm4
-    addss xmm3, xmm1
-    addss xmm1, [outq+84]
-    addss xmm4, xmm5
+    movss [outq+4], xmm1
+    movss xmm1, [outq+24]
+    movss [outq+24], xmm4
+    movss xmm4, [outq+88]
+    addss xmm4, [outq+92]
+    addss xmm3, xmm4
+    addss xmm4, [outq+84]
+    mov  tmpq, [outq+108]
+    addss xmm1, [outq+28]
+    addss xmm0, xmm1
+    addss xmm1, xmm5
     addss xmm6, xmm3
     addss xmm3, xmm0
     addss xmm0, xmm7
-    addss xmm5, [outq+100]
-    addss xmm7, xmm4
-    mov   [outq+112], tmpq
+    addss xmm5, [outq+20]
+    addss xmm7, xmm1
+    movss [outq+12], xmm6
+    mov  [outq+112], tmpq
+    movss xmm6, [outq+28]
     movss [outq+28], xmm0
     movss xmm0, [outq+36]
     movss [outq+36], xmm7
-    addss xmm4, xmm1
+    addss xmm1, xmm4
     movss xmm7, [outq+116]
     addss xmm0, xmm2
     addss xmm7, [outq+124]
     movss [outq+72], xmm0
     movss xmm0, [outq+44]
-    movss [outq+12], xmm6
-    movss [outq+20], xmm3
     addss xmm2, xmm0
-    movss [outq+44], xmm4
+    movss [outq+44], xmm1
     movss [outq+88], xmm2
     addss xmm0, [outq+60]
     mov   tmpq, [outq+60]
     mov   [outq+120], tmpq
     movss [outq+104], xmm0
-    addss xmm1, xmm5
+    addss xmm4, xmm5
     addss xmm5, [outq+68]
-    movss [outq+52], xmm1
+    movss [outq+52], xmm4
     movss [outq+60], xmm5
-    movss xmm1, [outq+68]
-    movss xmm5, [outq+100]
+    movss xmm4, [outq+68]
+    movss xmm5, [outq+20]
+    movss [outq+20], xmm3
     addss xmm5, xmm7
-    addss xmm7, [outq+108]
-    addss xmm1, xmm5
+    addss xmm7, xmm6
+    addss xmm4, xmm5
     movss xmm2, [outq+84]
     addss xmm2, [outq+92]
     addss xmm5, xmm2
-    movss [outq+68], xmm1
+    movss [outq+68], xmm4
     addss xmm2, xmm7
-    movss xmm1, [outq+76]
+    movss xmm4, [outq+76]
     movss [outq+84], xmm2
     movss [outq+76], xmm5
-    movss xmm2, [outq+108]
-    addss xmm7, xmm1
-    addss xmm2, [outq+124]
-    addss xmm1, xmm2
-    addss xmm2, [outq+92]
-    movss [outq+100], xmm1
-    movss [outq+108], xmm2
-    movss xmm2, [outq+92]
+    addss xmm7, xmm4
+    addss xmm6, [outq+124]
+    addss xmm4, xmm6
+    addss xmm6, [outq+92]
+    movss [outq+100], xmm4
+    movss [outq+108], xmm6
+    movss xmm6, [outq+92]
     movss [outq+92], xmm7
-    addss xmm2, [outq+124]
-    movss [outq+116], xmm2
+    addss xmm6, [outq+124]
+    movss [outq+116], xmm6
     REP_RET
-- 
1.7.4.1

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to