From 748bd158ffd964574f04c9310db0b148abfcf21b Mon Sep 17 00:00:00 2001
From: Vitor Sessak <[email protected]>
Date: Sat, 14 May 2011 14:16:30 +0200
Subject: [PATCH 2/3] dct32: Change pass 6 permutation to allow for AVX implementation
---
libavcodec/x86/dct32_sse.asm | 99 +++++++++++++++++++++---------------------
1 files changed, 49 insertions(+), 50 deletions(-)
diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index 27ea943..d94c0e7 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -154,7 +154,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps xmm1, [outq+16]
BUTTERFLY3 xmm1, xmm3, xmm2, xmm5
- movaps [outq+16], xmm1
+ movaps [outq+96], xmm1
BUTTERFLY3 xmm4, xmm3, xmm2, xmm5
movaps [outq+64], xmm4
@@ -171,47 +171,45 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps [outq+48], xmm4
BUTTERFLY3 xmm6, xmm3, xmm2, xmm7
- movaps [outq+96], xmm6
+ movaps [outq+16], xmm6
BUTTERFLY3 xmm0, xmm3, xmm2, xmm7
movaps [outq+112], xmm0
-
- ; pass no, 6 SIMD...
+ ; pass 6, no SIMD...
+ mov tmpq, [outd+4]
+ movss xmm7, [outd+72]
+ addss xmm7, [outq+76]
movss xmm3, [outq+56]
- mov tmpq, [outd+4]
addss xmm3, [outq+60]
- movss xmm7, [outd+72]
addss xmm4, xmm3
movss xmm2, [outq+52]
addss xmm2, xmm3
- movss xmm3, [outq+24]
- addss xmm3, [outq+28]
- addss xmm7, [outq+76]
+ movss xmm3, [outq+104]
+ addss xmm3, [outq+108]
addss xmm1, xmm3
addss xmm5, xmm4
movss [outq+16], xmm1
- movss xmm1, [outq+20]
+ movss xmm1, [outq+100]
addss xmm1, xmm3
movss xmm3, [outq+40]
movss [outq+48], xmm1
addss xmm3, [outq+44]
- movss xmm1, [outq+20]
+ movss xmm1, [outq+100]
addss xmm4, xmm3
addss xmm3, xmm2
- addss xmm1, [outq+28]
+ addss xmm1, [outq+108]
movss [outq+40], xmm3
addss xmm2, [outq+36]
movss xmm3, [outq+8]
movss [outq+56], xmm2
addss xmm3, [outq+12]
- movss [outq+8], xmm5
movss [outq+32], xmm3
- movss xmm2, [outq+52]
movss xmm3, [outq+80]
- movss xmm5, [outq+120]
+ movss [outq+8], xmm5
movss [outq+80], xmm1
- movss [outq+24], xmm4
+ movss xmm2, [outq+52]
+ movss xmm5, [outq+120]
addss xmm5, [outq+124]
movss xmm1, [outq+64]
addss xmm2, [outq+60]
@@ -221,67 +219,68 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
addss xmm6, xmm0
addss xmm1, xmm6
mov tmpq, [outq+12]
- movss [outq+4], xmm1
- movss xmm1, [outq+88]
mov [outq+96], tmpq
- addss xmm1, [outq+92]
- movss xmm4, [outq+104]
- mov tmpq, [outq+28]
- addss xmm4, [outq+108]
- addss xmm0, xmm4
- addss xmm3, xmm1
- addss xmm1, [outq+84]
- addss xmm4, xmm5
+ movss [outq+4], xmm1
+ movss xmm1, [outq+24]
+ movss [outq+24], xmm4
+ movss xmm4, [outq+88]
+ addss xmm4, [outq+92]
+ addss xmm3, xmm4
+ addss xmm4, [outq+84]
+ mov tmpq, [outq+108]
+ addss xmm1, [outq+28]
+ addss xmm0, xmm1
+ addss xmm1, xmm5
addss xmm6, xmm3
addss xmm3, xmm0
addss xmm0, xmm7
- addss xmm5, [outq+100]
- addss xmm7, xmm4
+ addss xmm5, [outq+20]
+ addss xmm7, xmm1
+ movss [outq+12], xmm6
mov [outq+112], tmpq
+ movss xmm6, [outq+28]
movss [outq+28], xmm0
movss xmm0, [outq+36]
movss [outq+36], xmm7
- addss xmm4, xmm1
+ addss xmm1, xmm4
movss xmm7, [outq+116]
addss xmm0, xmm2
addss xmm7, [outq+124]
movss [outq+72], xmm0
movss xmm0, [outq+44]
- movss [outq+12], xmm6
- movss [outq+20], xmm3
addss xmm2, xmm0
- movss [outq+44], xmm4
+ movss [outq+44], xmm1
movss [outq+88], xmm2
addss xmm0, [outq+60]
mov tmpq, [outq+60]
mov [outq+120], tmpq
movss [outq+104], xmm0
- addss xmm1, xmm5
+ addss xmm4, xmm5
addss xmm5, [outq+68]
- movss [outq+52], xmm1
+ movss [outq+52], xmm4
movss [outq+60], xmm5
- movss xmm1, [outq+68]
- movss xmm5, [outq+100]
+ movss xmm4, [outq+68]
+ movss xmm5, [outq+20]
+ movss [outq+20], xmm3
addss xmm5, xmm7
- addss xmm7, [outq+108]
- addss xmm1, xmm5
+ addss xmm7, xmm6
+ addss xmm4, xmm5
movss xmm2, [outq+84]
addss xmm2, [outq+92]
addss xmm5, xmm2
- movss [outq+68], xmm1
+ movss [outq+68], xmm4
addss xmm2, xmm7
- movss xmm1, [outq+76]
+ movss xmm4, [outq+76]
movss [outq+84], xmm2
movss [outq+76], xmm5
- movss xmm2, [outq+108]
- addss xmm7, xmm1
- addss xmm2, [outq+124]
- addss xmm1, xmm2
- addss xmm2, [outq+92]
- movss [outq+100], xmm1
- movss [outq+108], xmm2
- movss xmm2, [outq+92]
+ addss xmm7, xmm4
+ addss xmm6, [outq+124]
+ addss xmm4, xmm6
+ addss xmm6, [outq+92]
+ movss [outq+100], xmm4
+ movss [outq+108], xmm6
+ movss xmm6, [outq+92]
movss [outq+92], xmm7
- addss xmm2, [outq+124]
- movss [outq+116], xmm2
+ addss xmm6, [outq+124]
+ movss [outq+116], xmm6
REP_RET
--
1.7.4.1
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel