From 78dbd81719fcb063cccfa7fc8551c34620abe1bb Mon Sep 17 00:00:00 2001
From: Vitor Sessak <[email protected]>
Date: Sat, 14 May 2011 14:16:30 +0200
Subject: [PATCH 2/3] dct32: Change pass 6 permutation to allow for AVX implementation
---
libavcodec/x86/dct32_sse.asm | 102 +++++++++++++++++++++---------------------
1 files changed, 51 insertions(+), 51 deletions(-)
diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index c565ea5..3c9bfac 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -155,7 +155,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps xmm1, [outq+16]
BUTTERFLY3 xmm1, xmm3, xmm2, xmm5
- movaps [outq+16], xmm1
+ movaps [outq+96], xmm1
BUTTERFLY3 xmm4, xmm3, xmm2, xmm5
movaps [outq+64], xmm4
@@ -172,117 +172,117 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps [outq+48], xmm4
BUTTERFLY3 xmm6, xmm3, xmm2, xmm7
- movaps [outq+96], xmm6
+ movaps [outq+16], xmm6
BUTTERFLY3 xmm0, xmm3, xmm2, xmm7
movaps [outq+112], xmm0
; pass 6, no SIMD...
- movss xmm3, [outq+56]
mov tmpd, [outd+4]
- addss xmm3, [outq+60]
movss xmm7, [outd+72]
+ addss xmm7, [outq+76]
+ movss xmm3, [outq+56]
+ addss xmm3, [outq+60]
addss xmm4, xmm3
movss xmm2, [outq+52]
addss xmm2, xmm3
- movss xmm3, [outq+24]
- addss xmm3, [outq+28]
- addss xmm7, [outq+76]
+ movss xmm3, [outq+104]
+ addss xmm3, [outq+108]
addss xmm1, xmm3
addss xmm5, xmm4
movss [outq+ 16], xmm1
- movss xmm1, [outq+20]
+ movss xmm1, [outq+100]
addss xmm1, xmm3
movss xmm3, [outq+40]
movss [outq+ 48], xmm1
addss xmm3, [outq+44]
- movss xmm1, [outq+20]
+ movss xmm1, [outq+100]
addss xmm4, xmm3
addss xmm3, xmm2
- addss xmm1, [outq+28]
+ addss xmm1, [outq+108]
movss [outq+ 40], xmm3
addss xmm2, [outq+36]
movss xmm3, [outq+8]
movss [outq+ 56], xmm2
addss xmm3, [outq+12]
- movss [outq+ 8], xmm5
movss [outq+ 32], xmm3
- movss xmm2, [outq+52]
movss xmm3, [outq+80]
- movss xmm5, [outq+120]
+ movss [outq+ 8], xmm5
movss [outq+ 80], xmm1
- movss [outq+ 24], xmm4
+ movss xmm2, [outq+52]
+ movss xmm5, [outq+120]
addss xmm5, [outq+124]
movss xmm1, [outq+64]
addss xmm2, [outq+60]
addss xmm0, xmm5
addss xmm5, [outq+116]
- mov [outq+64], tmpd
+ mov [outq+ 64], tmpd
addss xmm6, xmm0
addss xmm1, xmm6
mov tmpd, [outq+12]
- movss [outq+ 4], xmm1
- movss xmm1, [outq+88]
mov [outq+ 96], tmpd
- addss xmm1, [outq+92]
- movss xmm4, [outq+104]
- mov tmpd, [outq+28]
- addss xmm4, [outq+108]
- addss xmm0, xmm4
- addss xmm3, xmm1
- addss xmm1, [outq+84]
- addss xmm4, xmm5
+ movss [outq+ 4], xmm1
+ movss xmm1, [outq+24]
+ movss [outq+ 24], xmm4
+ movss xmm4, [outq+88]
+ addss xmm4, [outq+92]
+ addss xmm3, xmm4
+ addss xmm4, [outq+84]
+ mov tmpd, [outq+108]
+ addss xmm1, [outq+28]
+ addss xmm0, xmm1
+ addss xmm1, xmm5
addss xmm6, xmm3
addss xmm3, xmm0
addss xmm0, xmm7
- addss xmm5, [outq+100]
- addss xmm7, xmm4
+ addss xmm5, [outq+20]
+ addss xmm7, xmm1
+ movss [outq+ 12], xmm6
mov [outq+112], tmpd
+ movss xmm6, [outq+28]
movss [outq+ 28], xmm0
movss xmm0, [outq+36]
movss [outq+ 36], xmm7
- addss xmm4, xmm1
+ addss xmm1, xmm4
movss xmm7, [outq+116]
addss xmm0, xmm2
addss xmm7, [outq+124]
movss [outq+ 72], xmm0
movss xmm0, [outq+44]
- movss [outq+ 12], xmm6
- movss [outq+ 20], xmm3
addss xmm2, xmm0
- movss [outq+ 44], xmm4
+ movss [outq+ 44], xmm1
movss [outq+ 88], xmm2
addss xmm0, [outq+60]
mov tmpd, [outq+60]
mov [outq+120], tmpd
movss [outq+104], xmm0
- addss xmm1, xmm5
+ addss xmm4, xmm5
addss xmm5, [outq+68]
- movss [outq+52], xmm1
- movss [outq+60], xmm5
- movss xmm1, [outq+68]
- movss xmm5, [outq+100]
+ movss [outq+ 52], xmm4
+ movss [outq+ 60], xmm5
+ movss xmm4, [outq+68]
+ movss xmm5, [outq+20]
+ movss [outq+ 20], xmm3
addss xmm5, xmm7
- addss xmm7, [outq+108]
- addss xmm1, xmm5
+ addss xmm7, xmm6
+ addss xmm4, xmm5
movss xmm2, [outq+84]
addss xmm2, [outq+92]
addss xmm5, xmm2
- movss [outq+ 68], xmm1
+ movss [outq+68], xmm4
addss xmm2, xmm7
- movss xmm1, [outq+76]
+ movss xmm4, [outq+76]
movss [outq+ 84], xmm2
movss [outq+ 76], xmm5
- movss xmm2, [outq+108]
- addss xmm7, xmm1
- addss xmm2, [outq+124]
- addss xmm1, xmm2
- addss xmm2, [outq+92]
- movss [outq+100], xmm1
- movss [outq+108], xmm2
- movss xmm2, [outq+92]
- movss [outq+ 92], xmm7
- addss xmm2, [outq+124]
- movss [outq+116], xmm2
+ addss xmm7, xmm4
+ addss xmm6, [outq+124]
+ addss xmm4, xmm6
+ addss xmm6, [outq+92]
+ movss [outq+100], xmm4
+ movss [outq+108], xmm6
+ movss xmm6, [outq+92]
+ movss [outq+92], xmm7
+ addss xmm6, [outq+124]
+ movss [outq+116], xmm6
RET
--
1.7.4.1
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel