Like the FFT patch, this has not been tested on real hardware yet (testing
can be done with "fft-test -d -n 5 -s" and by decoding an MP3 file)...
-Vitor
>From 22253a170b27d8aa79e15553b4a88fd46a1930e9 Mon Sep 17 00:00:00 2001
From: Vitor Sessak <[email protected]>
Date: Sat, 14 May 2011 14:17:15 +0200
Subject: [PATCH 3/3] Add AVX implementation of 32-point DCT
---
libavcodec/mpegaudio.h | 2 +-
libavcodec/x86/dct32_sse.asm | 325 +++++++++++++++++++++++++++---------------
libavcodec/x86/fft.c | 4 +-
libavcodec/x86/fft.h | 1 +
4 files changed, 216 insertions(+), 116 deletions(-)
diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h
index fbfddcc..ccd859e 100644
--- a/libavcodec/mpegaudio.h
+++ b/libavcodec/mpegaudio.h
@@ -145,7 +145,7 @@ typedef struct MPADecodeContext {
uint32_t free_format_next_header;
GetBitContext gb;
GetBitContext in_gb;
- DECLARE_ALIGNED(16, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2];
+ DECLARE_ALIGNED(32, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2];
int synth_buf_offset[MPA_MAX_CHANNELS];
DECLARE_ALIGNED(16, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT];
INTFLOAT mdct_buf[MPA_MAX_CHANNELS][SBLIMIT * 18]; /* previous samples, for layer 3 MDCT */
diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index d94c0e7..e25d1b1 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -26,25 +26,34 @@ SECTION_RODATA
align 32
ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
dd 0.553104, 0.582935, 0.622504, 0.674808
- dd -1.169440, -0.972568, -0.839350, -0.744536
dd -10.190008, -3.407609, -2.057781, -1.484165
+ dd -1.169440, -0.972568, -0.839350, -0.744536
dd 0.502419, 0.522499, 0.566944, 0.646822
dd 0.788155, 1.060678, 1.722447, 5.101149
dd 0.509796, 0.601345, 0.899976, 2.562916
+ dd 0.509796, 0.601345, 0.899976, 2.562916
+ dd 1.000000, 1.000000, 1.306563, 0.541196
dd 1.000000, 1.000000, 1.306563, 0.541196
dd 1.000000, 0.707107, 1.000000, -0.707107
+ dd 1.000000, 0.707107, 1.000000, -0.707107
-ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
+ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
-%macro BUTTERFLY 4
+%macro BUTTERFLY_SSE 4
movaps %4, %1
subps %1, %2
addps %2, %4
mulps %1, %3
%endmacro
-%macro BUTTERFLY0 5
+%macro BUTTERFLY_AVX 4
+ vsubps %4, %1, %2
+ vaddps %2, %2, %1
+ vmulps %1, %4, %3
+%endmacro
+
+%macro BUTTERFLY0_SSE 5
movaps %4, %1
shufps %1, %1, %5
xorps %4, %2
@@ -52,6 +61,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
mulps %1, %3
%endmacro
+%macro BUTTERFLY0_AVX 5
+ vshufps %4, %1, %1, %5
+ vxorps %1, %1, %2
+ vaddps %4, %4, %1
+ vmulps %1, %4, %3
+%endmacro
+
%macro BUTTERFLY2 4
BUTTERFLY0 %1, %2, %3, %4, 0x1b
%endmacro
@@ -60,7 +76,193 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
BUTTERFLY0 %1, %2, %3, %4, 0xb1
%endmacro
+%macro PASS6_AND_PERMUTE 0
+ mov tmpq, [outd+4]
+ movss xmm7, [outd+72]
+ addss xmm7, [outq+76]
+ movss xmm3, [outq+56]
+ addss xmm3, [outq+60]
+ addss xmm4, xmm3
+ movss xmm2, [outq+52]
+ addss xmm2, xmm3
+ movss xmm3, [outq+104]
+ addss xmm3, [outq+108]
+ addss xmm1, xmm3
+ addss xmm5, xmm4
+ movss [outq+16], xmm1
+ movss xmm1, [outq+100]
+ addss xmm1, xmm3
+ movss xmm3, [outq+40]
+ movss [outq+48], xmm1
+ addss xmm3, [outq+44]
+ movss xmm1, [outq+100]
+ addss xmm4, xmm3
+ addss xmm3, xmm2
+ addss xmm1, [outq+108]
+ movss [outq+40], xmm3
+ addss xmm2, [outq+36]
+ movss xmm3, [outq+8]
+ movss [outq+56], xmm2
+ addss xmm3, [outq+12]
+ movss [outq+32], xmm3
+ movss xmm3, [outq+80]
+ movss [outq+8], xmm5
+ movss [outq+80], xmm1
+ movss xmm2, [outq+52]
+ movss xmm5, [outq+120]
+ addss xmm5, [outq+124]
+ movss xmm1, [outq+64]
+ addss xmm2, [outq+60]
+ addss xmm0, xmm5
+ addss xmm5, [outq+116]
+ mov [outq+64], tmpq
+ addss xmm6, xmm0
+ addss xmm1, xmm6
+ mov tmpq, [outq+12]
+ mov [outq+96], tmpq
+ movss [outq+4], xmm1
+ movss xmm1, [outq+24]
+ movss [outq+24], xmm4
+ movss xmm4, [outq+88]
+ addss xmm4, [outq+92]
+ addss xmm3, xmm4
+ addss xmm4, [outq+84]
+ mov tmpq, [outq+108]
+ addss xmm1, [outq+28]
+ addss xmm0, xmm1
+ addss xmm1, xmm5
+ addss xmm6, xmm3
+ addss xmm3, xmm0
+ addss xmm0, xmm7
+ addss xmm5, [outq+20]
+ addss xmm7, xmm1
+ movss [outq+12], xmm6
+ mov [outq+112], tmpq
+ movss xmm6, [outq+28]
+ movss [outq+28], xmm0
+ movss xmm0, [outq+36]
+ movss [outq+36], xmm7
+ addss xmm1, xmm4
+ movss xmm7, [outq+116]
+ addss xmm0, xmm2
+ addss xmm7, [outq+124]
+ movss [outq+72], xmm0
+ movss xmm0, [outq+44]
+ addss xmm2, xmm0
+ movss [outq+44], xmm1
+ movss [outq+88], xmm2
+ addss xmm0, [outq+60]
+ mov tmpq, [outq+60]
+ mov [outq+120], tmpq
+ movss [outq+104], xmm0
+ addss xmm4, xmm5
+ addss xmm5, [outq+68]
+ movss [outq+52], xmm4
+ movss [outq+60], xmm5
+ movss xmm4, [outq+68]
+ movss xmm5, [outq+20]
+ movss [outq+20], xmm3
+ addss xmm5, xmm7
+ addss xmm7, xmm6
+ addss xmm4, xmm5
+ movss xmm2, [outq+84]
+ addss xmm2, [outq+92]
+ addss xmm5, xmm2
+ movss [outq+68], xmm4
+ addss xmm2, xmm7
+ movss xmm4, [outq+76]
+ movss [outq+84], xmm2
+ movss [outq+76], xmm5
+ addss xmm7, xmm4
+ addss xmm6, [outq+124]
+ addss xmm4, xmm6
+ addss xmm6, [outq+92]
+ movss [outq+100], xmm4
+ movss [outq+108], xmm6
+ movss xmm6, [outq+92]
+ movss [outq+92], xmm7
+ addss xmm6, [outq+124]
+ movss [outq+116], xmm6
+%endmacro
+
+%define BUTTERFLY BUTTERFLY_AVX
+%define BUTTERFLY0 BUTTERFLY0_AVX
+
section .text align=16
+cglobal dct32_float_avx, 2,3,8, out, in, tmp
+ ; pass 1
+ vmovaps ymm4, [inq+0]
+ vinsertf128 ymm5, ymm5, [inq+96], 1
+ vinsertf128 ymm5, ymm5, [inq+112], 0
+ vshufps ymm5, ymm5, ymm5, 0x1b
+ BUTTERFLY ymm4, ymm5, [ps_cos_vec], ymm6
+
+ vmovaps ymm2, [inq+64]
+ vinsertf128 ymm6, ymm6, [inq+32], 1
+ vinsertf128 ymm6, ymm6, [inq+48], 0
+ vshufps ymm6, ymm6, ymm6, 0x1b
+ BUTTERFLY ymm2, ymm6, [ps_cos_vec+32], ymm0
+
+ ; pass 2
+
+ BUTTERFLY ymm5, ymm6, [ps_cos_vec+64], ymm0
+ BUTTERFLY ymm4, ymm2, [ps_cos_vec+64], ymm7
+
+
+ ; pass 3
+ vperm2f128 ymm3, ymm6, ymm4, 0x31
+ vperm2f128 ymm1, ymm6, ymm4, 0x20
+ vshufps ymm3, ymm3, ymm3, 0x1b
+
+ BUTTERFLY ymm1, ymm3, [ps_cos_vec+96], ymm6
+
+
+ vperm2f128 ymm4, ymm5, ymm2, 0x20
+ vperm2f128 ymm5, ymm5, ymm2, 0x31
+ vshufps ymm5, ymm5, ymm5, 0x1b
+
+ BUTTERFLY ymm4, ymm5, [ps_cos_vec+96], ymm6
+
+ ; pass 4
+ vmovaps ymm6, [ps_p1p1m1m1+0]
+ vmovaps ymm2, [ps_cos_vec+128]
+
+ BUTTERFLY2 ymm5, ymm6, ymm2, ymm7
+ BUTTERFLY2 ymm4, ymm6, ymm2, ymm7
+ BUTTERFLY2 ymm1, ymm6, ymm2, ymm7
+ BUTTERFLY2 ymm3, ymm6, ymm2, ymm7
+
+
+ ; pass 5
+ vshufps ymm6, ymm6, ymm6, 0xcc
+ vmovaps ymm2, [ps_cos_vec+160]
+
+ BUTTERFLY3 ymm5, ymm6, ymm2, ymm7
+ BUTTERFLY3 ymm4, ymm6, ymm2, ymm7
+ BUTTERFLY3 ymm1, ymm6, ymm2, ymm7
+ BUTTERFLY3 ymm3, ymm6, ymm2, ymm7
+
+ vextractf128 xmm6, ymm3, 1
+ vmovaps [outq], ymm3
+
+ vextractf128 [outq+64], ymm5, 1
+ vextractf128 [outq+32], ymm5, 0
+
+ vextractf128 [outq+80], ymm4, 1
+ vextractf128 [outq+48], ymm4, 0
+
+ vextractf128 xmm0, ymm1, 1
+ vmovaps [outq+96], ymm1
+
+ vzeroupper
+
+ ; pass 6, no SIMD...
+ PASS6_AND_PERMUTE
+ REP_RET
+
+%define BUTTERFLY BUTTERFLY_SSE
+%define BUTTERFLY0 BUTTERFLY0_SSE
+
cglobal dct32_float_sse, 2,3,8, out, in, tmp
; pass 1
@@ -72,7 +274,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps xmm7, [inq+64]
movaps xmm4, [inq+48]
shufps xmm4, xmm4, 0x1b
- BUTTERFLY xmm7, xmm4, [ps_cos_vec+48], xmm3
+ BUTTERFLY xmm7, xmm4, [ps_cos_vec+32], xmm3
; pass 2
@@ -90,7 +292,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps xmm4, [inq+80]
movaps xmm5, [inq+32]
shufps xmm5, xmm5, 0x1b
- BUTTERFLY xmm4, xmm5, [ps_cos_vec+32], xmm3
+ BUTTERFLY xmm4, xmm5, [ps_cos_vec+48], xmm3
; pass 2
BUTTERFLY xmm0, xmm7, xmm2, xmm3
@@ -121,7 +323,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
; pass 4
movaps xmm3, [ps_p1p1m1m1+0]
- movaps xmm2, [ps_cos_vec+112]
+ movaps xmm2, [ps_cos_vec+128]
BUTTERFLY2 xmm5, xmm3, xmm2, xmm1
@@ -146,7 +348,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
BUTTERFLY2 xmm0, xmm3, xmm2, xmm1
; pass 5
- movaps xmm2, [ps_cos_vec+128]
+ movaps xmm2, [ps_cos_vec+160]
shufps xmm3, xmm3, 0xcc
BUTTERFLY3 xmm5, xmm3, xmm2, xmm1
@@ -177,110 +379,5 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps [outq+112], xmm0
; pass 6, no SIMD...
- mov tmpq, [outd+4]
- movss xmm7, [outd+72]
- addss xmm7, [outq+76]
- movss xmm3, [outq+56]
- addss xmm3, [outq+60]
- addss xmm4, xmm3
- movss xmm2, [outq+52]
- addss xmm2, xmm3
- movss xmm3, [outq+104]
- addss xmm3, [outq+108]
- addss xmm1, xmm3
- addss xmm5, xmm4
- movss [outq+16], xmm1
- movss xmm1, [outq+100]
- addss xmm1, xmm3
- movss xmm3, [outq+40]
- movss [outq+48], xmm1
- addss xmm3, [outq+44]
- movss xmm1, [outq+100]
- addss xmm4, xmm3
- addss xmm3, xmm2
- addss xmm1, [outq+108]
- movss [outq+40], xmm3
- addss xmm2, [outq+36]
- movss xmm3, [outq+8]
- movss [outq+56], xmm2
- addss xmm3, [outq+12]
- movss [outq+32], xmm3
- movss xmm3, [outq+80]
- movss [outq+8], xmm5
- movss [outq+80], xmm1
- movss xmm2, [outq+52]
- movss xmm5, [outq+120]
- addss xmm5, [outq+124]
- movss xmm1, [outq+64]
- addss xmm2, [outq+60]
- addss xmm0, xmm5
- addss xmm5, [outq+116]
- mov [outq+64], tmpq
- addss xmm6, xmm0
- addss xmm1, xmm6
- mov tmpq, [outq+12]
- mov [outq+96], tmpq
- movss [outq+4], xmm1
- movss xmm1, [outq+24]
- movss [outq+24], xmm4
- movss xmm4, [outq+88]
- addss xmm4, [outq+92]
- addss xmm3, xmm4
- addss xmm4, [outq+84]
- mov tmpq, [outq+108]
- addss xmm1, [outq+28]
- addss xmm0, xmm1
- addss xmm1, xmm5
- addss xmm6, xmm3
- addss xmm3, xmm0
- addss xmm0, xmm7
- addss xmm5, [outq+20]
- addss xmm7, xmm1
- movss [outq+12], xmm6
- mov [outq+112], tmpq
- movss xmm6, [outq+28]
- movss [outq+28], xmm0
- movss xmm0, [outq+36]
- movss [outq+36], xmm7
- addss xmm1, xmm4
- movss xmm7, [outq+116]
- addss xmm0, xmm2
- addss xmm7, [outq+124]
- movss [outq+72], xmm0
- movss xmm0, [outq+44]
- addss xmm2, xmm0
- movss [outq+44], xmm1
- movss [outq+88], xmm2
- addss xmm0, [outq+60]
- mov tmpq, [outq+60]
- mov [outq+120], tmpq
- movss [outq+104], xmm0
- addss xmm4, xmm5
- addss xmm5, [outq+68]
- movss [outq+52], xmm4
- movss [outq+60], xmm5
- movss xmm4, [outq+68]
- movss xmm5, [outq+20]
- movss [outq+20], xmm3
- addss xmm5, xmm7
- addss xmm7, xmm6
- addss xmm4, xmm5
- movss xmm2, [outq+84]
- addss xmm2, [outq+92]
- addss xmm5, xmm2
- movss [outq+68], xmm4
- addss xmm2, xmm7
- movss xmm4, [outq+76]
- movss [outq+84], xmm2
- movss [outq+76], xmm5
- addss xmm7, xmm4
- addss xmm6, [outq+124]
- addss xmm4, xmm6
- addss xmm6, [outq+92]
- movss [outq+100], xmm4
- movss [outq+108], xmm6
- movss xmm6, [outq+92]
- movss [outq+92], xmm7
- addss xmm6, [outq+124]
- movss [outq+116], xmm6
+ PASS6_AND_PERMUTE
REP_RET
diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index b29412c..8eef421 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -57,7 +57,9 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
av_cold void ff_dct_init_mmx(DCTContext *s)
{
int has_vectors = av_get_cpu_flags();
- if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
+ if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX)
+ s->dct32 = ff_dct32_float_avx;
+ else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
s->dct32 = ff_dct32_float_sse;
}
#endif
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index e6eace2..c714185 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -35,5 +35,6 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
+void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
#endif
--
1.7.4.1
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel