On 05/15/2011 06:32 PM, Vitor Sessak wrote:
On 05/15/2011 06:03 PM, Kieran Kunhya wrote:
If you want access to a Sandy Bridge box, there is one available
that is being used for x264 AVX development right now.
Thanks, that will probably be very useful. I'll send you an SSH key in private.
It was easy to debug with an actual machine :-).
Revised patch attached (only the third patch needed changes), all FATE
tests pass.
Benchmarks:
SSE:
time: 0.0 us/transform [total time=1.53 s its=33554432]
time: 0.0 us/transform [total time=1.53 s its=33554432]
time: 0.0 us/transform [total time=1.53 s its=33554432]
AVX:
time: 0.0 us/transform [total time=1.40 s its=33554432]
time: 0.0 us/transform [total time=1.40 s its=33554432]
time: 0.0 us/transform [total time=1.40 s its=33554432]
-Vitor
From de3ad19337966eb80f844236fe1ae5fb55c747da Mon Sep 17 00:00:00 2001
From: Vitor Sessak <[email protected]>
Date: Sat, 14 May 2011 14:17:15 +0200
Subject: [PATCH 3/3] dct32: Add AVX implementation of 32-point DCT
---
libavcodec/mpegaudio.h | 4 +-
libavcodec/x86/dct32_sse.asm | 326 +++++++++++++++++++++++++++---------------
libavcodec/x86/fft.c | 4 +-
libavcodec/x86/fft.h | 1 +
4 files changed, 217 insertions(+), 118 deletions(-)
diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h
index f12b897..d247ce8 100644
--- a/libavcodec/mpegaudio.h
+++ b/libavcodec/mpegaudio.h
@@ -134,9 +134,9 @@ typedef struct MPADecodeContext {
uint32_t free_format_next_header;
GetBitContext gb;
GetBitContext in_gb;
- DECLARE_ALIGNED(16, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2];
+ DECLARE_ALIGNED(32, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2];
int synth_buf_offset[MPA_MAX_CHANNELS];
- DECLARE_ALIGNED(16, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT];
+ DECLARE_ALIGNED(32, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT];
INTFLOAT mdct_buf[MPA_MAX_CHANNELS][SBLIMIT * 18]; /* previous samples, for layer 3 MDCT */
GranuleDef granules[2][2]; /* Used in Layer 3 */
#ifdef DEBUG
diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index ac25eb3..5a8691c 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -26,25 +26,34 @@ SECTION_RODATA 32
align 32
ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
dd 0.553104, 0.582935, 0.622504, 0.674808
- dd -1.169440, -0.972568, -0.839350, -0.744536
dd -10.190008, -3.407609, -2.057781, -1.484165
+ dd -1.169440, -0.972568, -0.839350, -0.744536
dd 0.502419, 0.522499, 0.566944, 0.646822
dd 0.788155, 1.060678, 1.722447, 5.101149
dd 0.509796, 0.601345, 0.899976, 2.562916
+ dd 0.509796, 0.601345, 0.899976, 2.562916
+ dd 1.000000, 1.000000, 1.306563, 0.541196
dd 1.000000, 1.000000, 1.306563, 0.541196
dd 1.000000, 0.707107, 1.000000, -0.707107
+ dd 1.000000, 0.707107, 1.000000, -0.707107
-ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
+ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
-%macro BUTTERFLY 4
+%macro BUTTERFLY_SSE 4
movaps %4, %1
subps %1, %2
addps %2, %4
mulps %1, %3
%endmacro
-%macro BUTTERFLY0 5
+%macro BUTTERFLY_AVX 4
+ vsubps %4, %1, %2
+ vaddps %2, %2, %1
+ vmulps %1, %4, %3
+%endmacro
+
+%macro BUTTERFLY0_SSE 5
movaps %4, %1
shufps %1, %1, %5
xorps %4, %2
@@ -52,6 +61,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
mulps %1, %3
%endmacro
+%macro BUTTERFLY0_AVX 5
+ vshufps %4, %1, %1, %5
+ vxorps %1, %1, %2
+ vaddps %4, %4, %1
+ vmulps %1, %4, %3
+%endmacro
+
%macro BUTTERFLY2 4
BUTTERFLY0 %1, %2, %3, %4, 0x1b
%endmacro
@@ -60,7 +76,193 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
BUTTERFLY0 %1, %2, %3, %4, 0xb1
%endmacro
+%macro PASS6_AND_PERMUTE 0
+ mov tmpd, [outd+4]
+ movss xmm7, [outd+72]
+ addss xmm7, [outq+76]
+ movss xmm3, [outq+56]
+ addss xmm3, [outq+60]
+ addss xmm4, xmm3
+ movss xmm2, [outq+52]
+ addss xmm2, xmm3
+ movss xmm3, [outq+104]
+ addss xmm3, [outq+108]
+ addss xmm1, xmm3
+ addss xmm5, xmm4
+ movss [outq+16], xmm1
+ movss xmm1, [outq+100]
+ addss xmm1, xmm3
+ movss xmm3, [outq+40]
+ movss [outq+48], xmm1
+ addss xmm3, [outq+44]
+ movss xmm1, [outq+100]
+ addss xmm4, xmm3
+ addss xmm3, xmm2
+ addss xmm1, [outq+108]
+ movss [outq+40], xmm3
+ addss xmm2, [outq+36]
+ movss xmm3, [outq+8]
+ movss [outq+56], xmm2
+ addss xmm3, [outq+12]
+ movss [outq+32], xmm3
+ movss xmm3, [outq+80]
+ movss [outq+8], xmm5
+ movss [outq+80], xmm1
+ movss xmm2, [outq+52]
+ movss xmm5, [outq+120]
+ addss xmm5, [outq+124]
+ movss xmm1, [outq+64]
+ addss xmm2, [outq+60]
+ addss xmm0, xmm5
+ addss xmm5, [outq+116]
+ mov [outq+64], tmpd
+ addss xmm6, xmm0
+ addss xmm1, xmm6
+ mov tmpd, [outq+12]
+ mov [outq+96], tmpd
+ movss [outq+4], xmm1
+ movss xmm1, [outq+24]
+ movss [outq+24], xmm4
+ movss xmm4, [outq+88]
+ addss xmm4, [outq+92]
+ addss xmm3, xmm4
+ addss xmm4, [outq+84]
+ mov tmpd, [outq+108]
+ addss xmm1, [outq+28]
+ addss xmm0, xmm1
+ addss xmm1, xmm5
+ addss xmm6, xmm3
+ addss xmm3, xmm0
+ addss xmm0, xmm7
+ addss xmm5, [outq+20]
+ addss xmm7, xmm1
+ movss [outq+12], xmm6
+ mov [outq+112], tmpd
+ movss xmm6, [outq+28]
+ movss [outq+28], xmm0
+ movss xmm0, [outq+36]
+ movss [outq+36], xmm7
+ addss xmm1, xmm4
+ movss xmm7, [outq+116]
+ addss xmm0, xmm2
+ addss xmm7, [outq+124]
+ movss [outq+72], xmm0
+ movss xmm0, [outq+44]
+ addss xmm2, xmm0
+ movss [outq+44], xmm1
+ movss [outq+88], xmm2
+ addss xmm0, [outq+60]
+ mov tmpd, [outq+60]
+ mov [outq+120], tmpd
+ movss [outq+104], xmm0
+ addss xmm4, xmm5
+ addss xmm5, [outq+68]
+ movss [outq+52], xmm4
+ movss [outq+60], xmm5
+ movss xmm4, [outq+68]
+ movss xmm5, [outq+20]
+ movss [outq+20], xmm3
+ addss xmm5, xmm7
+ addss xmm7, xmm6
+ addss xmm4, xmm5
+ movss xmm2, [outq+84]
+ addss xmm2, [outq+92]
+ addss xmm5, xmm2
+ movss [outq+68], xmm4
+ addss xmm2, xmm7
+ movss xmm4, [outq+76]
+ movss [outq+84], xmm2
+ movss [outq+76], xmm5
+ addss xmm7, xmm4
+ addss xmm6, [outq+124]
+ addss xmm4, xmm6
+ addss xmm6, [outq+92]
+ movss [outq+100], xmm4
+ movss [outq+108], xmm6
+ movss xmm6, [outq+92]
+ movss [outq+92], xmm7
+ addss xmm6, [outq+124]
+ movss [outq+116], xmm6
+%endmacro
+
+%define BUTTERFLY BUTTERFLY_AVX
+%define BUTTERFLY0 BUTTERFLY0_AVX
+
section .text align=16
+cglobal dct32_float_avx, 2,3,8, out, in, tmp
+ ; pass 1
+ vmovaps ymm4, [inq+0]
+ vinsertf128 ymm5, ymm5, [inq+96], 1
+ vinsertf128 ymm5, ymm5, [inq+112], 0
+ vshufps ymm5, ymm5, ymm5, 0x1b
+ BUTTERFLY ymm4, ymm5, [ps_cos_vec], ymm6
+
+ vmovaps ymm2, [inq+64]
+ vinsertf128 ymm6, ymm6, [inq+32], 1
+ vinsertf128 ymm6, ymm6, [inq+48], 0
+ vshufps ymm6, ymm6, ymm6, 0x1b
+ BUTTERFLY ymm2, ymm6, [ps_cos_vec+32], ymm0
+
+ ; pass 2
+
+ BUTTERFLY ymm5, ymm6, [ps_cos_vec+64], ymm0
+ BUTTERFLY ymm4, ymm2, [ps_cos_vec+64], ymm7
+
+
+ ; pass 3
+ vperm2f128 ymm3, ymm6, ymm4, 0x31
+ vperm2f128 ymm1, ymm6, ymm4, 0x20
+ vshufps ymm3, ymm3, ymm3, 0x1b
+
+ BUTTERFLY ymm1, ymm3, [ps_cos_vec+96], ymm6
+
+
+ vperm2f128 ymm4, ymm5, ymm2, 0x20
+ vperm2f128 ymm5, ymm5, ymm2, 0x31
+ vshufps ymm5, ymm5, ymm5, 0x1b
+
+ BUTTERFLY ymm4, ymm5, [ps_cos_vec+96], ymm6
+
+ ; pass 4
+ vmovaps ymm6, [ps_p1p1m1m1+0]
+ vmovaps ymm2, [ps_cos_vec+128]
+
+ BUTTERFLY2 ymm5, ymm6, ymm2, ymm7
+ BUTTERFLY2 ymm4, ymm6, ymm2, ymm7
+ BUTTERFLY2 ymm1, ymm6, ymm2, ymm7
+ BUTTERFLY2 ymm3, ymm6, ymm2, ymm7
+
+
+ ; pass 5
+ vshufps ymm6, ymm6, ymm6, 0xcc
+ vmovaps ymm2, [ps_cos_vec+160]
+
+ BUTTERFLY3 ymm5, ymm6, ymm2, ymm7
+ BUTTERFLY3 ymm4, ymm6, ymm2, ymm7
+ BUTTERFLY3 ymm1, ymm6, ymm2, ymm7
+ BUTTERFLY3 ymm3, ymm6, ymm2, ymm7
+
+ vextractf128 xmm6, ymm3, 1
+ vmovaps [outq], ymm3
+
+ vextractf128 [outq+64], ymm5, 1
+ vextractf128 [outq+32], ymm5, 0
+
+ vextractf128 [outq+80], ymm4, 1
+ vextractf128 [outq+48], ymm4, 0
+
+ vextractf128 xmm0, ymm1, 1
+ vmovaps [outq+96], ymm1
+
+ vzeroupper
+
+ ; pass 6, no SIMD...
+ PASS6_AND_PERMUTE
+ REP_RET
+
+%define BUTTERFLY BUTTERFLY_SSE
+%define BUTTERFLY0 BUTTERFLY0_SSE
+
cglobal dct32_float_sse, 2,3,8, out, in, tmp
; pass 1
@@ -72,8 +274,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps xmm7, [inq+64]
movaps xmm4, [inq+48]
shufps xmm4, xmm4, 0x1b
- BUTTERFLY xmm7, xmm4, [ps_cos_vec+48], xmm3
-
+ BUTTERFLY xmm7, xmm4, [ps_cos_vec+32], xmm3
; pass 2
movaps xmm2, [ps_cos_vec+64]
@@ -90,7 +291,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps xmm4, [inq+80]
movaps xmm5, [inq+32]
shufps xmm5, xmm5, 0x1b
- BUTTERFLY xmm4, xmm5, [ps_cos_vec+32], xmm3
+ BUTTERFLY xmm4, xmm5, [ps_cos_vec+48], xmm3
; pass 2
BUTTERFLY xmm0, xmm7, xmm2, xmm3
@@ -121,7 +322,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
; pass 4
movaps xmm3, [ps_p1p1m1m1+0]
- movaps xmm2, [ps_cos_vec+112]
+ movaps xmm2, [ps_cos_vec+128]
BUTTERFLY2 xmm5, xmm3, xmm2, xmm1
@@ -146,7 +347,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
BUTTERFLY2 xmm0, xmm3, xmm2, xmm1
; pass 5
- movaps xmm2, [ps_cos_vec+128]
+ movaps xmm2, [ps_cos_vec+160]
shufps xmm3, xmm3, 0xcc
BUTTERFLY3 xmm5, xmm3, xmm2, xmm1
@@ -177,110 +378,5 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps [outq+112], xmm0
; pass 6, no SIMD...
- mov tmpq, [outd+4]
- movss xmm7, [outd+72]
- addss xmm7, [outq+76]
- movss xmm3, [outq+56]
- addss xmm3, [outq+60]
- addss xmm4, xmm3
- movss xmm2, [outq+52]
- addss xmm2, xmm3
- movss xmm3, [outq+104]
- addss xmm3, [outq+108]
- addss xmm1, xmm3
- addss xmm5, xmm4
- movss [outq+16], xmm1
- movss xmm1, [outq+100]
- addss xmm1, xmm3
- movss xmm3, [outq+40]
- movss [outq+48], xmm1
- addss xmm3, [outq+44]
- movss xmm1, [outq+100]
- addss xmm4, xmm3
- addss xmm3, xmm2
- addss xmm1, [outq+108]
- movss [outq+40], xmm3
- addss xmm2, [outq+36]
- movss xmm3, [outq+8]
- movss [outq+56], xmm2
- addss xmm3, [outq+12]
- movss [outq+32], xmm3
- movss xmm3, [outq+80]
- movss [outq+8], xmm5
- movss [outq+80], xmm1
- movss xmm2, [outq+52]
- movss xmm5, [outq+120]
- addss xmm5, [outq+124]
- movss xmm1, [outq+64]
- addss xmm2, [outq+60]
- addss xmm0, xmm5
- addss xmm5, [outq+116]
- mov [outq+64], tmpq
- addss xmm6, xmm0
- addss xmm1, xmm6
- mov tmpq, [outq+12]
- mov [outq+96], tmpq
- movss [outq+4], xmm1
- movss xmm1, [outq+24]
- movss [outq+24], xmm4
- movss xmm4, [outq+88]
- addss xmm4, [outq+92]
- addss xmm3, xmm4
- addss xmm4, [outq+84]
- mov tmpq, [outq+108]
- addss xmm1, [outq+28]
- addss xmm0, xmm1
- addss xmm1, xmm5
- addss xmm6, xmm3
- addss xmm3, xmm0
- addss xmm0, xmm7
- addss xmm5, [outq+20]
- addss xmm7, xmm1
- movss [outq+12], xmm6
- mov [outq+112], tmpq
- movss xmm6, [outq+28]
- movss [outq+28], xmm0
- movss xmm0, [outq+36]
- movss [outq+36], xmm7
- addss xmm1, xmm4
- movss xmm7, [outq+116]
- addss xmm0, xmm2
- addss xmm7, [outq+124]
- movss [outq+72], xmm0
- movss xmm0, [outq+44]
- addss xmm2, xmm0
- movss [outq+44], xmm1
- movss [outq+88], xmm2
- addss xmm0, [outq+60]
- mov tmpq, [outq+60]
- mov [outq+120], tmpq
- movss [outq+104], xmm0
- addss xmm4, xmm5
- addss xmm5, [outq+68]
- movss [outq+52], xmm4
- movss [outq+60], xmm5
- movss xmm4, [outq+68]
- movss xmm5, [outq+20]
- movss [outq+20], xmm3
- addss xmm5, xmm7
- addss xmm7, xmm6
- addss xmm4, xmm5
- movss xmm2, [outq+84]
- addss xmm2, [outq+92]
- addss xmm5, xmm2
- movss [outq+68], xmm4
- addss xmm2, xmm7
- movss xmm4, [outq+76]
- movss [outq+84], xmm2
- movss [outq+76], xmm5
- addss xmm7, xmm4
- addss xmm6, [outq+124]
- addss xmm4, xmm6
- addss xmm6, [outq+92]
- movss [outq+100], xmm4
- movss [outq+108], xmm6
- movss xmm6, [outq+92]
- movss [outq+92], xmm7
- addss xmm6, [outq+124]
- movss [outq+116], xmm6
+ PASS6_AND_PERMUTE
REP_RET
diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index b29412c..8eef421 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -57,7 +57,9 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
av_cold void ff_dct_init_mmx(DCTContext *s)
{
int has_vectors = av_get_cpu_flags();
- if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
+ if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX)
+ s->dct32 = ff_dct32_float_avx;
+ else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
s->dct32 = ff_dct32_float_sse;
}
#endif
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index e6eace2..c714185 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -35,5 +35,6 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
+void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
#endif
--
1.7.4.1
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel