On Sun, May 15, 2011 at 4:21 PM, Vitor Sessak <[email protected]> wrote: > > Easy to debug with an actual machine :-). > > Revised patch attached (only the third patch needed changes), all FATE > tests pass. > > Benchmarks: > > SSE: > time: 0.0 us/transform [total time=1.53 s its=33554432] > time: 0.0 us/transform [total time=1.53 s its=33554432] > time: 0.0 us/transform [total time=1.53 s its=33554432] > > AVX: > time: 0.0 us/transform [total time=1.40 s its=33554432] > time: 0.0 us/transform [total time=1.40 s its=33554432] > time: 0.0 us/transform [total time=1.40 s its=33554432] >
nit: can I have START/STOP_TIMER numbers? This is not important. Also, how does the SSE time compare to before this patch? Is there a slowdown? > From de3ad19337966eb80f844236fe1ae5fb55c747da Mon Sep 17 00:00:00 2001 > From: Vitor Sessak <[email protected]> > Date: Sat, 14 May 2011 14:17:15 +0200 > Subject: [PATCH 3/3] dct32: Add AVX implementation of 32-point DCT > --- > libavcodec/mpegaudio.h | 4 +- > libavcodec/x86/dct32_sse.asm | 326 +++++++++++++++++++++++++++--------------- > libavcodec/x86/fft.c | 4 +- > libavcodec/x86/fft.h | 1 + > 4 files changed, 217 insertions(+), 118 deletions(-) > diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h > index f12b897..d247ce8 100644 > --- a/libavcodec/mpegaudio.h > +++ b/libavcodec/mpegaudio.h > @@ -134,9 +134,9 @@ typedef struct MPADecodeContext { > uint32_t free_format_next_header; > GetBitContext gb; > GetBitContext in_gb; > - DECLARE_ALIGNED(16, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2]; > + DECLARE_ALIGNED(32, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2]; > int synth_buf_offset[MPA_MAX_CHANNELS]; > - DECLARE_ALIGNED(16, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT]; > + DECLARE_ALIGNED(32, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT]; > INTFLOAT mdct_buf[MPA_MAX_CHANNELS][SBLIMIT * 18]; /* previous samples, for layer 3 MDCT */ > GranuleDef granules[2][2]; /* Used in Layer 3 */ > #ifdef DEBUG > diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm > index ac25eb3..5a8691c 100644 > --- a/libavcodec/x86/dct32_sse.asm > +++ b/libavcodec/x86/dct32_sse.asm > @@ -26,25 +26,34 @@ SECTION_RODATA 32 > align 32 > ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 > dd 0.553104, 0.582935, 0.622504, 0.674808 > - dd -1.169440, -0.972568, -0.839350, -0.744536 > dd -10.190008, -3.407609, -2.057781, -1.484165 > + dd -1.169440, -0.972568, -0.839350, -0.744536 > dd 0.502419, 0.522499, 0.566944, 0.646822 > dd 0.788155, 1.060678, 1.722447, 5.101149 > dd 0.509796, 0.601345, 0.899976, 2.562916 > + 
dd 0.509796, 0.601345, 0.899976, 2.562916 > + dd 1.000000, 1.000000, 1.306563, 0.541196 > dd 1.000000, 1.000000, 1.306563, 0.541196 > dd 1.000000, 0.707107, 1.000000, -0.707107 > + dd 1.000000, 0.707107, 1.000000, -0.707107 > > > -ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 > +ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 > > -%macro BUTTERFLY 4 > +%macro BUTTERFLY_SSE 4 > movaps %4, %1 > subps %1, %2 > addps %2, %4 > mulps %1, %3 > %endmacro > > -%macro BUTTERFLY0 5 > +%macro BUTTERFLY_AVX 4 > + vsubps %4, %1, %2 > + vaddps %2, %2, %1 > + vmulps %1, %4, %3 > +%endmacro > + > +%macro BUTTERFLY0_SSE 5 > movaps %4, %1 > shufps %1, %1, %5 > xorps %4, %2 > @@ -52,6 +61,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 > mulps %1, %3 > %endmacro > > +%macro BUTTERFLY0_AVX 5 > + vshufps %4, %1, %1, %5 > + vxorps %1, %1, %2 > + vaddps %4, %4, %1 > + vmulps %1, %4, %3 > +%endmacro > + > %macro BUTTERFLY2 4 > BUTTERFLY0 %1, %2, %3, %4, 0x1b > %endmacro > @@ -60,7 +76,193 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 > BUTTERFLY0 %1, %2, %3, %4, 0xb1 > %endmacro > > +%macro PASS6_AND_PERMUTE 0 > + mov tmpd, [outd+4] > + movss xmm7, [outd+72] > + addss xmm7, [outq+76] > + movss xmm3, [outq+56] > + addss xmm3, [outq+60] > + addss xmm4, xmm3 > + movss xmm2, [outq+52] > + addss xmm2, xmm3 > + movss xmm3, [outq+104] > + addss xmm3, [outq+108] > + addss xmm1, xmm3 > + addss xmm5, xmm4 > + movss [outq+16], xmm1 > + movss xmm1, [outq+100] > + addss xmm1, xmm3 > + movss xmm3, [outq+40] > + movss [outq+48], xmm1 > + addss xmm3, [outq+44] > + movss xmm1, [outq+100] > + addss xmm4, xmm3 > + addss xmm3, xmm2 > + addss xmm1, [outq+108] > + movss [outq+40], xmm3 > + addss xmm2, [outq+36] > + movss xmm3, [outq+8] > + movss [outq+56], xmm2 > + addss xmm3, [outq+12] > + movss [outq+32], xmm3 > + movss xmm3, [outq+80] > + movss [outq+8], xmm5 > + movss [outq+80], xmm1 > + movss xmm2, [outq+52] > + movss xmm5, [outq+120] > + addss xmm5, [outq+124] > + movss 
xmm1, [outq+64] > + addss xmm2, [outq+60] > + addss xmm0, xmm5 > + addss xmm5, [outq+116] > + mov [outq+64], tmpd > + addss xmm6, xmm0 > + addss xmm1, xmm6 > + mov tmpd, [outq+12] > + mov [outq+96], tmpd > + movss [outq+4], xmm1 > + movss xmm1, [outq+24] > + movss [outq+24], xmm4 > + movss xmm4, [outq+88] > + addss xmm4, [outq+92] > + addss xmm3, xmm4 > + addss xmm4, [outq+84] > + mov tmpd, [outq+108] > + addss xmm1, [outq+28] > + addss xmm0, xmm1 > + addss xmm1, xmm5 > + addss xmm6, xmm3 > + addss xmm3, xmm0 > + addss xmm0, xmm7 > + addss xmm5, [outq+20] > + addss xmm7, xmm1 > + movss [outq+12], xmm6 > + mov [outq+112], tmpd > + movss xmm6, [outq+28] > + movss [outq+28], xmm0 > + movss xmm0, [outq+36] > + movss [outq+36], xmm7 > + addss xmm1, xmm4 > + movss xmm7, [outq+116] > + addss xmm0, xmm2 > + addss xmm7, [outq+124] > + movss [outq+72], xmm0 > + movss xmm0, [outq+44] > + addss xmm2, xmm0 > + movss [outq+44], xmm1 > + movss [outq+88], xmm2 > + addss xmm0, [outq+60] > + mov tmpd, [outq+60] > + mov [outq+120], tmpd > + movss [outq+104], xmm0 > + addss xmm4, xmm5 > + addss xmm5, [outq+68] > + movss [outq+52], xmm4 > + movss [outq+60], xmm5 > + movss xmm4, [outq+68] > + movss xmm5, [outq+20] > + movss [outq+20], xmm3 > + addss xmm5, xmm7 > + addss xmm7, xmm6 > + addss xmm4, xmm5 > + movss xmm2, [outq+84] > + addss xmm2, [outq+92] > + addss xmm5, xmm2 > + movss [outq+68], xmm4 > + addss xmm2, xmm7 > + movss xmm4, [outq+76] > + movss [outq+84], xmm2 > + movss [outq+76], xmm5 > + addss xmm7, xmm4 > + addss xmm6, [outq+124] > + addss xmm4, xmm6 > + addss xmm6, [outq+92] > + movss [outq+100], xmm4 > + movss [outq+108], xmm6 > + movss xmm6, [outq+92] > + movss [outq+92], xmm7 > + addss xmm6, [outq+124] > + movss [outq+116], xmm6 > +%endmacro Could this be SIMD? It looks horrific, so if the answer is no, I'll go with that. 
> + > +%define BUTTERFLY BUTTERFLY_AVX > +%define BUTTERFLY0 BUTTERFLY0_AVX > + > section .text align=16 > +cglobal dct32_float_avx, 2,3,8, out, in, tmp nit: add a function header. Not a nit: can you get this down to only using 7 registers? A quick glance suggests yes. This is better on Windows. > + ; pass 1 > + vmovaps ymm4, [inq+0] > + vinsertf128 ymm5, ymm5, [inq+96], 1 > + vinsertf128 ymm5, ymm5, [inq+112], 0 > + vshufps ymm5, ymm5, ymm5, 0x1b > + BUTTERFLY ymm4, ymm5, [ps_cos_vec], ymm6 > + > + vmovaps ymm2, [inq+64] > + vinsertf128 ymm6, ymm6, [inq+32], 1 > + vinsertf128 ymm6, ymm6, [inq+48], 0 > + vshufps ymm6, ymm6, ymm6, 0x1b > + BUTTERFLY ymm2, ymm6, [ps_cos_vec+32], ymm0 > + > + ; pass 2 > + > + BUTTERFLY ymm5, ymm6, [ps_cos_vec+64], ymm0 > + BUTTERFLY ymm4, ymm2, [ps_cos_vec+64], ymm7 > + > + > + ; pass 3 > + vperm2f128 ymm3, ymm6, ymm4, 0x31 > + vperm2f128 ymm1, ymm6, ymm4, 0x20 > + vshufps ymm3, ymm3, ymm3, 0x1b > + > + BUTTERFLY ymm1, ymm3, [ps_cos_vec+96], ymm6 > + > + > + vperm2f128 ymm4, ymm5, ymm2, 0x20 > + vperm2f128 ymm5, ymm5, ymm2, 0x31 > + vshufps ymm5, ymm5, ymm5, 0x1b > + > + BUTTERFLY ymm4, ymm5, [ps_cos_vec+96], ymm6 > + > + ; pass 4 > + vmovaps ymm6, [ps_p1p1m1m1+0] > + vmovaps ymm2, [ps_cos_vec+128] > + > + BUTTERFLY2 ymm5, ymm6, ymm2, ymm7 > + BUTTERFLY2 ymm4, ymm6, ymm2, ymm7 > + BUTTERFLY2 ymm1, ymm6, ymm2, ymm7 > + BUTTERFLY2 ymm3, ymm6, ymm2, ymm7 > + > + > + ; pass 5 > + vshufps ymm6, ymm6, ymm6, 0xcc > + vmovaps ymm2, [ps_cos_vec+160] > + > + BUTTERFLY3 ymm5, ymm6, ymm2, ymm7 > + BUTTERFLY3 ymm4, ymm6, ymm2, ymm7 > + BUTTERFLY3 ymm1, ymm6, ymm2, ymm7 > + BUTTERFLY3 ymm3, ymm6, ymm2, ymm7 > + > + vextractf128 xmm6, ymm3, 1 > + vmovaps [outq], ymm3 > + > + vextractf128 [outq+64], ymm5, 1 > + vextractf128 [outq+32], ymm5, 0 > + > + vextractf128 [outq+80], ymm4, 1 > + vextractf128 [outq+48], ymm4, 0 > + > + vextractf128 xmm0, ymm1, 1 > + vmovaps [outq+96], ymm1 > + > + vzeroupper > + > + ; pass 6, no SIMD... 
> + PASS6_AND_PERMUTE > + REP_RET Same as Ronald's earlier comment: just use RET here instead of REP_RET. > + > +%define BUTTERFLY BUTTERFLY_SSE > +%define BUTTERFLY0 BUTTERFLY0_SSE > + > cglobal dct32_float_sse, 2,3,8, out, in, tmp > ; pass 1 > > @@ -72,8 +274,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp > movaps xmm7, [inq+64] > movaps xmm4, [inq+48] > shufps xmm4, xmm4, 0x1b > - BUTTERFLY xmm7, xmm4, [ps_cos_vec+48], xmm3 > - > + BUTTERFLY xmm7, xmm4, [ps_cos_vec+32], xmm3 > > ; pass 2 > movaps xmm2, [ps_cos_vec+64] > @@ -90,7 +291,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp > movaps xmm4, [inq+80] > movaps xmm5, [inq+32] > shufps xmm5, xmm5, 0x1b > - BUTTERFLY xmm4, xmm5, [ps_cos_vec+32], xmm3 > + BUTTERFLY xmm4, xmm5, [ps_cos_vec+48], xmm3 > > ; pass 2 > BUTTERFLY xmm0, xmm7, xmm2, xmm3 > @@ -121,7 +322,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp > > ; pass 4 > movaps xmm3, [ps_p1p1m1m1+0] > - movaps xmm2, [ps_cos_vec+112] > + movaps xmm2, [ps_cos_vec+128] > > BUTTERFLY2 xmm5, xmm3, xmm2, xmm1 > > @@ -146,7 +347,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp > BUTTERFLY2 xmm0, xmm3, xmm2, xmm1 > > ; pass 5 > - movaps xmm2, [ps_cos_vec+128] > + movaps xmm2, [ps_cos_vec+160] > shufps xmm3, xmm3, 0xcc > > BUTTERFLY3 xmm5, xmm3, xmm2, xmm1 > @@ -177,110 +378,5 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp > movaps [outq+112], xmm0 > > ; pass 6, no SIMD... 
> - mov tmpq, [outd+4] > - movss xmm7, [outd+72] > - addss xmm7, [outq+76] > - movss xmm3, [outq+56] > - addss xmm3, [outq+60] > - addss xmm4, xmm3 > - movss xmm2, [outq+52] > - addss xmm2, xmm3 > - movss xmm3, [outq+104] > - addss xmm3, [outq+108] > - addss xmm1, xmm3 > - addss xmm5, xmm4 > - movss [outq+16], xmm1 > - movss xmm1, [outq+100] > - addss xmm1, xmm3 > - movss xmm3, [outq+40] > - movss [outq+48], xmm1 > - addss xmm3, [outq+44] > - movss xmm1, [outq+100] > - addss xmm4, xmm3 > - addss xmm3, xmm2 > - addss xmm1, [outq+108] > - movss [outq+40], xmm3 > - addss xmm2, [outq+36] > - movss xmm3, [outq+8] > - movss [outq+56], xmm2 > - addss xmm3, [outq+12] > - movss [outq+32], xmm3 > - movss xmm3, [outq+80] > - movss [outq+8], xmm5 > - movss [outq+80], xmm1 > - movss xmm2, [outq+52] > - movss xmm5, [outq+120] > - addss xmm5, [outq+124] > - movss xmm1, [outq+64] > - addss xmm2, [outq+60] > - addss xmm0, xmm5 > - addss xmm5, [outq+116] > - mov [outq+64], tmpq > - addss xmm6, xmm0 > - addss xmm1, xmm6 > - mov tmpq, [outq+12] > - mov [outq+96], tmpq > - movss [outq+4], xmm1 > - movss xmm1, [outq+24] > - movss [outq+24], xmm4 > - movss xmm4, [outq+88] > - addss xmm4, [outq+92] > - addss xmm3, xmm4 > - addss xmm4, [outq+84] > - mov tmpq, [outq+108] > - addss xmm1, [outq+28] > - addss xmm0, xmm1 > - addss xmm1, xmm5 > - addss xmm6, xmm3 > - addss xmm3, xmm0 > - addss xmm0, xmm7 > - addss xmm5, [outq+20] > - addss xmm7, xmm1 > - movss [outq+12], xmm6 > - mov [outq+112], tmpq > - movss xmm6, [outq+28] > - movss [outq+28], xmm0 > - movss xmm0, [outq+36] > - movss [outq+36], xmm7 > - addss xmm1, xmm4 > - movss xmm7, [outq+116] > - addss xmm0, xmm2 > - addss xmm7, [outq+124] > - movss [outq+72], xmm0 > - movss xmm0, [outq+44] > - addss xmm2, xmm0 > - movss [outq+44], xmm1 > - movss [outq+88], xmm2 > - addss xmm0, [outq+60] > - mov tmpq, [outq+60] > - mov [outq+120], tmpq > - movss [outq+104], xmm0 > - addss xmm4, xmm5 > - addss xmm5, [outq+68] > - movss [outq+52], xmm4 > - 
movss [outq+60], xmm5 > - movss xmm4, [outq+68] > - movss xmm5, [outq+20] > - movss [outq+20], xmm3 > - addss xmm5, xmm7 > - addss xmm7, xmm6 > - addss xmm4, xmm5 > - movss xmm2, [outq+84] > - addss xmm2, [outq+92] > - addss xmm5, xmm2 > - movss [outq+68], xmm4 > - addss xmm2, xmm7 > - movss xmm4, [outq+76] > - movss [outq+84], xmm2 > - movss [outq+76], xmm5 > - addss xmm7, xmm4 > - addss xmm6, [outq+124] > - addss xmm4, xmm6 > - addss xmm6, [outq+92] > - movss [outq+100], xmm4 > - movss [outq+108], xmm6 > - movss xmm6, [outq+92] > - movss [outq+92], xmm7 > - addss xmm6, [outq+124] > - movss [outq+116], xmm6 > + PASS6_AND_PERMUTE > REP_RET Same as above. > diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c > index b29412c..8eef421 100644 > --- a/libavcodec/x86/fft.c > +++ b/libavcodec/x86/fft.c > @@ -57,7 +57,9 @@ av_cold void ff_fft_init_mmx(FFTContext *s) > av_cold void ff_dct_init_mmx(DCTContext *s) > { > int has_vectors = av_get_cpu_flags(); > - if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) > + if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX) > + s->dct32 = ff_dct32_float_avx; > + else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) > s->dct32 = ff_dct32_float_sse; > } > #endif > diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h > index e6eace2..c714185 100644 > --- a/libavcodec/x86/fft.h > +++ b/libavcodec/x86/fft.h > @@ -35,5 +35,6 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input) > void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); > void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); > void ff_dct32_float_sse(FFTSample *out, const FFTSample *in); > +void ff_dct32_float_avx(FFTSample *out, const FFTSample *in); > > #endif > -- > 1.7.4.1
_______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
