On Sun, May 15, 2011 at 6:49 PM, Daniel Kang <[email protected]> wrote:
> On Sun, May 15, 2011 at 4:21 PM, Vitor Sessak <[email protected]> wrote: >> >> Easy to debug with an actual machine :-). >> >> Revised patch attached (only the third patch needed changes), all FATE >> tests pass. >> >> Benchmarks: >> >> SSE: >> time: 0.0 us/transform [total time=1.53 s its=33554432] >> time: 0.0 us/transform [total time=1.53 s its=33554432] >> time: 0.0 us/transform [total time=1.53 s its=33554432] >> >> AVX: >> time: 0.0 us/transform [total time=1.40 s its=33554432] >> time: 0.0 us/transform [total time=1.40 s its=33554432] >> time: 0.0 us/transform [total time=1.40 s its=33554432] >> > > nit: can I have START/STOP_TIMER number? This is not important. > Also, how does the SSE time compare to before? Is there a slowdown? > > > From de3ad19337966eb80f844236fe1ae5fb55c747da Mon Sep 17 00:00:00 2001 > > From: Vitor Sessak <[email protected]> > > Date: Sat, 14 May 2011 14:17:15 +0200 > > Subject: [PATCH 3/3] dct32: Add AVX implementation of 32-point DCT > > --- > > libavcodec/mpegaudio.h | 4 +- > > libavcodec/x86/dct32_sse.asm | 326 > +++++++++++++++++++++++++++--------------- > > libavcodec/x86/fft.c | 4 +- > > libavcodec/x86/fft.h | 1 + > > 4 files changed, 217 insertions(+), 118 deletions(-) > > diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h > > index f12b897..d247ce8 100644 > > --- a/libavcodec/mpegaudio.h > > +++ b/libavcodec/mpegaudio.h > > @@ -134,9 +134,9 @@ typedef struct MPADecodeContext { > > uint32_t free_format_next_header; > > GetBitContext gb; > > GetBitContext in_gb; > > - DECLARE_ALIGNED(16, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2]; > > + DECLARE_ALIGNED(32, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2]; > > int synth_buf_offset[MPA_MAX_CHANNELS]; > > - DECLARE_ALIGNED(16, INTFLOAT, > sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT]; > > + DECLARE_ALIGNED(32, INTFLOAT, > sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT]; > > INTFLOAT mdct_buf[MPA_MAX_CHANNELS][SBLIMIT * 18]; /* previous > samples, for layer 3 MDCT */ > > 
GranuleDef granules[2][2]; /* Used in Layer 3 */ > > #ifdef DEBUG > > diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm > > index ac25eb3..5a8691c 100644 > > --- a/libavcodec/x86/dct32_sse.asm > > +++ b/libavcodec/x86/dct32_sse.asm > > @@ -26,25 +26,34 @@ SECTION_RODATA 32 > > align 32 > > ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 > > dd 0.553104, 0.582935, 0.622504, 0.674808 > > - dd -1.169440, -0.972568, -0.839350, -0.744536 > > dd -10.190008, -3.407609, -2.057781, -1.484165 > > + dd -1.169440, -0.972568, -0.839350, -0.744536 > > dd 0.502419, 0.522499, 0.566944, 0.646822 > > dd 0.788155, 1.060678, 1.722447, 5.101149 > > dd 0.509796, 0.601345, 0.899976, 2.562916 > > + dd 0.509796, 0.601345, 0.899976, 2.562916 > > + dd 1.000000, 1.000000, 1.306563, 0.541196 > > dd 1.000000, 1.000000, 1.306563, 0.541196 > > dd 1.000000, 0.707107, 1.000000, -0.707107 > > + dd 1.000000, 0.707107, 1.000000, -0.707107 > > > > > > -ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 > > +ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, > 0x80000000 > > > > -%macro BUTTERFLY 4 > > +%macro BUTTERFLY_SSE 4 > > movaps %4, %1 > > subps %1, %2 > > addps %2, %4 > > mulps %1, %3 > > %endmacro > > > > -%macro BUTTERFLY0 5 > > +%macro BUTTERFLY_AVX 4 > > + vsubps %4, %1, %2 > > + vaddps %2, %2, %1 > > + vmulps %1, %4, %3 > > +%endmacro > > + > > +%macro BUTTERFLY0_SSE 5 > > movaps %4, %1 > > shufps %1, %1, %5 > > xorps %4, %2 > > @@ -52,6 +61,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 > > mulps %1, %3 > > %endmacro > > > > +%macro BUTTERFLY0_AVX 5 > > + vshufps %4, %1, %1, %5 > > + vxorps %1, %1, %2 > > + vaddps %4, %4, %1 > > + vmulps %1, %4, %3 > > +%endmacro > > + > > %macro BUTTERFLY2 4 > > BUTTERFLY0 %1, %2, %3, %4, 0x1b > > %endmacro > > @@ -60,7 +76,193 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 > > BUTTERFLY0 %1, %2, %3, %4, 0xb1 > > %endmacro > > > > +%macro PASS6_AND_PERMUTE 0 > > + mov tmpd, [outd+4] > > + movss xmm7, [outd+72] > > 
+ addss xmm7, [outq+76] > > + movss xmm3, [outq+56] > > + addss xmm3, [outq+60] > > + addss xmm4, xmm3 > > + movss xmm2, [outq+52] > > + addss xmm2, xmm3 > > + movss xmm3, [outq+104] > > + addss xmm3, [outq+108] > > + addss xmm1, xmm3 > > + addss xmm5, xmm4 > > + movss [outq+16], xmm1 > > + movss xmm1, [outq+100] > > + addss xmm1, xmm3 > > + movss xmm3, [outq+40] > > + movss [outq+48], xmm1 > > + addss xmm3, [outq+44] > > + movss xmm1, [outq+100] > > + addss xmm4, xmm3 > > + addss xmm3, xmm2 > > + addss xmm1, [outq+108] > > + movss [outq+40], xmm3 > > + addss xmm2, [outq+36] > > + movss xmm3, [outq+8] > > + movss [outq+56], xmm2 > > + addss xmm3, [outq+12] > > + movss [outq+32], xmm3 > > + movss xmm3, [outq+80] > > + movss [outq+8], xmm5 > > + movss [outq+80], xmm1 > > + movss xmm2, [outq+52] > > + movss xmm5, [outq+120] > > + addss xmm5, [outq+124] > > + movss xmm1, [outq+64] > > + addss xmm2, [outq+60] > > + addss xmm0, xmm5 > > + addss xmm5, [outq+116] > > + mov [outq+64], tmpd > > + addss xmm6, xmm0 > > + addss xmm1, xmm6 > > + mov tmpd, [outq+12] > > + mov [outq+96], tmpd > > + movss [outq+4], xmm1 > > + movss xmm1, [outq+24] > > + movss [outq+24], xmm4 > > + movss xmm4, [outq+88] > > + addss xmm4, [outq+92] > > + addss xmm3, xmm4 > > + addss xmm4, [outq+84] > > + mov tmpd, [outq+108] > > + addss xmm1, [outq+28] > > + addss xmm0, xmm1 > > + addss xmm1, xmm5 > > + addss xmm6, xmm3 > > + addss xmm3, xmm0 > > + addss xmm0, xmm7 > > + addss xmm5, [outq+20] > > + addss xmm7, xmm1 > > + movss [outq+12], xmm6 > > + mov [outq+112], tmpd > > + movss xmm6, [outq+28] > > + movss [outq+28], xmm0 > > + movss xmm0, [outq+36] > > + movss [outq+36], xmm7 > > + addss xmm1, xmm4 > > + movss xmm7, [outq+116] > > + addss xmm0, xmm2 > > + addss xmm7, [outq+124] > > + movss [outq+72], xmm0 > > + movss xmm0, [outq+44] > > + addss xmm2, xmm0 > > + movss [outq+44], xmm1 > > + movss [outq+88], xmm2 > > + addss xmm0, [outq+60] > > + mov tmpd, [outq+60] > > + mov [outq+120], tmpd > > + 
movss [outq+104], xmm0 > > + addss xmm4, xmm5 > > + addss xmm5, [outq+68] > > + movss [outq+52], xmm4 > > + movss [outq+60], xmm5 > > + movss xmm4, [outq+68] > > + movss xmm5, [outq+20] > > + movss [outq+20], xmm3 > > + addss xmm5, xmm7 > > + addss xmm7, xmm6 > > + addss xmm4, xmm5 > > + movss xmm2, [outq+84] > > + addss xmm2, [outq+92] > > + addss xmm5, xmm2 > > + movss [outq+68], xmm4 > > + addss xmm2, xmm7 > > + movss xmm4, [outq+76] > > + movss [outq+84], xmm2 > > + movss [outq+76], xmm5 > > + addss xmm7, xmm4 > > + addss xmm6, [outq+124] > > + addss xmm4, xmm6 > > + addss xmm6, [outq+92] > > + movss [outq+100], xmm4 > > + movss [outq+108], xmm6 > > + movss xmm6, [outq+92] > > + movss [outq+92], xmm7 > > + addss xmm6, [outq+124] > > + movss [outq+116], xmm6 > > +%endmacro > > Could this be SIMD? It looks horrific, so if the answer is no, I'll go with > that. > > > + > > +%define BUTTERFLY BUTTERFLY_AVX > > +%define BUTTERFLY0 BUTTERFLY0_AVX > > + > > section .text align=16 > > +cglobal dct32_float_avx, 2,3,8, out, in, tmp > > nit: add a function header. > Not a nit: can you get this down to only using 7 registers? A quick glance > suggests yes. This is better on Windows. 
> > > + ; pass 1 > > + vmovaps ymm4, [inq+0] > > + vinsertf128 ymm5, ymm5, [inq+96], 1 > > + vinsertf128 ymm5, ymm5, [inq+112], 0 > > + vshufps ymm5, ymm5, ymm5, 0x1b > > + BUTTERFLY ymm4, ymm5, [ps_cos_vec], ymm6 > > + > > + vmovaps ymm2, [inq+64] > > + vinsertf128 ymm6, ymm6, [inq+32], 1 > > + vinsertf128 ymm6, ymm6, [inq+48], 0 > > + vshufps ymm6, ymm6, ymm6, 0x1b > > + BUTTERFLY ymm2, ymm6, [ps_cos_vec+32], ymm0 > > + > > + ; pass 2 > > + > > + BUTTERFLY ymm5, ymm6, [ps_cos_vec+64], ymm0 > > + BUTTERFLY ymm4, ymm2, [ps_cos_vec+64], ymm7 > > + > > + > > + ; pass 3 > > + vperm2f128 ymm3, ymm6, ymm4, 0x31 > > + vperm2f128 ymm1, ymm6, ymm4, 0x20 > > + vshufps ymm3, ymm3, ymm3, 0x1b > > + > > + BUTTERFLY ymm1, ymm3, [ps_cos_vec+96], ymm6 > > + > > + > > + vperm2f128 ymm4, ymm5, ymm2, 0x20 > > + vperm2f128 ymm5, ymm5, ymm2, 0x31 > > + vshufps ymm5, ymm5, ymm5, 0x1b > > + > > + BUTTERFLY ymm4, ymm5, [ps_cos_vec+96], ymm6 > > + > > + ; pass 4 > > + vmovaps ymm6, [ps_p1p1m1m1+0] > > + vmovaps ymm2, [ps_cos_vec+128] > > + > > + BUTTERFLY2 ymm5, ymm6, ymm2, ymm7 > > + BUTTERFLY2 ymm4, ymm6, ymm2, ymm7 > > + BUTTERFLY2 ymm1, ymm6, ymm2, ymm7 > > + BUTTERFLY2 ymm3, ymm6, ymm2, ymm7 > > + > > + > > + ; pass 5 > > + vshufps ymm6, ymm6, ymm6, 0xcc > > + vmovaps ymm2, [ps_cos_vec+160] > > + > > + BUTTERFLY3 ymm5, ymm6, ymm2, ymm7 > > + BUTTERFLY3 ymm4, ymm6, ymm2, ymm7 > > + BUTTERFLY3 ymm1, ymm6, ymm2, ymm7 > > + BUTTERFLY3 ymm3, ymm6, ymm2, ymm7 > > + > > + vextractf128 xmm6, ymm3, 1 > > + vmovaps [outq], ymm3 > > + > > + vextractf128 [outq+64], ymm5, 1 > > + vextractf128 [outq+32], ymm5, 0 > > + > > + vextractf128 [outq+80], ymm4, 1 > > + vextractf128 [outq+48], ymm4, 0 > > + > > + vextractf128 xmm0, ymm1, 1 > > + vmovaps [outq+96], ymm1 > > + > > + vzeroupper > > + > > + ; pass 6, no SIMD... > > + PASS6_AND_PERMUTE > > + REP_RET > > Same as Ronald's comment from before. Just RET. 
> > > + > > +%define BUTTERFLY BUTTERFLY_SSE > > +%define BUTTERFLY0 BUTTERFLY0_SSE > > + > > cglobal dct32_float_sse, 2,3,8, out, in, tmp > > ; pass 1 > > > > @@ -72,8 +274,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp > > movaps xmm7, [inq+64] > > movaps xmm4, [inq+48] > > shufps xmm4, xmm4, 0x1b > > - BUTTERFLY xmm7, xmm4, [ps_cos_vec+48], xmm3 > > - > > + BUTTERFLY xmm7, xmm4, [ps_cos_vec+32], xmm3 > > > > ; pass 2 > > movaps xmm2, [ps_cos_vec+64] > > @@ -90,7 +291,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp > > movaps xmm4, [inq+80] > > movaps xmm5, [inq+32] > > shufps xmm5, xmm5, 0x1b > > - BUTTERFLY xmm4, xmm5, [ps_cos_vec+32], xmm3 > > + BUTTERFLY xmm4, xmm5, [ps_cos_vec+48], xmm3 > > > > ; pass 2 > > BUTTERFLY xmm0, xmm7, xmm2, xmm3 > > @@ -121,7 +322,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp > > > > ; pass 4 > > movaps xmm3, [ps_p1p1m1m1+0] > > - movaps xmm2, [ps_cos_vec+112] > > + movaps xmm2, [ps_cos_vec+128] > > > > BUTTERFLY2 xmm5, xmm3, xmm2, xmm1 > > > > @@ -146,7 +347,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp > > BUTTERFLY2 xmm0, xmm3, xmm2, xmm1 > > > > ; pass 5 > > - movaps xmm2, [ps_cos_vec+128] > > + movaps xmm2, [ps_cos_vec+160] > > shufps xmm3, xmm3, 0xcc > > > > BUTTERFLY3 xmm5, xmm3, xmm2, xmm1 > > @@ -177,110 +378,5 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp > > movaps [outq+112], xmm0 > > > > ; pass 6, no SIMD... 
> > - mov tmpq, [outd+4] > > - movss xmm7, [outd+72] > > - addss xmm7, [outq+76] > > - movss xmm3, [outq+56] > > - addss xmm3, [outq+60] > > - addss xmm4, xmm3 > > - movss xmm2, [outq+52] > > - addss xmm2, xmm3 > > - movss xmm3, [outq+104] > > - addss xmm3, [outq+108] > > - addss xmm1, xmm3 > > - addss xmm5, xmm4 > > - movss [outq+16], xmm1 > > - movss xmm1, [outq+100] > > - addss xmm1, xmm3 > > - movss xmm3, [outq+40] > > - movss [outq+48], xmm1 > > - addss xmm3, [outq+44] > > - movss xmm1, [outq+100] > > - addss xmm4, xmm3 > > - addss xmm3, xmm2 > > - addss xmm1, [outq+108] > > - movss [outq+40], xmm3 > > - addss xmm2, [outq+36] > > - movss xmm3, [outq+8] > > - movss [outq+56], xmm2 > > - addss xmm3, [outq+12] > > - movss [outq+32], xmm3 > > - movss xmm3, [outq+80] > > - movss [outq+8], xmm5 > > - movss [outq+80], xmm1 > > - movss xmm2, [outq+52] > > - movss xmm5, [outq+120] > > - addss xmm5, [outq+124] > > - movss xmm1, [outq+64] > > - addss xmm2, [outq+60] > > - addss xmm0, xmm5 > > - addss xmm5, [outq+116] > > - mov [outq+64], tmpq > > - addss xmm6, xmm0 > > - addss xmm1, xmm6 > > - mov tmpq, [outq+12] > > - mov [outq+96], tmpq > > - movss [outq+4], xmm1 > > - movss xmm1, [outq+24] > > - movss [outq+24], xmm4 > > - movss xmm4, [outq+88] > > - addss xmm4, [outq+92] > > - addss xmm3, xmm4 > > - addss xmm4, [outq+84] > > - mov tmpq, [outq+108] > > - addss xmm1, [outq+28] > > - addss xmm0, xmm1 > > - addss xmm1, xmm5 > > - addss xmm6, xmm3 > > - addss xmm3, xmm0 > > - addss xmm0, xmm7 > > - addss xmm5, [outq+20] > > - addss xmm7, xmm1 > > - movss [outq+12], xmm6 > > - mov [outq+112], tmpq > > - movss xmm6, [outq+28] > > - movss [outq+28], xmm0 > > - movss xmm0, [outq+36] > > - movss [outq+36], xmm7 > > - addss xmm1, xmm4 > > - movss xmm7, [outq+116] > > - addss xmm0, xmm2 > > - addss xmm7, [outq+124] > > - movss [outq+72], xmm0 > > - movss xmm0, [outq+44] > > - addss xmm2, xmm0 > > - movss [outq+44], xmm1 > > - movss [outq+88], xmm2 > > - addss xmm0, [outq+60] > > 
- mov tmpq, [outq+60] > > - mov [outq+120], tmpq > > - movss [outq+104], xmm0 > > - addss xmm4, xmm5 > > - addss xmm5, [outq+68] > > - movss [outq+52], xmm4 > > - movss [outq+60], xmm5 > > - movss xmm4, [outq+68] > > - movss xmm5, [outq+20] > > - movss [outq+20], xmm3 > > - addss xmm5, xmm7 > > - addss xmm7, xmm6 > > - addss xmm4, xmm5 > > - movss xmm2, [outq+84] > > - addss xmm2, [outq+92] > > - addss xmm5, xmm2 > > - movss [outq+68], xmm4 > > - addss xmm2, xmm7 > > - movss xmm4, [outq+76] > > - movss [outq+84], xmm2 > > - movss [outq+76], xmm5 > > - addss xmm7, xmm4 > > - addss xmm6, [outq+124] > > - addss xmm4, xmm6 > > - addss xmm6, [outq+92] > > - movss [outq+100], xmm4 > > - movss [outq+108], xmm6 > > - movss xmm6, [outq+92] > > - movss [outq+92], xmm7 > > - addss xmm6, [outq+124] > > - movss [outq+116], xmm6 > > + PASS6_AND_PERMUTE > > REP_RET > > Same as above. > > > diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c > > index b29412c..8eef421 100644 > > --- a/libavcodec/x86/fft.c > > +++ b/libavcodec/x86/fft.c > > @@ -57,7 +57,9 @@ av_cold void ff_fft_init_mmx(FFTContext *s) > > av_cold void ff_dct_init_mmx(DCTContext *s) > > { > > int has_vectors = av_get_cpu_flags(); > > - if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) > > + if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX) > > + s->dct32 = ff_dct32_float_avx; > > + else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) > > s->dct32 = ff_dct32_float_sse; > > } > > #endif > > diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h > > index e6eace2..c714185 100644 > > --- a/libavcodec/x86/fft.h > > +++ b/libavcodec/x86/fft.h > > @@ -35,5 +35,6 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample > *output, const FFTSample *input) > > void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample > *input); > > void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample > *input); > > void ff_dct32_float_sse(FFTSample *out, const FFTSample *in); > > +void ff_dct32_float_avx(FFTSample 
*out, const FFTSample *in); > > > > #endif > > -- > > 1.7.4.1 > > One more thing that came up as I was talking with Diego: AVX can be disabled by the user. Put the AVX code under %ifdef HAVE_AVX in the asm and under #ifdef HAVE_AVX in the C code, or whatever the corresponding flags are in libav.
_______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
