Re: [libav-devel] [PATCH 3/3] dct32: Add AVX implementation of 32-point DCT

Daniel Kang Sun, 15 May 2011 19:47:46 -0700

On Sun, May 15, 2011 at 6:49 PM, Daniel Kang <[email protected]> wrote:


> On Sun, May 15, 2011 at 4:21 PM, Vitor Sessak <[email protected]> wrote:
>>
>>  Easy to debug with an actual machine :-).
>>
>> Revised patch attached (only the third patch needed changes), all FATE
>> tests pass.
>>
>> Benchmarks:
>>
>> SSE:
>> time: 0.0 us/transform [total time=1.53 s its=33554432]
>> time: 0.0 us/transform [total time=1.53 s its=33554432]
>> time: 0.0 us/transform [total time=1.53 s its=33554432]
>>
>> AVX:
>> time: 0.0 us/transform [total time=1.40 s its=33554432]
>> time: 0.0 us/transform [total time=1.40 s its=33554432]
>> time: 0.0 us/transform [total time=1.40 s its=33554432]
>>
>
> nit: can I have START/STOP_TIMER numbers? This is not important.
> Also, how does the SSE time compare to before? Is there a slowdown?
>
> > From de3ad19337966eb80f844236fe1ae5fb55c747da Mon Sep 17 00:00:00 2001
> > From: Vitor Sessak <[email protected]>
> > Date: Sat, 14 May 2011 14:17:15 +0200
> > Subject: [PATCH 3/3] dct32: Add AVX implementation of 32-point DCT
> > ---
> >  libavcodec/mpegaudio.h       |    4 +-
> >  libavcodec/x86/dct32_sse.asm |  326
> +++++++++++++++++++++++++++---------------
> >  libavcodec/x86/fft.c         |    4 +-
> >  libavcodec/x86/fft.h         |    1 +
> >  4 files changed, 217 insertions(+), 118 deletions(-)
> > diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h
> > index f12b897..d247ce8 100644
> > --- a/libavcodec/mpegaudio.h
> > +++ b/libavcodec/mpegaudio.h
> > @@ -134,9 +134,9 @@ typedef struct MPADecodeContext {
> >      uint32_t free_format_next_header;
> >      GetBitContext gb;
> >      GetBitContext in_gb;
> > -    DECLARE_ALIGNED(16, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2];
> > +    DECLARE_ALIGNED(32, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2];
> >      int synth_buf_offset[MPA_MAX_CHANNELS];
> > -    DECLARE_ALIGNED(16, INTFLOAT,
> sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT];
> > +    DECLARE_ALIGNED(32, INTFLOAT,
> sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT];
> >      INTFLOAT mdct_buf[MPA_MAX_CHANNELS][SBLIMIT * 18]; /* previous
> samples, for layer 3 MDCT */
> >      GranuleDef granules[2][2]; /* Used in Layer 3 */
> >  #ifdef DEBUG
> > diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
> > index ac25eb3..5a8691c 100644
> > --- a/libavcodec/x86/dct32_sse.asm
> > +++ b/libavcodec/x86/dct32_sse.asm
> > @@ -26,25 +26,34 @@ SECTION_RODATA 32
> >  align 32
> >  ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
> >              dd   0.553104,  0.582935,  0.622504,  0.674808
> > -            dd  -1.169440, -0.972568, -0.839350, -0.744536
> >              dd -10.190008, -3.407609, -2.057781, -1.484165
> > +            dd  -1.169440, -0.972568, -0.839350, -0.744536
> >              dd   0.502419,  0.522499,  0.566944,  0.646822
> >              dd   0.788155,  1.060678,  1.722447,  5.101149
> >              dd   0.509796,  0.601345,  0.899976,  2.562916
> > +            dd   0.509796,  0.601345,  0.899976,  2.562916
> > +            dd   1.000000,  1.000000,  1.306563,  0.541196
> >              dd   1.000000,  1.000000,  1.306563,  0.541196
> >              dd   1.000000,  0.707107,  1.000000, -0.707107
> > +            dd   1.000000,  0.707107,  1.000000, -0.707107
> >
> >
> > -ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
> > +ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000,
> 0x80000000
> >
> > -%macro BUTTERFLY 4
> > +%macro BUTTERFLY_SSE 4
> >      movaps %4, %1
> >      subps  %1, %2
> >      addps  %2, %4
> >      mulps  %1, %3
> >  %endmacro
> >
> > -%macro BUTTERFLY0 5
> > +%macro BUTTERFLY_AVX 4
> > +    vsubps  %4, %1, %2
> > +    vaddps  %2, %2, %1
> > +    vmulps  %1, %4, %3
> > +%endmacro
> > +
> > +%macro BUTTERFLY0_SSE 5
> >      movaps %4, %1
> >      shufps %1, %1, %5
> >      xorps  %4, %2
> > @@ -52,6 +61,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
> >      mulps  %1, %3
> >  %endmacro
> >
> > +%macro BUTTERFLY0_AVX 5
> > +    vshufps %4, %1, %1, %5
> > +    vxorps  %1, %1, %2
> > +    vaddps  %4, %4, %1
> > +    vmulps  %1, %4, %3
> > +%endmacro
> > +
> >  %macro BUTTERFLY2 4
> >      BUTTERFLY0 %1, %2, %3, %4, 0x1b
> >  %endmacro
> > @@ -60,7 +76,193 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
> >      BUTTERFLY0 %1, %2, %3, %4, 0xb1
> >  %endmacro
> >
> > +%macro PASS6_AND_PERMUTE 0
> > +    mov   tmpd, [outd+4]
> > +    movss xmm7, [outd+72]
> > +    addss xmm7, [outq+76]
> > +    movss xmm3, [outq+56]
> > +    addss xmm3, [outq+60]
> > +    addss xmm4, xmm3
> > +    movss xmm2, [outq+52]
> > +    addss xmm2, xmm3
> > +    movss xmm3, [outq+104]
> > +    addss xmm3, [outq+108]
> > +    addss xmm1, xmm3
> > +    addss xmm5, xmm4
> > +    movss [outq+16], xmm1
> > +    movss xmm1, [outq+100]
> > +    addss xmm1, xmm3
> > +    movss xmm3, [outq+40]
> > +    movss [outq+48], xmm1
> > +    addss xmm3, [outq+44]
> > +    movss xmm1, [outq+100]
> > +    addss xmm4, xmm3
> > +    addss xmm3, xmm2
> > +    addss xmm1, [outq+108]
> > +    movss [outq+40], xmm3
> > +    addss xmm2, [outq+36]
> > +    movss xmm3, [outq+8]
> > +    movss [outq+56], xmm2
> > +    addss xmm3, [outq+12]
> > +    movss [outq+32], xmm3
> > +    movss xmm3, [outq+80]
> > +    movss [outq+8], xmm5
> > +    movss [outq+80], xmm1
> > +    movss xmm2, [outq+52]
> > +    movss xmm5, [outq+120]
> > +    addss xmm5, [outq+124]
> > +    movss xmm1, [outq+64]
> > +    addss xmm2, [outq+60]
> > +    addss xmm0, xmm5
> > +    addss xmm5, [outq+116]
> > +    mov   [outq+64], tmpd
> > +    addss xmm6, xmm0
> > +    addss xmm1, xmm6
> > +    mov   tmpd, [outq+12]
> > +    mov   [outq+96], tmpd
> > +    movss [outq+4], xmm1
> > +    movss xmm1, [outq+24]
> > +    movss [outq+24], xmm4
> > +    movss xmm4, [outq+88]
> > +    addss xmm4, [outq+92]
> > +    addss xmm3, xmm4
> > +    addss xmm4, [outq+84]
> > +    mov   tmpd, [outq+108]
> > +    addss xmm1, [outq+28]
> > +    addss xmm0, xmm1
> > +    addss xmm1, xmm5
> > +    addss xmm6, xmm3
> > +    addss xmm3, xmm0
> > +    addss xmm0, xmm7
> > +    addss xmm5, [outq+20]
> > +    addss xmm7, xmm1
> > +    movss [outq+12], xmm6
> > +    mov   [outq+112], tmpd
> > +    movss xmm6, [outq+28]
> > +    movss [outq+28], xmm0
> > +    movss xmm0, [outq+36]
> > +    movss [outq+36], xmm7
> > +    addss xmm1, xmm4
> > +    movss xmm7, [outq+116]
> > +    addss xmm0, xmm2
> > +    addss xmm7, [outq+124]
> > +    movss [outq+72], xmm0
> > +    movss xmm0, [outq+44]
> > +    addss xmm2, xmm0
> > +    movss [outq+44], xmm1
> > +    movss [outq+88], xmm2
> > +    addss xmm0, [outq+60]
> > +    mov   tmpd, [outq+60]
> > +    mov   [outq+120], tmpd
> > +    movss [outq+104], xmm0
> > +    addss xmm4, xmm5
> > +    addss xmm5, [outq+68]
> > +    movss [outq+52], xmm4
> > +    movss [outq+60], xmm5
> > +    movss xmm4, [outq+68]
> > +    movss xmm5, [outq+20]
> > +    movss [outq+20], xmm3
> > +    addss xmm5, xmm7
> > +    addss xmm7, xmm6
> > +    addss xmm4, xmm5
> > +    movss xmm2, [outq+84]
> > +    addss xmm2, [outq+92]
> > +    addss xmm5, xmm2
> > +    movss [outq+68], xmm4
> > +    addss xmm2, xmm7
> > +    movss xmm4, [outq+76]
> > +    movss [outq+84], xmm2
> > +    movss [outq+76], xmm5
> > +    addss xmm7, xmm4
> > +    addss xmm6, [outq+124]
> > +    addss xmm4, xmm6
> > +    addss xmm6, [outq+92]
> > +    movss [outq+100], xmm4
> > +    movss [outq+108], xmm6
> > +    movss xmm6, [outq+92]
> > +    movss [outq+92], xmm7
> > +    addss xmm6, [outq+124]
> > +    movss [outq+116], xmm6
> > +%endmacro
>
> Could this be done with SIMD? It looks horrific, so if the answer is no,
> I'll go with that.
>
> > +
> > +%define BUTTERFLY  BUTTERFLY_AVX
> > +%define BUTTERFLY0 BUTTERFLY0_AVX
> > +
> >  section .text align=16
> > +cglobal dct32_float_avx, 2,3,8, out, in, tmp
>
> nit: add a function header.
> Not a nit: can you get this down to only using 7 registers? A quick glance
> suggests yes. This is better on Windows.
>
> > +    ; pass 1
> > +    vmovaps     ymm4, [inq+0]
> > +    vinsertf128 ymm5, ymm5, [inq+96], 1
> > +    vinsertf128 ymm5, ymm5, [inq+112], 0
> > +    vshufps     ymm5, ymm5, ymm5, 0x1b
> > +    BUTTERFLY   ymm4, ymm5, [ps_cos_vec], ymm6
> > +
> > +    vmovaps     ymm2, [inq+64]
> > +    vinsertf128 ymm6, ymm6, [inq+32], 1
> > +    vinsertf128 ymm6, ymm6, [inq+48], 0
> > +    vshufps     ymm6, ymm6, ymm6, 0x1b
> > +    BUTTERFLY   ymm2, ymm6, [ps_cos_vec+32], ymm0
> > +
> > +    ; pass 2
> > +
> > +    BUTTERFLY  ymm5, ymm6, [ps_cos_vec+64], ymm0
> > +    BUTTERFLY  ymm4, ymm2, [ps_cos_vec+64], ymm7
> > +
> > +
> > +    ; pass 3
> > +    vperm2f128  ymm3, ymm6, ymm4, 0x31
> > +    vperm2f128  ymm1, ymm6, ymm4, 0x20
> > +    vshufps     ymm3, ymm3, ymm3, 0x1b
> > +
> > +    BUTTERFLY   ymm1, ymm3, [ps_cos_vec+96], ymm6
> > +
> > +
> > +    vperm2f128  ymm4, ymm5, ymm2, 0x20
> > +    vperm2f128  ymm5, ymm5, ymm2, 0x31
> > +    vshufps     ymm5, ymm5, ymm5, 0x1b
> > +
> > +    BUTTERFLY   ymm4, ymm5, [ps_cos_vec+96], ymm6
> > +
> > +    ; pass 4
> > +    vmovaps ymm6, [ps_p1p1m1m1+0]
> > +    vmovaps ymm2, [ps_cos_vec+128]
> > +
> > +    BUTTERFLY2  ymm5, ymm6, ymm2, ymm7
> > +    BUTTERFLY2  ymm4, ymm6, ymm2, ymm7
> > +    BUTTERFLY2  ymm1, ymm6, ymm2, ymm7
> > +    BUTTERFLY2  ymm3, ymm6, ymm2, ymm7
> > +
> > +
> > +    ; pass 5
> > +    vshufps ymm6, ymm6, ymm6, 0xcc
> > +    vmovaps ymm2, [ps_cos_vec+160]
> > +
> > +    BUTTERFLY3  ymm5, ymm6, ymm2, ymm7
> > +    BUTTERFLY3  ymm4, ymm6, ymm2, ymm7
> > +    BUTTERFLY3  ymm1, ymm6, ymm2, ymm7
> > +    BUTTERFLY3  ymm3, ymm6, ymm2, ymm7
> > +
> > +    vextractf128  xmm6, ymm3, 1
> > +    vmovaps [outq], ymm3
> > +
> > +    vextractf128  [outq+64], ymm5, 1
> > +    vextractf128  [outq+32], ymm5, 0
> > +
> > +    vextractf128  [outq+80], ymm4, 1
> > +    vextractf128  [outq+48], ymm4, 0
> > +
> > +    vextractf128  xmm0, ymm1, 1
> > +    vmovaps [outq+96], ymm1
> > +
> > +    vzeroupper
> > +
> > +    ;    pass 6, no SIMD...
> > +    PASS6_AND_PERMUTE
> > +    REP_RET
>
> Same as Ronald's comment from before. Just RET.
>
> > +
> > +%define BUTTERFLY  BUTTERFLY_SSE
> > +%define BUTTERFLY0 BUTTERFLY0_SSE
> > +
> >  cglobal dct32_float_sse, 2,3,8, out, in, tmp
> >      ; pass 1
> >
> > @@ -72,8 +274,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
> >      movaps      xmm7, [inq+64]
> >      movaps      xmm4, [inq+48]
> >      shufps      xmm4, xmm4, 0x1b
> > -    BUTTERFLY   xmm7, xmm4, [ps_cos_vec+48], xmm3
> > -
> > +    BUTTERFLY   xmm7, xmm4, [ps_cos_vec+32], xmm3
> >
> >      ; pass 2
> >      movaps      xmm2, [ps_cos_vec+64]
> > @@ -90,7 +291,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
> >      movaps      xmm4, [inq+80]
> >      movaps      xmm5, [inq+32]
> >      shufps      xmm5, xmm5, 0x1b
> > -    BUTTERFLY   xmm4, xmm5, [ps_cos_vec+32], xmm3
> > +    BUTTERFLY   xmm4, xmm5, [ps_cos_vec+48], xmm3
> >
> >      ; pass 2
> >      BUTTERFLY   xmm0, xmm7, xmm2, xmm3
> > @@ -121,7 +322,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
> >
> >      ; pass 4
> >      movaps      xmm3, [ps_p1p1m1m1+0]
> > -    movaps      xmm2, [ps_cos_vec+112]
> > +    movaps      xmm2, [ps_cos_vec+128]
> >
> >      BUTTERFLY2  xmm5, xmm3, xmm2, xmm1
> >
> > @@ -146,7 +347,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
> >      BUTTERFLY2  xmm0, xmm3, xmm2, xmm1
> >
> >      ; pass 5
> > -    movaps      xmm2, [ps_cos_vec+128]
> > +    movaps      xmm2, [ps_cos_vec+160]
> >      shufps      xmm3, xmm3, 0xcc
> >
> >      BUTTERFLY3  xmm5, xmm3, xmm2, xmm1
> > @@ -177,110 +378,5 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
> >      movaps      [outq+112], xmm0
> >
> >      ;    pass 6, no SIMD...
> > -    mov   tmpq, [outd+4]
> > -    movss xmm7, [outd+72]
> > -    addss xmm7, [outq+76]
> > -    movss xmm3, [outq+56]
> > -    addss xmm3, [outq+60]
> > -    addss xmm4, xmm3
> > -    movss xmm2, [outq+52]
> > -    addss xmm2, xmm3
> > -    movss xmm3, [outq+104]
> > -    addss xmm3, [outq+108]
> > -    addss xmm1, xmm3
> > -    addss xmm5, xmm4
> > -    movss [outq+16], xmm1
> > -    movss xmm1, [outq+100]
> > -    addss xmm1, xmm3
> > -    movss xmm3, [outq+40]
> > -    movss [outq+48], xmm1
> > -    addss xmm3, [outq+44]
> > -    movss xmm1, [outq+100]
> > -    addss xmm4, xmm3
> > -    addss xmm3, xmm2
> > -    addss xmm1, [outq+108]
> > -    movss [outq+40], xmm3
> > -    addss xmm2, [outq+36]
> > -    movss xmm3, [outq+8]
> > -    movss [outq+56], xmm2
> > -    addss xmm3, [outq+12]
> > -    movss [outq+32], xmm3
> > -    movss xmm3, [outq+80]
> > -    movss [outq+8], xmm5
> > -    movss [outq+80], xmm1
> > -    movss xmm2, [outq+52]
> > -    movss xmm5, [outq+120]
> > -    addss xmm5, [outq+124]
> > -    movss xmm1, [outq+64]
> > -    addss xmm2, [outq+60]
> > -    addss xmm0, xmm5
> > -    addss xmm5, [outq+116]
> > -    mov  [outq+64], tmpq
> > -    addss xmm6, xmm0
> > -    addss xmm1, xmm6
> > -    mov  tmpq, [outq+12]
> > -    mov  [outq+96], tmpq
> > -    movss [outq+4], xmm1
> > -    movss xmm1, [outq+24]
> > -    movss [outq+24], xmm4
> > -    movss xmm4, [outq+88]
> > -    addss xmm4, [outq+92]
> > -    addss xmm3, xmm4
> > -    addss xmm4, [outq+84]
> > -    mov  tmpq, [outq+108]
> > -    addss xmm1, [outq+28]
> > -    addss xmm0, xmm1
> > -    addss xmm1, xmm5
> > -    addss xmm6, xmm3
> > -    addss xmm3, xmm0
> > -    addss xmm0, xmm7
> > -    addss xmm5, [outq+20]
> > -    addss xmm7, xmm1
> > -    movss [outq+12], xmm6
> > -    mov  [outq+112], tmpq
> > -    movss xmm6, [outq+28]
> > -    movss [outq+28], xmm0
> > -    movss xmm0, [outq+36]
> > -    movss [outq+36], xmm7
> > -    addss xmm1, xmm4
> > -    movss xmm7, [outq+116]
> > -    addss xmm0, xmm2
> > -    addss xmm7, [outq+124]
> > -    movss [outq+72], xmm0
> > -    movss xmm0, [outq+44]
> > -    addss xmm2, xmm0
> > -    movss [outq+44], xmm1
> > -    movss [outq+88], xmm2
> > -    addss xmm0, [outq+60]
> > -    mov   tmpq, [outq+60]
> > -    mov   [outq+120], tmpq
> > -    movss [outq+104], xmm0
> > -    addss xmm4, xmm5
> > -    addss xmm5, [outq+68]
> > -    movss [outq+52], xmm4
> > -    movss [outq+60], xmm5
> > -    movss xmm4, [outq+68]
> > -    movss xmm5, [outq+20]
> > -    movss [outq+20], xmm3
> > -    addss xmm5, xmm7
> > -    addss xmm7, xmm6
> > -    addss xmm4, xmm5
> > -    movss xmm2, [outq+84]
> > -    addss xmm2, [outq+92]
> > -    addss xmm5, xmm2
> > -    movss [outq+68], xmm4
> > -    addss xmm2, xmm7
> > -    movss xmm4, [outq+76]
> > -    movss [outq+84], xmm2
> > -    movss [outq+76], xmm5
> > -    addss xmm7, xmm4
> > -    addss xmm6, [outq+124]
> > -    addss xmm4, xmm6
> > -    addss xmm6, [outq+92]
> > -    movss [outq+100], xmm4
> > -    movss [outq+108], xmm6
> > -    movss xmm6, [outq+92]
> > -    movss [outq+92], xmm7
> > -    addss xmm6, [outq+124]
> > -    movss [outq+116], xmm6
> > +    PASS6_AND_PERMUTE
> >      REP_RET
>
> Same as above.
>
> > diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
> > index b29412c..8eef421 100644
> > --- a/libavcodec/x86/fft.c
> > +++ b/libavcodec/x86/fft.c
> > @@ -57,7 +57,9 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
> >  av_cold void ff_dct_init_mmx(DCTContext *s)
> >  {
> >      int has_vectors = av_get_cpu_flags();
> > -    if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
> > +    if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX)
> > +        s->dct32 = ff_dct32_float_avx;
> > +    else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
> >          s->dct32 = ff_dct32_float_sse;
> >  }
> >  #endif
> > diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
> > index e6eace2..c714185 100644
> > --- a/libavcodec/x86/fft.h
> > +++ b/libavcodec/x86/fft.h
> > @@ -35,5 +35,6 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample
> *output, const FFTSample *input)
> >  void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample
> *input);
> >  void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample
> *input);
> >  void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
> > +void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
> >
> >  #endif
> > --
> > 1.7.4.1
>
>

One more thing that came up as I was talking with Diego: AVX can be disabled
by the user. Put the AVX code under %ifdef HAVE_AVX in the asm and under
#ifdef HAVE_AVX in the C code, or whatever the flags are called in libav.

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 3/3] dct32: Add AVX implementation of 32-point DCT

Reply via email to