Re: [libav-devel] [PATCH 3/3] dct32: Add AVX implementation of 32-point DCT

Daniel Kang Sun, 15 May 2011 15:50:05 -0700

On Sun, May 15, 2011 at 4:21 PM, Vitor Sessak <[email protected]> wrote:
>
> Easy to debug with an actual machine :-).
>
> Revised patch attached (only the third patch needed changes), all FATE
> tests pass.
>
> Benchmarks:
>
> SSE:
> time: 0.0 us/transform [total time=1.53 s its=33554432]
> time: 0.0 us/transform [total time=1.53 s its=33554432]
> time: 0.0 us/transform [total time=1.53 s its=33554432]
>
> AVX:
> time: 0.0 us/transform [total time=1.40 s its=33554432]
> time: 0.0 us/transform [total time=1.40 s its=33554432]
> time: 0.0 us/transform [total time=1.40 s its=33554432]
>


nit: could I have START/STOP_TIMER numbers as well? This is not important.
Also, how does the SSE time compare to before this patch? Is there a slowdown?

> From de3ad19337966eb80f844236fe1ae5fb55c747da Mon Sep 17 00:00:00 2001
> From: Vitor Sessak <[email protected]>
> Date: Sat, 14 May 2011 14:17:15 +0200
> Subject: [PATCH 3/3] dct32: Add AVX implementation of 32-point DCT
> ---
>  libavcodec/mpegaudio.h       |    4 +-
>  libavcodec/x86/dct32_sse.asm |  326 +++++++++++++++---------------
>  libavcodec/x86/fft.c         |    4 +-
>  libavcodec/x86/fft.h         |    1 +
>  4 files changed, 217 insertions(+), 118 deletions(-)
> diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h
> index f12b897..d247ce8 100644
> --- a/libavcodec/mpegaudio.h
> +++ b/libavcodec/mpegaudio.h
> @@ -134,9 +134,9 @@ typedef struct MPADecodeContext {
>      uint32_t free_format_next_header;
>      GetBitContext gb;
>      GetBitContext in_gb;
> -    DECLARE_ALIGNED(16, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2];
> +    DECLARE_ALIGNED(32, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2];
>      int synth_buf_offset[MPA_MAX_CHANNELS];
> -    DECLARE_ALIGNED(16, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT];
> +    DECLARE_ALIGNED(32, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT];
>      INTFLOAT mdct_buf[MPA_MAX_CHANNELS][SBLIMIT * 18]; /* previous samples, for layer 3 MDCT */
>      GranuleDef granules[2][2]; /* Used in Layer 3 */
>  #ifdef DEBUG
> diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
> index ac25eb3..5a8691c 100644
> --- a/libavcodec/x86/dct32_sse.asm
> +++ b/libavcodec/x86/dct32_sse.asm
> @@ -26,25 +26,34 @@ SECTION_RODATA 32
>  align 32
>  ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
>              dd   0.553104,  0.582935,  0.622504,  0.674808
> -            dd  -1.169440, -0.972568, -0.839350, -0.744536
>              dd -10.190008, -3.407609, -2.057781, -1.484165
> +            dd  -1.169440, -0.972568, -0.839350, -0.744536
>              dd   0.502419,  0.522499,  0.566944,  0.646822
>              dd   0.788155,  1.060678,  1.722447,  5.101149
>              dd   0.509796,  0.601345,  0.899976,  2.562916
> +            dd   0.509796,  0.601345,  0.899976,  2.562916
> +            dd   1.000000,  1.000000,  1.306563,  0.541196
>              dd   1.000000,  1.000000,  1.306563,  0.541196
>              dd   1.000000,  0.707107,  1.000000, -0.707107
> +            dd   1.000000,  0.707107,  1.000000, -0.707107
>
>
> -ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
> +ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
>
> -%macro BUTTERFLY 4
> +%macro BUTTERFLY_SSE 4
>      movaps %4, %1
>      subps  %1, %2
>      addps  %2, %4
>      mulps  %1, %3
>  %endmacro
>
> -%macro BUTTERFLY0 5
> +%macro BUTTERFLY_AVX 4
> +    vsubps  %4, %1, %2
> +    vaddps  %2, %2, %1
> +    vmulps  %1, %4, %3
> +%endmacro
> +
> +%macro BUTTERFLY0_SSE 5
>      movaps %4, %1
>      shufps %1, %1, %5
>      xorps  %4, %2
> @@ -52,6 +61,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
>      mulps  %1, %3
>  %endmacro
>
> +%macro BUTTERFLY0_AVX 5
> +    vshufps %4, %1, %1, %5
> +    vxorps  %1, %1, %2
> +    vaddps  %4, %4, %1
> +    vmulps  %1, %4, %3
> +%endmacro
> +
>  %macro BUTTERFLY2 4
>      BUTTERFLY0 %1, %2, %3, %4, 0x1b
>  %endmacro
> @@ -60,7 +76,193 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
>      BUTTERFLY0 %1, %2, %3, %4, 0xb1
>  %endmacro
>
> +%macro PASS6_AND_PERMUTE 0
> +    mov   tmpd, [outd+4]
> +    movss xmm7, [outd+72]
> +    addss xmm7, [outq+76]
> +    movss xmm3, [outq+56]
> +    addss xmm3, [outq+60]
> +    addss xmm4, xmm3
> +    movss xmm2, [outq+52]
> +    addss xmm2, xmm3
> +    movss xmm3, [outq+104]
> +    addss xmm3, [outq+108]
> +    addss xmm1, xmm3
> +    addss xmm5, xmm4
> +    movss [outq+16], xmm1
> +    movss xmm1, [outq+100]
> +    addss xmm1, xmm3
> +    movss xmm3, [outq+40]
> +    movss [outq+48], xmm1
> +    addss xmm3, [outq+44]
> +    movss xmm1, [outq+100]
> +    addss xmm4, xmm3
> +    addss xmm3, xmm2
> +    addss xmm1, [outq+108]
> +    movss [outq+40], xmm3
> +    addss xmm2, [outq+36]
> +    movss xmm3, [outq+8]
> +    movss [outq+56], xmm2
> +    addss xmm3, [outq+12]
> +    movss [outq+32], xmm3
> +    movss xmm3, [outq+80]
> +    movss [outq+8], xmm5
> +    movss [outq+80], xmm1
> +    movss xmm2, [outq+52]
> +    movss xmm5, [outq+120]
> +    addss xmm5, [outq+124]
> +    movss xmm1, [outq+64]
> +    addss xmm2, [outq+60]
> +    addss xmm0, xmm5
> +    addss xmm5, [outq+116]
> +    mov   [outq+64], tmpd
> +    addss xmm6, xmm0
> +    addss xmm1, xmm6
> +    mov   tmpd, [outq+12]
> +    mov   [outq+96], tmpd
> +    movss [outq+4], xmm1
> +    movss xmm1, [outq+24]
> +    movss [outq+24], xmm4
> +    movss xmm4, [outq+88]
> +    addss xmm4, [outq+92]
> +    addss xmm3, xmm4
> +    addss xmm4, [outq+84]
> +    mov   tmpd, [outq+108]
> +    addss xmm1, [outq+28]
> +    addss xmm0, xmm1
> +    addss xmm1, xmm5
> +    addss xmm6, xmm3
> +    addss xmm3, xmm0
> +    addss xmm0, xmm7
> +    addss xmm5, [outq+20]
> +    addss xmm7, xmm1
> +    movss [outq+12], xmm6
> +    mov   [outq+112], tmpd
> +    movss xmm6, [outq+28]
> +    movss [outq+28], xmm0
> +    movss xmm0, [outq+36]
> +    movss [outq+36], xmm7
> +    addss xmm1, xmm4
> +    movss xmm7, [outq+116]
> +    addss xmm0, xmm2
> +    addss xmm7, [outq+124]
> +    movss [outq+72], xmm0
> +    movss xmm0, [outq+44]
> +    addss xmm2, xmm0
> +    movss [outq+44], xmm1
> +    movss [outq+88], xmm2
> +    addss xmm0, [outq+60]
> +    mov   tmpd, [outq+60]
> +    mov   [outq+120], tmpd
> +    movss [outq+104], xmm0
> +    addss xmm4, xmm5
> +    addss xmm5, [outq+68]
> +    movss [outq+52], xmm4
> +    movss [outq+60], xmm5
> +    movss xmm4, [outq+68]
> +    movss xmm5, [outq+20]
> +    movss [outq+20], xmm3
> +    addss xmm5, xmm7
> +    addss xmm7, xmm6
> +    addss xmm4, xmm5
> +    movss xmm2, [outq+84]
> +    addss xmm2, [outq+92]
> +    addss xmm5, xmm2
> +    movss [outq+68], xmm4
> +    addss xmm2, xmm7
> +    movss xmm4, [outq+76]
> +    movss [outq+84], xmm2
> +    movss [outq+76], xmm5
> +    addss xmm7, xmm4
> +    addss xmm6, [outq+124]
> +    addss xmm4, xmm6
> +    addss xmm6, [outq+92]
> +    movss [outq+100], xmm4
> +    movss [outq+108], xmm6
> +    movss xmm6, [outq+92]
> +    movss [outq+92], xmm7
> +    addss xmm6, [outq+124]
> +    movss [outq+116], xmm6
> +%endmacro

Could this be done with SIMD? It looks horrific, so if the answer is no, I'll
go with that.

> +
> +%define BUTTERFLY  BUTTERFLY_AVX
> +%define BUTTERFLY0 BUTTERFLY0_AVX
> +
>  section .text align=16
> +cglobal dct32_float_avx, 2,3,8, out, in, tmp

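nit: add a function header.
Not a nit: can you get this down to only using 7 registers? A quick glance
suggests yes. This is better on Windows, where xmm6-xmm15 are callee-saved
under the Win64 ABI, so every extra XMM register used costs a save/restore.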

> +    ; pass 1
> +    vmovaps     ymm4, [inq+0]
> +    vinsertf128 ymm5, ymm5, [inq+96], 1
> +    vinsertf128 ymm5, ymm5, [inq+112], 0
> +    vshufps     ymm5, ymm5, ymm5, 0x1b
> +    BUTTERFLY   ymm4, ymm5, [ps_cos_vec], ymm6
> +
> +    vmovaps     ymm2, [inq+64]
> +    vinsertf128 ymm6, ymm6, [inq+32], 1
> +    vinsertf128 ymm6, ymm6, [inq+48], 0
> +    vshufps     ymm6, ymm6, ymm6, 0x1b
> +    BUTTERFLY   ymm2, ymm6, [ps_cos_vec+32], ymm0
> +
> +    ; pass 2
> +
> +    BUTTERFLY  ymm5, ymm6, [ps_cos_vec+64], ymm0
> +    BUTTERFLY  ymm4, ymm2, [ps_cos_vec+64], ymm7
> +
> +
> +    ; pass 3
> +    vperm2f128  ymm3, ymm6, ymm4, 0x31
> +    vperm2f128  ymm1, ymm6, ymm4, 0x20
> +    vshufps     ymm3, ymm3, ymm3, 0x1b
> +
> +    BUTTERFLY   ymm1, ymm3, [ps_cos_vec+96], ymm6
> +
> +
> +    vperm2f128  ymm4, ymm5, ymm2, 0x20
> +    vperm2f128  ymm5, ymm5, ymm2, 0x31
> +    vshufps     ymm5, ymm5, ymm5, 0x1b
> +
> +    BUTTERFLY   ymm4, ymm5, [ps_cos_vec+96], ymm6
> +
> +    ; pass 4
> +    vmovaps ymm6, [ps_p1p1m1m1+0]
> +    vmovaps ymm2, [ps_cos_vec+128]
> +
> +    BUTTERFLY2  ymm5, ymm6, ymm2, ymm7
> +    BUTTERFLY2  ymm4, ymm6, ymm2, ymm7
> +    BUTTERFLY2  ymm1, ymm6, ymm2, ymm7
> +    BUTTERFLY2  ymm3, ymm6, ymm2, ymm7
> +
> +
> +    ; pass 5
> +    vshufps ymm6, ymm6, ymm6, 0xcc
> +    vmovaps ymm2, [ps_cos_vec+160]
> +
> +    BUTTERFLY3  ymm5, ymm6, ymm2, ymm7
> +    BUTTERFLY3  ymm4, ymm6, ymm2, ymm7
> +    BUTTERFLY3  ymm1, ymm6, ymm2, ymm7
> +    BUTTERFLY3  ymm3, ymm6, ymm2, ymm7
> +
> +    vextractf128  xmm6, ymm3, 1
> +    vmovaps [outq], ymm3
> +
> +    vextractf128  [outq+64], ymm5, 1
> +    vextractf128  [outq+32], ymm5, 0
> +
> +    vextractf128  [outq+80], ymm4, 1
> +    vextractf128  [outq+48], ymm4, 0
> +
> +    vextractf128  xmm0, ymm1, 1
> +    vmovaps [outq+96], ymm1
> +
> +    vzeroupper
> +
> +    ;    pass 6, no SIMD...
> +    PASS6_AND_PERMUTE
> +    REP_RET

Same as Ronald's comment from before: just use RET here instead of REP_RET.

> +
> +%define BUTTERFLY  BUTTERFLY_SSE
> +%define BUTTERFLY0 BUTTERFLY0_SSE
> +
>  cglobal dct32_float_sse, 2,3,8, out, in, tmp
>      ; pass 1
>
> @@ -72,8 +274,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
>      movaps      xmm7, [inq+64]
>      movaps      xmm4, [inq+48]
>      shufps      xmm4, xmm4, 0x1b
> -    BUTTERFLY   xmm7, xmm4, [ps_cos_vec+48], xmm3
> -
> +    BUTTERFLY   xmm7, xmm4, [ps_cos_vec+32], xmm3
>
>      ; pass 2
>      movaps      xmm2, [ps_cos_vec+64]
> @@ -90,7 +291,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
>      movaps      xmm4, [inq+80]
>      movaps      xmm5, [inq+32]
>      shufps      xmm5, xmm5, 0x1b
> -    BUTTERFLY   xmm4, xmm5, [ps_cos_vec+32], xmm3
> +    BUTTERFLY   xmm4, xmm5, [ps_cos_vec+48], xmm3
>
>      ; pass 2
>      BUTTERFLY   xmm0, xmm7, xmm2, xmm3
> @@ -121,7 +322,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
>
>      ; pass 4
>      movaps      xmm3, [ps_p1p1m1m1+0]
> -    movaps      xmm2, [ps_cos_vec+112]
> +    movaps      xmm2, [ps_cos_vec+128]
>
>      BUTTERFLY2  xmm5, xmm3, xmm2, xmm1
>
> @@ -146,7 +347,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
>      BUTTERFLY2  xmm0, xmm3, xmm2, xmm1
>
>      ; pass 5
> -    movaps      xmm2, [ps_cos_vec+128]
> +    movaps      xmm2, [ps_cos_vec+160]
>      shufps      xmm3, xmm3, 0xcc
>
>      BUTTERFLY3  xmm5, xmm3, xmm2, xmm1
> @@ -177,110 +378,5 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
>      movaps      [outq+112], xmm0
>
>      ;    pass 6, no SIMD...
> -    mov   tmpq, [outd+4]
> -    movss xmm7, [outd+72]
> -    addss xmm7, [outq+76]
> -    movss xmm3, [outq+56]
> -    addss xmm3, [outq+60]
> -    addss xmm4, xmm3
> -    movss xmm2, [outq+52]
> -    addss xmm2, xmm3
> -    movss xmm3, [outq+104]
> -    addss xmm3, [outq+108]
> -    addss xmm1, xmm3
> -    addss xmm5, xmm4
> -    movss [outq+16], xmm1
> -    movss xmm1, [outq+100]
> -    addss xmm1, xmm3
> -    movss xmm3, [outq+40]
> -    movss [outq+48], xmm1
> -    addss xmm3, [outq+44]
> -    movss xmm1, [outq+100]
> -    addss xmm4, xmm3
> -    addss xmm3, xmm2
> -    addss xmm1, [outq+108]
> -    movss [outq+40], xmm3
> -    addss xmm2, [outq+36]
> -    movss xmm3, [outq+8]
> -    movss [outq+56], xmm2
> -    addss xmm3, [outq+12]
> -    movss [outq+32], xmm3
> -    movss xmm3, [outq+80]
> -    movss [outq+8], xmm5
> -    movss [outq+80], xmm1
> -    movss xmm2, [outq+52]
> -    movss xmm5, [outq+120]
> -    addss xmm5, [outq+124]
> -    movss xmm1, [outq+64]
> -    addss xmm2, [outq+60]
> -    addss xmm0, xmm5
> -    addss xmm5, [outq+116]
> -    mov  [outq+64], tmpq
> -    addss xmm6, xmm0
> -    addss xmm1, xmm6
> -    mov  tmpq, [outq+12]
> -    mov  [outq+96], tmpq
> -    movss [outq+4], xmm1
> -    movss xmm1, [outq+24]
> -    movss [outq+24], xmm4
> -    movss xmm4, [outq+88]
> -    addss xmm4, [outq+92]
> -    addss xmm3, xmm4
> -    addss xmm4, [outq+84]
> -    mov  tmpq, [outq+108]
> -    addss xmm1, [outq+28]
> -    addss xmm0, xmm1
> -    addss xmm1, xmm5
> -    addss xmm6, xmm3
> -    addss xmm3, xmm0
> -    addss xmm0, xmm7
> -    addss xmm5, [outq+20]
> -    addss xmm7, xmm1
> -    movss [outq+12], xmm6
> -    mov  [outq+112], tmpq
> -    movss xmm6, [outq+28]
> -    movss [outq+28], xmm0
> -    movss xmm0, [outq+36]
> -    movss [outq+36], xmm7
> -    addss xmm1, xmm4
> -    movss xmm7, [outq+116]
> -    addss xmm0, xmm2
> -    addss xmm7, [outq+124]
> -    movss [outq+72], xmm0
> -    movss xmm0, [outq+44]
> -    addss xmm2, xmm0
> -    movss [outq+44], xmm1
> -    movss [outq+88], xmm2
> -    addss xmm0, [outq+60]
> -    mov   tmpq, [outq+60]
> -    mov   [outq+120], tmpq
> -    movss [outq+104], xmm0
> -    addss xmm4, xmm5
> -    addss xmm5, [outq+68]
> -    movss [outq+52], xmm4
> -    movss [outq+60], xmm5
> -    movss xmm4, [outq+68]
> -    movss xmm5, [outq+20]
> -    movss [outq+20], xmm3
> -    addss xmm5, xmm7
> -    addss xmm7, xmm6
> -    addss xmm4, xmm5
> -    movss xmm2, [outq+84]
> -    addss xmm2, [outq+92]
> -    addss xmm5, xmm2
> -    movss [outq+68], xmm4
> -    addss xmm2, xmm7
> -    movss xmm4, [outq+76]
> -    movss [outq+84], xmm2
> -    movss [outq+76], xmm5
> -    addss xmm7, xmm4
> -    addss xmm6, [outq+124]
> -    addss xmm4, xmm6
> -    addss xmm6, [outq+92]
> -    movss [outq+100], xmm4
> -    movss [outq+108], xmm6
> -    movss xmm6, [outq+92]
> -    movss [outq+92], xmm7
> -    addss xmm6, [outq+124]
> -    movss [outq+116], xmm6
> +    PASS6_AND_PERMUTE
>      REP_RET

Same as above: just use RET instead of REP_RET.

> diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
> index b29412c..8eef421 100644
> --- a/libavcodec/x86/fft.c
> +++ b/libavcodec/x86/fft.c
> @@ -57,7 +57,9 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
>  av_cold void ff_dct_init_mmx(DCTContext *s)
>  {
>      int has_vectors = av_get_cpu_flags();
> -    if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
> +    if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX)
> +        s->dct32 = ff_dct32_float_avx;
> +    else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
>          s->dct32 = ff_dct32_float_sse;
>  }
>  #endif
> diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
> index e6eace2..c714185 100644
> --- a/libavcodec/x86/fft.h
> +++ b/libavcodec/x86/fft.h
> @@ -35,5 +35,6 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
>  void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
>  void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
>  void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
> +void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
>
>  #endif
> --
> 1.7.4.1

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 3/3] dct32: Add AVX implementation of 32-point DCT

Reply via email to