On 05/16/2011 04:47 AM, Daniel Kang wrote:
On Sun, May 15, 2011 at 6:49 PM, Daniel Kang <[email protected]> wrote:
On Sun, May 15, 2011 at 4:21 PM, Vitor Sessak <[email protected]> wrote:
Easy to debug with an actual machine :-).
Revised patch attached (only the third patch needed changes), all FATE
tests pass.
Benchmarks:
SSE:
time: 0.0 us/transform [total time=1.53 s its=33554432]
time: 0.0 us/transform [total time=1.53 s its=33554432]
time: 0.0 us/transform [total time=1.53 s its=33554432]
AVX:
time: 0.0 us/transform [total time=1.40 s its=33554432]
time: 0.0 us/transform [total time=1.40 s its=33554432]
time: 0.0 us/transform [total time=1.40 s its=33554432]
nit: can I have the START/STOP_TIMER numbers? This is not important.
Also, how does the SSE time compare to before? Is there a slowdown?
All times are the best from 5 runs:
Atom, unpatched: 5386 dezicycles in dct32, 2096842 runs, 316 skips
Atom, patched: 5362 dezicycles in dct32, 2096842 runs, 310 skips
SB, AVX: 1059 dezicycles in dct32, 33549877 runs, 4555 skips
SB, SSE patched: 1203 dezicycles in dct32, 33549151 runs, 5281 skips
SB, SSE unpatched: 1199 dezicycles in dct32, 33549046 runs, 5386 skips
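For reference, these numbers come from libavutil's START_TIMER/STOP_TIMER
macros. A minimal sketch of how one such measurement is taken (the wrapper
function here is illustrative, not part of the patch):

    #include "libavutil/timer.h"   /* START_TIMER / STOP_TIMER */
    #include "libavcodec/dct.h"    /* DCTContext */

    /* Time one dct32 call; STOP_TIMER prints a line of the form
     * "5362 dezicycles in dct32, 2096842 runs, 310 skips". */
    static void time_dct32(DCTContext *s, FFTSample *out, const FFTSample *in)
    {
        START_TIMER
        s->dct32(out, in);
        STOP_TIMER("dct32")
    }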
> +%macro PASS6_AND_PERMUTE 0
> + mov tmpd, [outq+4]
> + movss xmm7, [outq+72]
> + addss xmm7, [outq+76]
> + movss xmm3, [outq+56]
> + addss xmm3, [outq+60]
> + addss xmm4, xmm3
> + movss xmm2, [outq+52]
> + addss xmm2, xmm3
> + movss xmm3, [outq+104]
> + addss xmm3, [outq+108]
> + addss xmm1, xmm3
> + addss xmm5, xmm4
> + movss [outq+16], xmm1
> + movss xmm1, [outq+100]
> + addss xmm1, xmm3
> + movss xmm3, [outq+40]
> + movss [outq+48], xmm1
> + addss xmm3, [outq+44]
> + movss xmm1, [outq+100]
> + addss xmm4, xmm3
> + addss xmm3, xmm2
> + addss xmm1, [outq+108]
> + movss [outq+40], xmm3
> + addss xmm2, [outq+36]
> + movss xmm3, [outq+8]
> + movss [outq+56], xmm2
> + addss xmm3, [outq+12]
> + movss [outq+32], xmm3
> + movss xmm3, [outq+80]
> + movss [outq+8], xmm5
> + movss [outq+80], xmm1
> + movss xmm2, [outq+52]
> + movss xmm5, [outq+120]
> + addss xmm5, [outq+124]
> + movss xmm1, [outq+64]
> + addss xmm2, [outq+60]
> + addss xmm0, xmm5
> + addss xmm5, [outq+116]
> + mov [outq+64], tmpd
> + addss xmm6, xmm0
> + addss xmm1, xmm6
> + mov tmpd, [outq+12]
> + mov [outq+96], tmpd
> + movss [outq+4], xmm1
> + movss xmm1, [outq+24]
> + movss [outq+24], xmm4
> + movss xmm4, [outq+88]
> + addss xmm4, [outq+92]
> + addss xmm3, xmm4
> + addss xmm4, [outq+84]
> + mov tmpd, [outq+108]
> + addss xmm1, [outq+28]
> + addss xmm0, xmm1
> + addss xmm1, xmm5
> + addss xmm6, xmm3
> + addss xmm3, xmm0
> + addss xmm0, xmm7
> + addss xmm5, [outq+20]
> + addss xmm7, xmm1
> + movss [outq+12], xmm6
> + mov [outq+112], tmpd
> + movss xmm6, [outq+28]
> + movss [outq+28], xmm0
> + movss xmm0, [outq+36]
> + movss [outq+36], xmm7
> + addss xmm1, xmm4
> + movss xmm7, [outq+116]
> + addss xmm0, xmm2
> + addss xmm7, [outq+124]
> + movss [outq+72], xmm0
> + movss xmm0, [outq+44]
> + addss xmm2, xmm0
> + movss [outq+44], xmm1
> + movss [outq+88], xmm2
> + addss xmm0, [outq+60]
> + mov tmpd, [outq+60]
> + mov [outq+120], tmpd
> + movss [outq+104], xmm0
> + addss xmm4, xmm5
> + addss xmm5, [outq+68]
> + movss [outq+52], xmm4
> + movss [outq+60], xmm5
> + movss xmm4, [outq+68]
> + movss xmm5, [outq+20]
> + movss [outq+20], xmm3
> + addss xmm5, xmm7
> + addss xmm7, xmm6
> + addss xmm4, xmm5
> + movss xmm2, [outq+84]
> + addss xmm2, [outq+92]
> + addss xmm5, xmm2
> + movss [outq+68], xmm4
> + addss xmm2, xmm7
> + movss xmm4, [outq+76]
> + movss [outq+84], xmm2
> + movss [outq+76], xmm5
> + addss xmm7, xmm4
> + addss xmm6, [outq+124]
> + addss xmm4, xmm6
> + addss xmm6, [outq+92]
> + movss [outq+100], xmm4
> + movss [outq+108], xmm6
> + movss xmm6, [outq+92]
> + movss [outq+92], xmm7
> + addss xmm6, [outq+124]
> + movss [outq+116], xmm6
> +%endmacro
Could this be SIMD? It looks horrific, so if the answer is no, I'll go with
that.
Unfortunately, I didn't find a good way. If you wonder what this looks
like in C, just look at dct32.c.
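To give an idea of the shape, pass 6 is a long chain of single-float adds
at irregular offsets, roughly like this sketch (the offsets here are made
up; the real code is in libavcodec/dct32.c):

    /* Illustrative only: every output mixes single floats at irregular
     * strides, so there are no contiguous lanes for SIMD to work on,
     * hence the long scalar movss/addss chain above. */
    static void pass6_shape(float *out)
    {
        out[ 4] += out[ 6];
        out[16]  = out[26] + out[27];
        out[12]  = out[13] + out[14] + out[15];
        /* ... ~30 more adds in the same irregular pattern ... */
    }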
> +
> +%define BUTTERFLY BUTTERFLY_AVX
> +%define BUTTERFLY0 BUTTERFLY0_AVX
> +
> section .text align=16
> +cglobal dct32_float_avx, 2,3,8, out, in, tmp
nit: add a function header.
done
Not a nit: can you get this down to only using 7 registers? A quick glance
suggests yes. This is better on Windows.
No, PASS6_AND_PERMUTE needs all 8 registers.
> + vzeroupper
> +
> + ; pass 6, no SIMD...
> + PASS6_AND_PERMUTE
> + REP_RET
Same as Ronald's comment from before: REP_RET is only needed when the ret is
a branch target (it works around an AMD branch predictor quirk), which is not
the case here. Just RET.
done
> +%define BUTTERFLY BUTTERFLY_SSE
> +%define BUTTERFLY0 BUTTERFLY0_SSE
> +
> cglobal dct32_float_sse, 2,3,8, out, in, tmp
> ; pass 1
>
> @@ -72,8 +274,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
> movaps xmm7, [inq+64]
> movaps xmm4, [inq+48]
> shufps xmm4, xmm4, 0x1b
> - BUTTERFLY xmm7, xmm4, [ps_cos_vec+48], xmm3
> -
> + BUTTERFLY xmm7, xmm4, [ps_cos_vec+32], xmm3
>
> ; pass 2
> movaps xmm2, [ps_cos_vec+64]
> @@ -90,7 +291,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
> movaps xmm4, [inq+80]
> movaps xmm5, [inq+32]
> shufps xmm5, xmm5, 0x1b
> - BUTTERFLY xmm4, xmm5, [ps_cos_vec+32], xmm3
> + BUTTERFLY xmm4, xmm5, [ps_cos_vec+48], xmm3
>
> ; pass 2
> BUTTERFLY xmm0, xmm7, xmm2, xmm3
> @@ -121,7 +322,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
>
> ; pass 4
> movaps xmm3, [ps_p1p1m1m1+0]
> - movaps xmm2, [ps_cos_vec+112]
> + movaps xmm2, [ps_cos_vec+128]
>
> BUTTERFLY2 xmm5, xmm3, xmm2, xmm1
>
> @@ -146,7 +347,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
> BUTTERFLY2 xmm0, xmm3, xmm2, xmm1
>
> ; pass 5
> - movaps xmm2, [ps_cos_vec+128]
> + movaps xmm2, [ps_cos_vec+160]
> shufps xmm3, xmm3, 0xcc
>
> BUTTERFLY3 xmm5, xmm3, xmm2, xmm1
> @@ -177,110 +378,5 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
> movaps [outq+112], xmm0
>
> ; pass 6, no SIMD...
> - mov tmpd, [outq+4]
> - movss xmm7, [outq+72]
> - addss xmm7, [outq+76]
> - movss xmm3, [outq+56]
> - addss xmm3, [outq+60]
> - addss xmm4, xmm3
> - movss xmm2, [outq+52]
> - addss xmm2, xmm3
> - movss xmm3, [outq+104]
> - addss xmm3, [outq+108]
> - addss xmm1, xmm3
> - addss xmm5, xmm4
> - movss [outq+16], xmm1
> - movss xmm1, [outq+100]
> - addss xmm1, xmm3
> - movss xmm3, [outq+40]
> - movss [outq+48], xmm1
> - addss xmm3, [outq+44]
> - movss xmm1, [outq+100]
> - addss xmm4, xmm3
> - addss xmm3, xmm2
> - addss xmm1, [outq+108]
> - movss [outq+40], xmm3
> - addss xmm2, [outq+36]
> - movss xmm3, [outq+8]
> - movss [outq+56], xmm2
> - addss xmm3, [outq+12]
> - movss [outq+32], xmm3
> - movss xmm3, [outq+80]
> - movss [outq+8], xmm5
> - movss [outq+80], xmm1
> - movss xmm2, [outq+52]
> - movss xmm5, [outq+120]
> - addss xmm5, [outq+124]
> - movss xmm1, [outq+64]
> - addss xmm2, [outq+60]
> - addss xmm0, xmm5
> - addss xmm5, [outq+116]
> - mov [outq+64], tmpd
> - addss xmm6, xmm0
> - addss xmm1, xmm6
> - mov tmpd, [outq+12]
> - mov [outq+96], tmpd
> - movss [outq+4], xmm1
> - movss xmm1, [outq+24]
> - movss [outq+24], xmm4
> - movss xmm4, [outq+88]
> - addss xmm4, [outq+92]
> - addss xmm3, xmm4
> - addss xmm4, [outq+84]
> - mov tmpd, [outq+108]
> - addss xmm1, [outq+28]
> - addss xmm0, xmm1
> - addss xmm1, xmm5
> - addss xmm6, xmm3
> - addss xmm3, xmm0
> - addss xmm0, xmm7
> - addss xmm5, [outq+20]
> - addss xmm7, xmm1
> - movss [outq+12], xmm6
> - mov [outq+112], tmpd
> - movss xmm6, [outq+28]
> - movss [outq+28], xmm0
> - movss xmm0, [outq+36]
> - movss [outq+36], xmm7
> - addss xmm1, xmm4
> - movss xmm7, [outq+116]
> - addss xmm0, xmm2
> - addss xmm7, [outq+124]
> - movss [outq+72], xmm0
> - movss xmm0, [outq+44]
> - addss xmm2, xmm0
> - movss [outq+44], xmm1
> - movss [outq+88], xmm2
> - addss xmm0, [outq+60]
> - mov tmpd, [outq+60]
> - mov [outq+120], tmpd
> - movss [outq+104], xmm0
> - addss xmm4, xmm5
> - addss xmm5, [outq+68]
> - movss [outq+52], xmm4
> - movss [outq+60], xmm5
> - movss xmm4, [outq+68]
> - movss xmm5, [outq+20]
> - movss [outq+20], xmm3
> - addss xmm5, xmm7
> - addss xmm7, xmm6
> - addss xmm4, xmm5
> - movss xmm2, [outq+84]
> - addss xmm2, [outq+92]
> - addss xmm5, xmm2
> - movss [outq+68], xmm4
> - addss xmm2, xmm7
> - movss xmm4, [outq+76]
> - movss [outq+84], xmm2
> - movss [outq+76], xmm5
> - addss xmm7, xmm4
> - addss xmm6, [outq+124]
> - addss xmm4, xmm6
> - addss xmm6, [outq+92]
> - movss [outq+100], xmm4
> - movss [outq+108], xmm6
> - movss xmm6, [outq+92]
> - movss [outq+92], xmm7
> - addss xmm6, [outq+124]
> - movss [outq+116], xmm6
> + PASS6_AND_PERMUTE
> REP_RET
Same as above.
> diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
> index b29412c..8eef421 100644
> --- a/libavcodec/x86/fft.c
> +++ b/libavcodec/x86/fft.c
> @@ -57,7 +57,9 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
> av_cold void ff_dct_init_mmx(DCTContext *s)
> {
> int has_vectors = av_get_cpu_flags();
> - if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
> + if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX)
> + s->dct32 = ff_dct32_float_avx;
> + else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
> s->dct32 = ff_dct32_float_sse;
> }
> #endif
> diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
> index e6eace2..c714185 100644
> --- a/libavcodec/x86/fft.h
> +++ b/libavcodec/x86/fft.h
> @@ -35,5 +35,6 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
> void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
> void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
> void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
> +void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
>
> #endif
> --
> 1.7.4.1
One more thing that came up as I was talking with Diego: AVX can be disabled by
the user. Put the AVX code under %ifdef HAVE_AVX and #ifdef HAVE_AVX, or
whatever the flags were in libav.
done also.
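For reference, a sketch of the guard on the C side (in libav, configure
writes HAVE_AVX into config.h as 0/1, so it ends up as #if rather than
#ifdef, and the asm side gets an equivalent %if HAVE_AVX around the AVX
function; treat the exact macro spelling here as an assumption):

    #include "config.h"              /* HAVE_AVX, HAVE_SSE (0/1) */
    #include "libavutil/cpu.h"       /* av_get_cpu_flags() */
    #include "libavcodec/dct.h"      /* DCTContext */
    #include "libavcodec/x86/fft.h"  /* ff_dct32_float_{sse,avx} */

    /* Sketch of the guarded dispatch; mirrors the fft.c hunk above. */
    static void dct32_select(DCTContext *s)
    {
        int flags = av_get_cpu_flags();
    #if HAVE_AVX
        if (flags & AV_CPU_FLAG_AVX) {
            s->dct32 = ff_dct32_float_avx;
            return;
        }
    #endif
    #if HAVE_SSE
        if (flags & AV_CPU_FLAG_SSE)
            s->dct32 = ff_dct32_float_sse;
    #endif
    }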
Indentation suggestion from Ronald also taken into consideration.
-Vitor