On Sat, May 21, 2011 at 19:19:14 (CEST), Loren Merritt wrote: > Patch updated to exclude the x86_32 part that didn't help. > > --Loren Merritt > > commit 34b691139b9bee40998ba91465f30611b8ea1589 > Author: Loren Merritt <[email protected]> > Date: 2011-05-21 16:42:47 +0000 > > dct32_sse: eliminate some spills > 125->104 cycles on penryn (x86_64 only) > > diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm > index 2e1176c..bafe002 100644 > --- a/libavcodec/x86/dct32_sse.asm > +++ b/libavcodec/x86/dct32_sse.asm > @@ -20,7 +20,7 @@ > > ;****************************************************************************** > > %include "x86inc.asm" > -%include "config.asm" > +%include "x86util.asm" > > SECTION_RODATA 32 > > @@ -37,8 +37,9 @@ ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 > dd 1.000000, 1.000000, 1.306563, 0.541196 > dd 1.000000, 0.707107, 1.000000, -0.707107 > dd 1.000000, 0.707107, 1.000000, -0.707107 > + dd 0.707107, 0.707107, 0.707107, 0.707107 > > - > +align 32 > ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 > > %macro BUTTERFLY_SSE 4 > @@ -77,6 +78,18 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, > 0x80000000, 0x80000000 > BUTTERFLY0 %1, %2, %3, %4, 0xb1 > %endmacro > > +%macro BUTTERFLY3V 5 > + movaps m%5, m%1 > + addps m%1, m%2 > + subps m%5, m%2 > + SWAP %2, %5 > + mulps m%2, [ps_cos_vec+192] > + movaps m%5, m%3 > + addps m%3, m%4 > + subps m%4, m%5 > + mulps m%4, [ps_cos_vec+192] > +%endmacro > + > %macro PASS6_AND_PERMUTE 0 > mov tmpd, [outq+4] > movss m7, [outq+72] > @@ -269,9 +282,131 @@ INIT_XMM > %define BUTTERFLY BUTTERFLY_SSE > %define BUTTERFLY0 BUTTERFLY0_SSE > > +%ifdef ARCH_X86_64 > +%define SPILL SWAP > +%define UNSPILL SWAP > + > +%macro PASS5 0 > + nop ; FIXME code alignment > + SWAP 5, 8 > + SWAP 4, 12 > + SWAP 6, 14 > + SWAP 7, 13 > + SWAP 0, 15 > + PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13 > + TRANSPOSE4x4PS 8, 9, 10, 11, 0 > + BUTTERFLY3V 8, 9, 10, 11, 0 > + addps 
m10, m11 > + TRANSPOSE4x4PS 12, 13, 14, 15, 0 > + BUTTERFLY3V 12, 13, 14, 15, 0 > + addps m14, m15 > + addps m12, m14 > + addps m14, m13 > + addps m13, m15 > +%endmacro > + > +%macro PASS6 0 > + SWAP 9, 12 > + SWAP 11, 14 > + movss [outq+0x00], m8 > + pshuflw m0, m8, 0xe > + movss [outq+0x10], m9 > + pshuflw m1, m9, 0xe > + movss [outq+0x20], m10 > + pshuflw m2, m10, 0xe > + movss [outq+0x30], m11 > + pshuflw m3, m11, 0xe > + movss [outq+0x40], m12 > + pshuflw m4, m12, 0xe > + movss [outq+0x50], m13 > + pshuflw m5, m13, 0xe > + movss [outq+0x60], m14 > + pshuflw m6, m14, 0xe > + movaps [outq+0x70], m15 > + pshuflw m7, m15, 0xe > + addss m0, m1 > + addss m1, m2 > + movss [outq+0x08], m0 > + addss m2, m3 > + movss [outq+0x18], m1 > + addss m3, m4 > + movss [outq+0x28], m2 > + addss m4, m5 > + movss [outq+0x38], m3 > + addss m5, m6 > + movss [outq+0x48], m4 > + addss m6, m7 > + movss [outq+0x58], m5 > + movss [outq+0x68], m6 > + movss [outq+0x78], m7 > + > + PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7 > + movhlps m0, m1 > + pshufd m1, m1, 3 > + SWAP 0, 2, 4, 6, 8, 10, 12, 14 > + SWAP 1, 3, 5, 7, 9, 11, 13, 15 > +%rep 7 > + movhlps m0, m1 > + pshufd m1, m1, 3 > + addss m15, m1 > + SWAP 0, 2, 4, 6, 8, 10, 12, 14 > + SWAP 1, 3, 5, 7, 9, 11, 13, 15 > +%endrep > +%assign i 4 > +%rep 15 > + addss m0, m1 > + movss [outq+i], m0 > + SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 > + %assign i i+8 > +%endrep > +%endmacro > + > +%else ; ARCH_X86_32 > +%macro SPILL 2 ; xmm#, mempos > + movaps [outq+(%2-8)*16], m%1 > +%endmacro > +%macro UNSPILL 2 > + movaps m%1, [outq+(%2-8)*16] > +%endmacro > + > +%define PASS6 PASS6_AND_PERMUTE > +%macro PASS5 0 > + movaps m2, [ps_cos_vec+160] > + shufps m3, m3, 0xcc > + > + BUTTERFLY3 m5, m3, m2, m1 > + SPILL 5, 8 > + > + UNSPILL 1, 9 > + BUTTERFLY3 m1, m3, m2, m5 > + SPILL 1, 14 > + > + BUTTERFLY3 m4, m3, m2, m5 > + SPILL 4, 12 > + > + BUTTERFLY3 m7, m3, m2, m5 > + SPILL 7, 13 > + > + UNSPILL 5, 10 > + 
BUTTERFLY3 m5, m3, m2, m7 > + SPILL 5, 10 > + > + UNSPILL 4, 11 > + BUTTERFLY3 m4, m3, m2, m7 > + SPILL 4, 11 > + > + BUTTERFLY3 m6, m3, m2, m7 > + SPILL 6, 9 > + > + BUTTERFLY3 m0, m3, m2, m7 > + SPILL 0, 15 > +%endmacro > +%endif > + > + > INIT_XMM > ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) > -cglobal dct32_float_sse, 2,3,8, out, in, tmp > +cglobal dct32_float_sse, 2,3,16, out, in, tmp > ; pass 1 > > movaps m0, [inq+0] > @@ -287,8 +422,8 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp > ; pass 2 > movaps m2, [ps_cos_vec+64] > BUTTERFLY m1, m4, m2, m3 > - movaps [outq+48], m1 > - movaps [outq+ 0], m4 > + SPILL 1, 11 > + SPILL 4, 8 > > ; pass 1 > movaps m1, [inq+16] > @@ -313,17 +448,17 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp > movaps m2, [ps_cos_vec+96] > shufps m1, m1, 0x1b > BUTTERFLY m0, m1, m2, m3 > - movaps [outq+112], m0 > - movaps [outq+ 96], m1 > + SPILL 0, 15 > + SPILL 1, 14 > > - movaps m0, [outq+0] > + UNSPILL 0, 8 > shufps m5, m5, 0x1b > BUTTERFLY m0, m5, m2, m3 > > - movaps m1, [outq+48] > + UNSPILL 1, 11 > shufps m6, m6, 0x1b > BUTTERFLY m1, m6, m2, m3 > - movaps [outq+48], m1 > + SPILL 1, 11 > > shufps m4, m4, 0x1b > BUTTERFLY m7, m4, m2, m3 > @@ -335,57 +470,25 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp > BUTTERFLY2 m5, m3, m2, m1 > > BUTTERFLY2 m0, m3, m2, m1 > - movaps [outq+16], m0 > + SPILL 0, 9 > > BUTTERFLY2 m6, m3, m2, m1 > - movaps [outq+32], m6 > + SPILL 6, 10 > > - movaps m0, [outq+48] > + UNSPILL 0, 11 > BUTTERFLY2 m0, m3, m2, m1 > - movaps [outq+48], m0 > + SPILL 0, 11 > > BUTTERFLY2 m4, m3, m2, m1 > > BUTTERFLY2 m7, m3, m2, m1 > > - movaps m6, [outq+96] > + UNSPILL 6, 14 > BUTTERFLY2 m6, m3, m2, m1 > > - movaps m0, [outq+112] > + UNSPILL 0, 15 > BUTTERFLY2 m0, m3, m2, m1 > > - ; pass 5 > - movaps m2, [ps_cos_vec+160] > - shufps m3, m3, 0xcc > - > - BUTTERFLY3 m5, m3, m2, m1 > - movaps [outq+0], m5 > - > - movaps m1, [outq+16] > - BUTTERFLY3 m1, m3, m2, m5 > - movaps [outq+96], m1 > - > - BUTTERFLY3 
m4, m3, m2, m5 > - movaps [outq+64], m4 > - > - BUTTERFLY3 m7, m3, m2, m5 > - movaps [outq+80], m7 > - > - movaps m5, [outq+32] > - BUTTERFLY3 m5, m3, m2, m7 > - movaps [outq+32], m5 > - > - movaps m4, [outq+48] > - BUTTERFLY3 m4, m3, m2, m7 > - movaps [outq+48], m4 > - > - BUTTERFLY3 m6, m3, m2, m7 > - movaps [outq+16], m6 > - > - BUTTERFLY3 m0, m3, m2, m7 > - movaps [outq+112], m0 > - > - > - ; pass 6, no SIMD... > - PASS6_AND_PERMUTE > + PASS5 > + PASS6 > RET > diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm > index 13d6cc0..efab87d 100644 > --- a/libavcodec/x86/fmtconvert.asm > +++ b/libavcodec/x86/fmtconvert.asm > @@ -95,13 +95,6 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2 > ; void ff_float_interleave6(float *dst, const float **src, unsigned int len); > > ;----------------------------------------------------------------------------- > > -%macro BUTTERFLYPS 3 > - movaps m%3, m%1 > - unpcklps m%1, m%2 > - unpckhps m%3, m%2 > - SWAP %2, %3 > -%endmacro > - > %macro FLOAT_INTERLEAVE6 2 > cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5 > %ifdef ARCH_X86_64 > @@ -130,9 +123,9 @@ cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, > src2, src3, src4, src5 > movaps m4, [srcq+src4q] > movaps m5, [srcq+src5q] > > - BUTTERFLYPS 0, 1, 6 > - BUTTERFLYPS 2, 3, 6 > - BUTTERFLYPS 4, 5, 6 > + SBUTTERFLYPS 0, 1, 6 > + SBUTTERFLYPS 2, 3, 6 > + SBUTTERFLYPS 4, 5, 6 > > movaps m6, m4 > shufps m4, m0, 0xe4 > diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm > index 7bd985a..141e960 100644 > --- a/libavcodec/x86/x86util.asm > +++ b/libavcodec/x86/x86util.asm > @@ -41,6 +41,13 @@ > SWAP %2, %4, %3 > %endmacro > > +%macro SBUTTERFLYPS 3 > + movaps m%3, m%1 > + unpcklps m%1, m%2 > + unpckhps m%3, m%2 > + SWAP %2, %3 > +%endmacro > + > %macro TRANSPOSE4x4B 5 > SBUTTERFLY bw, %1, %2, %5 > SBUTTERFLY bw, %3, %4, %5 > @@ -74,6 +81,19 @@ > SWAP %2, %3 > %endmacro > > +; identical behavior to TRANSPOSE4x4D, but using SSE1 
float ops > +%macro TRANSPOSE4x4PS 5 > + SBUTTERFLYPS %1, %2, %5 > + SBUTTERFLYPS %3, %4, %5 > + movaps m%5, m%1 > + movlhps m%1, m%3 > + movhlps m%3, m%5 > + movaps m%5, m%2 > + movlhps m%2, m%4 > + movhlps m%4, m%5 > + SWAP %2, %3 > +%endmacro > + > %macro TRANSPOSE8x8W 9-11 > %ifdef ARCH_X86_64 > SBUTTERFLY wd, %1, %2, %9
The patch compiles and passes both 'make test' and 'make fate'. Queued.

-- 
Gruesse/greetings,
Reinhard Tartler, KeyID 945348A4
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel
