On 04/09/2013 06:57 PM, Christophe Gisquet wrote:
> From 253 to 51 cycles on Arrandale and Win64.
> 44 cycles on SandyBridge.
> ---
> libavcodec/x86/sbrdsp.asm | 30 ++++++++++++++++++++++++++++++
> libavcodec/x86/sbrdsp_init.c | 5 +++++
> 2 files changed, 35 insertions(+)
>
> diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
> index 5e54505..5c21b11 100644
> --- a/libavcodec/x86/sbrdsp.asm
> +++ b/libavcodec/x86/sbrdsp.asm
> @@ -245,3 +245,33 @@ cglobal sbr_neg_odd_64, 1,2,4,z
> cmp zq, r1q
> jne .loop
> REP_RET
> +
> +INIT_XMM sse2
> +cglobal sbr_qmf_pre_shuffle, 1,4,7,z
> +%define OFFSET (32*4-2*mmsize)
> + mov r3q, OFFSET
> + lea r1q, [zq + (32+1)*4]
> + lea r2q, [zq + 64*4]
> + mova m6, [ps_neg]
> +.loop:
> + movu m0, [r1q]
> + movu m2, [r1q + mmsize]
> + movu m1, [zq + r3q + 4 + mmsize]
> + movu m3, [zq + r3q + 4]
> +
> + pxor m2, m6
> + pxor m0, m6
> + pshufd m2, m2, q0123
> + pshufd m0, m0, q0123
> + SBUTTERFLY dq, 2, 3, 5
> + SBUTTERFLY dq, 0, 1, 4
> + mova [r2q + 2*r3q + 0*mmsize], m2
> + mova [r2q + 2*r3q + 1*mmsize], m3
> + mova [r2q + 2*r3q + 2*mmsize], m0
> + mova [r2q + 2*r3q + 3*mmsize], m1
> + add r1q, 2*mmsize
> + sub r3q, 2*mmsize
> + jge .loop
> + mova m2, [zq]
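Why mova instead of movq? The mova above loads a full 16 bytes, but the movq below stores only the low 64 bits of m2, so a 64-bit load would be enough.
> + movq [r2q], m2
> + REP_RET

Otherwise looks ok.

-Justin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel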
