On 2014-02-06 16:21:49 +0100, Diego Biurrun wrote: > On Thu, Feb 06, 2014 at 12:40:51AM +0000, Christophe Gisquet wrote: > > --- /dev/null > > +++ b/libavcodec/x86/dca.h > > @@ -0,0 +1,56 @@ > > +/* > > + * Copyright (c) 2012 Christophe Gisquet <christophe.gisq...@gmail.com> > > Happy new year? > > > +#if HAVE_SSE2_INLINE > > +# include "libavutil/x86/asm.h" > > +# include "libavutil/mem.h" > > + > > +#undef int8x8_fmul_int32 > > +static inline void int8x8_fmul_int32(float *dst, const int8_t *src, int > > scale) > > +{ > > + DECLARE_ALIGNED(16, static const uint32_t, inverse16) = 0x3D800000; > > + __asm__ volatile ( > > + "cvtsi2ss %2, %%xmm0 \n\t" > > + "mulss %3, %%xmm0 \n\t" > > +# if HAVE_SSE4_INLINE > > + "pmovsxbd 0(%1), %%xmm1 \n\t" > > + "pmovsxbd 4(%1), %%xmm2 \n\t" > > +# else > > + "movq (%1), %%xmm1 \n\t" > > + "punpcklbw %%xmm1, %%xmm1 \n\t" > > + "movaps %%xmm1, %%xmm2 \n\t" > > + "punpcklwd %%xmm1, %%xmm1 \n\t" > > + "punpckhwd %%xmm2, %%xmm2 \n\t" > > + "psrad $24, %%xmm1 \n\t" > > + "psrad $24, %%xmm2 \n\t" > > +# endif > > + "shufps $0, %%xmm0, %%xmm0 \n\t" > > + "cvtdq2ps %%xmm1, %%xmm1 \n\t" > > + "cvtdq2ps %%xmm2, %%xmm2 \n\t" > > + "mulps %%xmm0, %%xmm1 \n\t" > > + "mulps %%xmm0, %%xmm2 \n\t" > > + "movaps %%xmm1, 0(%0) \n\t" > > + "movaps %%xmm2, 16(%0) \n\t" > > + :: "r"(dst), "r"(src), "m"(scale), "m"(inverse16) > > + XMM_CLOBBERS_ONLY("xmm0", "xmm1", "xmm2") > > + ); > > +} > > +#define int8x8_fmul_int32(dsp) int8x8_fmul_int32 > > +#endif /* HAVE_SSE2_INLINE */ > > Does this have to be inline assembly?
The function is very short, so the function call overhead becomes significant: 34 vs. 39 cycles on a Cortex-A9, i.e. the inline version is over 10% faster. Janne _______________________________________________ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel