ffmpeg | branch: master | James Almer <jamr...@gmail.com> | Tue Sep 30 22:21:40 2014 -0300| [acebff8e5dc0789c228b10ffcae2f2eb6c30a91d] | committer: James Almer
x86/mpegvideoencdsp: improve ff_pix_sum16_sse2. The new code is ~15% faster. Also add an mmxext version that takes advantage of the new code, and build it, together with the mmx version, only on x86_32. Reviewed-by: Michael Niedermayer <michae...@gmx.at> Signed-off-by: James Almer <jamr...@gmail.com> > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=acebff8e5dc0789c228b10ffcae2f2eb6c30a91d --- libavcodec/x86/mpegvideoencdsp.asm | 51 ++++++++++++++++++++++----------- libavcodec/x86/mpegvideoencdsp_init.c | 7 +++++ 2 files changed, 41 insertions(+), 17 deletions(-) diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm index 4fe6cfe..aec73f8 100644 --- a/libavcodec/x86/mpegvideoencdsp.asm +++ b/libavcodec/x86/mpegvideoencdsp.asm @@ -29,16 +29,16 @@ cextern pw_1 SECTION .text ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size) -; %1 = number of xmm registers used -; %2 = number of loops -; %3 = number of GPRs used -%macro PIX_SUM16 4 -cglobal pix_sum16, 2, %3, %1 +; %1 = number of loops +; %2 = number of GPRs used +%macro PIX_SUM16 3 +cglobal pix_sum16, 2, %2, 6 movsxdifnidn r1, r1d - mov r2, %2 -%if cpuflag(xop) + mov r2, %1 +%if mmsize == 16 lea r3, [r1*3] -%else +%endif +%if notcpuflag(xop) pxor m5, m5 %endif pxor m4, m4 @@ -52,42 +52,59 @@ cglobal pix_sum16, 2, %3, %1 mova m0, [r0] %if mmsize == 8 mova m1, [r0+8] -%else +%if cpuflag(mmxext) + mova m2, [r0+r1] + mova m3, [r0+r1+8] +%endif +%else ; sse2 mova m1, [r0+r1] + mova m2, [r0+r1*2] + mova m3, [r0+r3] %endif +%if cpuflag(mmxext) + psadbw m0, m5 + psadbw m1, m5 + psadbw m2, m5 + psadbw m3, m5 +%else ; mmx punpckhbw m2, m0, m5 punpcklbw m0, m5 punpckhbw m3, m1, m5 punpcklbw m1, m5 +%endif ; cpuflag(mmxext) %endif ; cpuflag(xop) paddw m1, m0 paddw m3, m2 paddw m3, m1 paddw m4, m3 -%if mmsize == 8 - add r0, r1 +%if cpuflag(mmxext) + lea r0, [r0+r1*%3] %else - lea r0, [r0+r1*%4] + add r0, r1 %endif dec r2 jne .loop -%if cpuflag(xop) +%if mmsize == 16 pshufd m0, m4, q0032 paddd m4, m0 -%else
+%elif notcpuflag(mmxext) HADDW m4, m5 %endif movd eax, m4 RET %endmacro +%if ARCH_X86_32 INIT_MMX mmx -PIX_SUM16 0, 16, 3, 0 +PIX_SUM16 16, 3, 0 +INIT_MMX mmxext +PIX_SUM16 8, 4, 2 +%endif INIT_XMM sse2 -PIX_SUM16 6, 8, 3, 2 +PIX_SUM16 4, 4, 4 %if HAVE_XOP_EXTERNAL INIT_XMM xop -PIX_SUM16 5, 4, 4, 4 +PIX_SUM16 4, 4, 4 %endif ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size) diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c index d91b902..2a4db61 100644 --- a/libavcodec/x86/mpegvideoencdsp_init.c +++ b/libavcodec/x86/mpegvideoencdsp_init.c @@ -24,6 +24,7 @@ #include "libavcodec/mpegvideoencdsp.h" int ff_pix_sum16_mmx(uint8_t *pix, int line_size); +int ff_pix_sum16_mmxext(uint8_t *pix, int line_size); int ff_pix_sum16_sse2(uint8_t *pix, int line_size); int ff_pix_sum16_xop(uint8_t *pix, int line_size); int ff_pix_norm1_mmx(uint8_t *pix, int line_size); @@ -218,11 +219,17 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, { int cpu_flags = av_get_cpu_flags(); +#if ARCH_X86_32 if (EXTERNAL_MMX(cpu_flags)) { c->pix_sum = ff_pix_sum16_mmx; c->pix_norm1 = ff_pix_norm1_mmx; } + if (EXTERNAL_MMXEXT(cpu_flags)) { + c->pix_sum = ff_pix_sum16_mmxext; + } +#endif + if (EXTERNAL_SSE2(cpu_flags)) { c->pix_sum = ff_pix_sum16_sse2; c->pix_norm1 = ff_pix_norm1_sse2; _______________________________________________ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog