Re: [FFmpeg-devel] [PATCH 4/4] x86: xvid_idct: SSE2 merged add version
On Thu, Mar 12, 2015 at 08:15:23PM +0100, Christophe Gisquet wrote: 2015-03-11 0:11 GMT+01:00 Christophe Gisquet christophe.gisq...@gmail.com: --- libavcodec/x86/xvididct.asm| 92 -- libavcodec/x86/xvididct_init.c | 9 + 2 files changed, 91 insertions(+), 10 deletions(-) Another refresh. xvididct.asm| 92 ++-- xvididct_init.c |9 + 2 files changed, 91 insertions(+), 10 deletions(-) dfa3e9e8d4f96a59c351688ad57cad8cd3c440fe 0004-x86-xvid_idct-SSE2-merged-add-version.patch From 6044cf0ac09c93d363b4a5cf1496d1e330a2fe9b Mon Sep 17 00:00:00 2001 From: Christophe Gisquet christophe.gisq...@gmail.com Date: Tue, 10 Mar 2015 23:11:54 + Subject: [PATCH 4/4] x86: xvid_idct: SSE2 merged add version applied thanks [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB The greatest way to live with honor in this world is to be what we pretend to be. -- Socrates signature.asc Description: Digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 4/4] x86: xvid_idct: SSE2 merged add version
2015-03-11 0:11 GMT+01:00 Christophe Gisquet christophe.gisq...@gmail.com: --- libavcodec/x86/xvididct.asm| 92 -- libavcodec/x86/xvididct_init.c | 9 + 2 files changed, 91 insertions(+), 10 deletions(-) Another refresh. From 6044cf0ac09c93d363b4a5cf1496d1e330a2fe9b Mon Sep 17 00:00:00 2001 From: Christophe Gisquet christophe.gisq...@gmail.com Date: Tue, 10 Mar 2015 23:11:54 + Subject: [PATCH 4/4] x86: xvid_idct: SSE2 merged add version --- libavcodec/x86/xvididct.asm| 92 -- libavcodec/x86/xvididct_init.c | 9 + 2 files changed, 91 insertions(+), 10 deletions(-) diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm index 58ffb11..0220885 100644 --- a/libavcodec/x86/xvididct.asm +++ b/libavcodec/x86/xvididct.asm @@ -384,6 +384,12 @@ SECTION .text ; Must now load args as gprs are no longer used for masks ; DEST is set to where address of dest was loaded %if ARCH_X86_32 +%if %2 == 2 ; Not enough xmms, store +movdqa [%1+1*16], TAN3 +movdqa [%1+2*16], xmm3 +movdqa [%1+5*16], REG0 +movdqa [%1+6*16], xmm5 +%endif %xdefine DEST r2q ; BLOCK is r0, stride r1 movifnidn DEST, destm movifnidn strideq, stridem @@ -397,8 +403,6 @@ SECTION .text movq [DEST + strideq], TAN3 movhps [DEST + 2*strideq], TAN3 ; REG0 and TAN3 are now available (and likely used in second half) -%else -%warning Unimplemented %endif %endif %endmacro @@ -427,7 +431,88 @@ SECTION .text movq [DEST + 2*strideq], xmm5 movhps [DEST + strideq], xmm5 %elif %2 == 2 -%warning Unimplemented +pxorxmm0, xmm0 +%if ARCH_X86_32 +; free: m3 REG0=m4 m5 +; input: m1, m7, m2, m6 +movqxmm3, [DEST+0*strideq] +movqxmm4, [DEST+1*strideq] +punpcklbw xmm3, xmm0 +punpcklbw xmm4, xmm0 +paddsw xmm3, %3 +paddsw xmm4, [%1 + 1*16] +movq %3, [DEST+2*strideq] +movqxmm5, [DEST+ r3q] +punpcklbw %3, xmm0 +punpcklbw xmm5, xmm0 +paddsw%3, [%1 + 2*16] +paddsw xmm5, %5 +packuswbxmm3, xmm4 +packuswb %3, xmm5 +movq[DEST+0*strideq], xmm3 +movhps [DEST+1*strideq], xmm3 +movq[DEST+2*strideq], %3 +movhps [DEST+ r3q], %3 +lea DEST, 
[DEST+4*strideq] +movqxmm3, [DEST+0*strideq] +movqxmm4, [DEST+1*strideq] +movq %3, [DEST+2*strideq] +movqxmm5, [DEST+ r3q] +punpcklbw xmm3, xmm0 +punpcklbw xmm4, xmm0 +punpcklbw %3, xmm0 +punpcklbw xmm5, xmm0 +paddsw xmm3, %6 +paddsw xmm4, [%1 + 5*16] +paddsw%3, [%1 + 6*16] +paddsw xmm5, %4 +packuswbxmm3, xmm4 +packuswb %3, xmm5 +movq[DEST+0*strideq], xmm3 +movhps [DEST+1*strideq], xmm3 +movq[DEST+2*strideq], %3 +movhps [DEST+ r3q], %3 +%else +; l1:TAN3=m13 l2:m3 l5:REG0=m8 l6=m5 +; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10 +movqxmm2, [DEST+0*strideq] +movqxmm4, [DEST+1*strideq] +movq xmm12, [DEST+2*strideq] +movq xmm11, [DEST+ r3q] +punpcklbw xmm2, xmm0 +punpcklbw xmm4, xmm0 +punpcklbw xmm12, xmm0 +punpcklbw xmm11, xmm0 +paddsw xmm2, %3 +paddsw xmm4, TAN3 +paddsw xmm12, xmm3 +paddsw xmm11, %5 +packuswbxmm2, xmm4 +packuswb xmm12, xmm11 +movq[DEST+0*strideq], xmm2 +movhps [DEST+1*strideq], xmm2 +movq[DEST+2*strideq], xmm12 +movhps [DEST+ r3q], xmm12 +lea DEST, [DEST+4*strideq] +movqxmm2, [DEST+0*strideq] +movqxmm4, [DEST+1*strideq] +movq xmm12, [DEST+2*strideq] +movq xmm11, [DEST+ r3q] +punpcklbw xmm2, xmm0 +punpcklbw xmm4, xmm0 +punpcklbw xmm12, xmm0 +punpcklbw xmm11, xmm0 +paddsw xmm2, %6 +paddsw xmm4, REG0 +paddsw xmm12, xmm5 +paddsw xmm11, %4 +packuswbxmm2, xmm4 +packuswb xmm12, xmm11 +movq[DEST+0*strideq], xmm2 +movhps [DEST+1*strideq], xmm2 +movq[DEST+2*strideq], xmm12 +movhps [DEST+ r3q], xmm12 +%endif %endif %endmacro @@ -623,6 +708,7 @@ cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block INIT_XMM sse2 IDCT_SSE2 0 IDCT_SSE2 1 +IDCT_SSE2 2 %if ARCH_X86_32 diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c index 2530d7a..57f6ed6 100644 --- a/libavcodec/x86/xvididct_init.c +++ b/libavcodec/x86/xvididct_init.c @@ -27,12 +27,7 @@ #include xvididct.h void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block); - -static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block) -{ 
-ff_xvid_idct_sse2(block); -
[FFmpeg-devel] [PATCH 4/4] x86: xvid_idct: SSE2 merged add version
--- libavcodec/x86/xvididct.asm| 92 -- libavcodec/x86/xvididct_init.c | 9 + 2 files changed, 91 insertions(+), 10 deletions(-) diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm index 58ffb11..0220885 100644 --- a/libavcodec/x86/xvididct.asm +++ b/libavcodec/x86/xvididct.asm @@ -384,6 +384,12 @@ SECTION .text ; Must now load args as gprs are no longer used for masks ; DEST is set to where address of dest was loaded %if ARCH_X86_32 +%if %2 == 2 ; Not enough xmms, store +movdqa [%1+1*16], TAN3 +movdqa [%1+2*16], xmm3 +movdqa [%1+5*16], REG0 +movdqa [%1+6*16], xmm5 +%endif %xdefine DEST r2q ; BLOCK is r0, stride r1 movifnidn DEST, destm movifnidn strideq, stridem @@ -397,8 +403,6 @@ SECTION .text movq [DEST + strideq], TAN3 movhps [DEST + 2*strideq], TAN3 ; REG0 and TAN3 are now available (and likely used in second half) -%else -%warning Unimplemented %endif %endif %endmacro @@ -427,7 +431,88 @@ SECTION .text movq [DEST + 2*strideq], xmm5 movhps [DEST + strideq], xmm5 %elif %2 == 2 -%warning Unimplemented +pxorxmm0, xmm0 +%if ARCH_X86_32 +; free: m3 REG0=m4 m5 +; input: m1, m7, m2, m6 +movqxmm3, [DEST+0*strideq] +movqxmm4, [DEST+1*strideq] +punpcklbw xmm3, xmm0 +punpcklbw xmm4, xmm0 +paddsw xmm3, %3 +paddsw xmm4, [%1 + 1*16] +movq %3, [DEST+2*strideq] +movqxmm5, [DEST+ r3q] +punpcklbw %3, xmm0 +punpcklbw xmm5, xmm0 +paddsw%3, [%1 + 2*16] +paddsw xmm5, %5 +packuswbxmm3, xmm4 +packuswb %3, xmm5 +movq[DEST+0*strideq], xmm3 +movhps [DEST+1*strideq], xmm3 +movq[DEST+2*strideq], %3 +movhps [DEST+ r3q], %3 +lea DEST, [DEST+4*strideq] +movqxmm3, [DEST+0*strideq] +movqxmm4, [DEST+1*strideq] +movq %3, [DEST+2*strideq] +movqxmm5, [DEST+ r3q] +punpcklbw xmm3, xmm0 +punpcklbw xmm4, xmm0 +punpcklbw %3, xmm0 +punpcklbw xmm5, xmm0 +paddsw xmm3, %6 +paddsw xmm4, [%1 + 5*16] +paddsw%3, [%1 + 6*16] +paddsw xmm5, %4 +packuswbxmm3, xmm4 +packuswb %3, xmm5 +movq[DEST+0*strideq], xmm3 +movhps [DEST+1*strideq], xmm3 +movq[DEST+2*strideq], %3 +movhps [DEST+ r3q], %3 +%else 
+; l1:TAN3=m13 l2:m3 l5:REG0=m8 l6=m5 +; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10 +movqxmm2, [DEST+0*strideq] +movqxmm4, [DEST+1*strideq] +movq xmm12, [DEST+2*strideq] +movq xmm11, [DEST+ r3q] +punpcklbw xmm2, xmm0 +punpcklbw xmm4, xmm0 +punpcklbw xmm12, xmm0 +punpcklbw xmm11, xmm0 +paddsw xmm2, %3 +paddsw xmm4, TAN3 +paddsw xmm12, xmm3 +paddsw xmm11, %5 +packuswbxmm2, xmm4 +packuswb xmm12, xmm11 +movq[DEST+0*strideq], xmm2 +movhps [DEST+1*strideq], xmm2 +movq[DEST+2*strideq], xmm12 +movhps [DEST+ r3q], xmm12 +lea DEST, [DEST+4*strideq] +movqxmm2, [DEST+0*strideq] +movqxmm4, [DEST+1*strideq] +movq xmm12, [DEST+2*strideq] +movq xmm11, [DEST+ r3q] +punpcklbw xmm2, xmm0 +punpcklbw xmm4, xmm0 +punpcklbw xmm12, xmm0 +punpcklbw xmm11, xmm0 +paddsw xmm2, %6 +paddsw xmm4, REG0 +paddsw xmm12, xmm5 +paddsw xmm11, %4 +packuswbxmm2, xmm4 +packuswb xmm12, xmm11 +movq[DEST+0*strideq], xmm2 +movhps [DEST+1*strideq], xmm2 +movq[DEST+2*strideq], xmm12 +movhps [DEST+ r3q], xmm12 +%endif %endif %endmacro @@ -623,6 +708,7 @@ cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block INIT_XMM sse2 IDCT_SSE2 0 IDCT_SSE2 1 +IDCT_SSE2 2 %if ARCH_X86_32 diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c index 2530d7a..57f6ed6 100644 --- a/libavcodec/x86/xvididct_init.c +++ b/libavcodec/x86/xvididct_init.c @@ -27,12 +27,7 @@ #include xvididct.h void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block); - -static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block) -{ -ff_xvid_idct_sse2(block); -ff_add_pixels_clamped(block, dest, line_size); -} +void ff_xvid_idct_add_sse2(uint8_t *dest, int line_size, short *block); #if ARCH_X86_32 static void xvid_idct_mmx_put(uint8_t *dest, int line_size, short *block) @@ -88,7 +83,7 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, if (EXTERNAL_SSE2(cpu_flags)) { c-idct_put = ff_xvid_idct_put_sse2; -c-idct_add = xvid_idct_sse2_add; +c-idct_add = 
ff_xvid_idct_add_sse2; c->idct = ff_xvid_idct_sse2;