Re: [FFmpeg-devel] [PATCH 4/4] x86: xvid_idct: SSE2 merged add version

2015-03-14 Thread Michael Niedermayer
On Thu, Mar 12, 2015 at 08:15:23PM +0100, Christophe Gisquet wrote:
> 2015-03-11 0:11 GMT+01:00 Christophe Gisquet <christophe.gisq...@gmail.com>:
> > ---
> >  libavcodec/x86/xvididct.asm| 92 --
> >  libavcodec/x86/xvididct_init.c |  9 +
> >  2 files changed, 91 insertions(+), 10 deletions(-)
>
> Another refresh.

>  xvididct.asm|   92 ++--
>  xvididct_init.c |9 +
>  2 files changed, 91 insertions(+), 10 deletions(-)
> dfa3e9e8d4f96a59c351688ad57cad8cd3c440fe  0004-x86-xvid_idct-SSE2-merged-add-version.patch
> From 6044cf0ac09c93d363b4a5cf1496d1e330a2fe9b Mon Sep 17 00:00:00 2001
> From: Christophe Gisquet <christophe.gisq...@gmail.com>
> Date: Tue, 10 Mar 2015 23:11:54 +0000
> Subject: [PATCH 4/4] x86: xvid_idct: SSE2 merged add version

applied

thanks

[...]

-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

The greatest way to live with honor in this world is to be what we pretend
to be. -- Socrates


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 4/4] x86: xvid_idct: SSE2 merged add version

2015-03-12 Thread Christophe Gisquet
2015-03-11 0:11 GMT+01:00 Christophe Gisquet christophe.gisq...@gmail.com:
 ---
  libavcodec/x86/xvididct.asm| 92 
 --
  libavcodec/x86/xvididct_init.c |  9 +
  2 files changed, 91 insertions(+), 10 deletions(-)

Another refresh.
From 6044cf0ac09c93d363b4a5cf1496d1e330a2fe9b Mon Sep 17 00:00:00 2001
From: Christophe Gisquet <christophe.gisq...@gmail.com>
Date: Tue, 10 Mar 2015 23:11:54 +0000
Subject: [PATCH 4/4] x86: xvid_idct: SSE2 merged add version

---
 libavcodec/x86/xvididct.asm| 92 --
 libavcodec/x86/xvididct_init.c |  9 +
 2 files changed, 91 insertions(+), 10 deletions(-)

diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 58ffb11..0220885 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -384,6 +384,12 @@ SECTION .text
 ; Must now load args as gprs are no longer used for masks
 ; DEST is set to where address of dest was loaded
 %if ARCH_X86_32
+%if %2 == 2 ; Not enough xmms, store
+movdqa   [%1+1*16], TAN3
+movdqa   [%1+2*16], xmm3
+movdqa   [%1+5*16], REG0
+movdqa   [%1+6*16], xmm5
+%endif
 %xdefine DEST r2q ; BLOCK is r0, stride r1
 movifnidn DEST, destm
 movifnidn strideq, stridem
@@ -397,8 +403,6 @@ SECTION .text
 movq [DEST + strideq], TAN3
 movhps   [DEST + 2*strideq], TAN3
 ; REG0 and TAN3 are now available (and likely used in second half)
-%else
-%warning Unimplemented
 %endif
 %endif
 %endmacro
@@ -427,7 +431,88 @@ SECTION .text
 movq [DEST + 2*strideq], xmm5
 movhps   [DEST + strideq], xmm5
 %elif %2 == 2
-%warning Unimplemented
+pxor            xmm0, xmm0
+%if ARCH_X86_32
+; free: m3 REG0=m4 m5
+; input: m1, m7, m2, m6
+movq            xmm3, [DEST+0*strideq]
+movq            xmm4, [DEST+1*strideq]
+punpcklbw       xmm3, xmm0
+punpcklbw       xmm4, xmm0
+paddsw          xmm3, %3
+paddsw          xmm4, [%1 + 1*16]
+movq              %3, [DEST+2*strideq]
+movq            xmm5, [DEST+      r3q]
+punpcklbw         %3, xmm0
+punpcklbw       xmm5, xmm0
+paddsw            %3, [%1 + 2*16]
+paddsw          xmm5, %5
+packuswb        xmm3, xmm4
+packuswb          %3, xmm5
+movq            [DEST+0*strideq], xmm3
+movhps          [DEST+1*strideq], xmm3
+movq            [DEST+2*strideq], %3
+movhps          [DEST+      r3q], %3
+lea             DEST, [DEST+4*strideq]
+movq            xmm3, [DEST+0*strideq]
+movq            xmm4, [DEST+1*strideq]
+movq              %3, [DEST+2*strideq]
+movq            xmm5, [DEST+      r3q]
+punpcklbw       xmm3, xmm0
+punpcklbw       xmm4, xmm0
+punpcklbw         %3, xmm0
+punpcklbw       xmm5, xmm0
+paddsw          xmm3, %6
+paddsw          xmm4, [%1 + 5*16]
+paddsw            %3, [%1 + 6*16]
+paddsw          xmm5, %4
+packuswb        xmm3, xmm4
+packuswb          %3, xmm5
+movq            [DEST+0*strideq], xmm3
+movhps          [DEST+1*strideq], xmm3
+movq            [DEST+2*strideq], %3
+movhps          [DEST+      r3q], %3
+%else
+; l1:TAN3=m13  l2:m3  l5:REG0=m8 l6=m5
+; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
+movq            xmm2, [DEST+0*strideq]
+movq            xmm4, [DEST+1*strideq]
+movq           xmm12, [DEST+2*strideq]
+movq           xmm11, [DEST+      r3q]
+punpcklbw       xmm2, xmm0
+punpcklbw       xmm4, xmm0
+punpcklbw      xmm12, xmm0
+punpcklbw      xmm11, xmm0
+paddsw          xmm2, %3
+paddsw          xmm4, TAN3
+paddsw         xmm12, xmm3
+paddsw         xmm11, %5
+packuswb        xmm2, xmm4
+packuswb       xmm12, xmm11
+movq            [DEST+0*strideq], xmm2
+movhps          [DEST+1*strideq], xmm2
+movq            [DEST+2*strideq], xmm12
+movhps          [DEST+      r3q], xmm12
+lea             DEST, [DEST+4*strideq]
+movq            xmm2, [DEST+0*strideq]
+movq            xmm4, [DEST+1*strideq]
+movq           xmm12, [DEST+2*strideq]
+movq           xmm11, [DEST+      r3q]
+punpcklbw       xmm2, xmm0
+punpcklbw       xmm4, xmm0
+punpcklbw      xmm12, xmm0
+punpcklbw      xmm11, xmm0
+paddsw          xmm2, %6
+paddsw          xmm4, REG0
+paddsw         xmm12, xmm5
+paddsw         xmm11, %4
+packuswb        xmm2, xmm4
+packuswb       xmm12, xmm11
+movq            [DEST+0*strideq], xmm2
+movhps          [DEST+1*strideq], xmm2
+movq            [DEST+2*strideq], xmm12
+movhps          [DEST+      r3q], xmm12
+%endif
 %endif
 %endmacro
 
@@ -623,6 +708,7 @@ cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
 INIT_XMM sse2
 IDCT_SSE2 0
 IDCT_SSE2 1
+IDCT_SSE2 2
 
 %if ARCH_X86_32
 
diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c
index 2530d7a..57f6ed6 100644
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@ -27,12 +27,7 @@
 #include "xvididct.h"
 
 void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);
-
-static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
-{
-ff_xvid_idct_sse2(block);
-

[FFmpeg-devel] [PATCH 4/4] x86: xvid_idct: SSE2 merged add version

2015-03-10 Thread Christophe Gisquet
---
 libavcodec/x86/xvididct.asm| 92 --
 libavcodec/x86/xvididct_init.c |  9 +
 2 files changed, 91 insertions(+), 10 deletions(-)

diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 58ffb11..0220885 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -384,6 +384,12 @@ SECTION .text
 ; Must now load args as gprs are no longer used for masks
 ; DEST is set to where address of dest was loaded
 %if ARCH_X86_32
+%if %2 == 2 ; Not enough xmms, store
+movdqa   [%1+1*16], TAN3
+movdqa   [%1+2*16], xmm3
+movdqa   [%1+5*16], REG0
+movdqa   [%1+6*16], xmm5
+%endif
 %xdefine DEST r2q ; BLOCK is r0, stride r1
 movifnidn DEST, destm
 movifnidn strideq, stridem
@@ -397,8 +403,6 @@ SECTION .text
 movq [DEST + strideq], TAN3
 movhps   [DEST + 2*strideq], TAN3
 ; REG0 and TAN3 are now available (and likely used in second half)
-%else
-%warning Unimplemented
 %endif
 %endif
 %endmacro
@@ -427,7 +431,88 @@ SECTION .text
 movq [DEST + 2*strideq], xmm5
 movhps   [DEST + strideq], xmm5
 %elif %2 == 2
-%warning Unimplemented
+pxor            xmm0, xmm0
+%if ARCH_X86_32
+; free: m3 REG0=m4 m5
+; input: m1, m7, m2, m6
+movq            xmm3, [DEST+0*strideq]
+movq            xmm4, [DEST+1*strideq]
+punpcklbw       xmm3, xmm0
+punpcklbw       xmm4, xmm0
+paddsw          xmm3, %3
+paddsw          xmm4, [%1 + 1*16]
+movq              %3, [DEST+2*strideq]
+movq            xmm5, [DEST+      r3q]
+punpcklbw         %3, xmm0
+punpcklbw       xmm5, xmm0
+paddsw            %3, [%1 + 2*16]
+paddsw          xmm5, %5
+packuswb        xmm3, xmm4
+packuswb          %3, xmm5
+movq            [DEST+0*strideq], xmm3
+movhps          [DEST+1*strideq], xmm3
+movq            [DEST+2*strideq], %3
+movhps          [DEST+      r3q], %3
+lea             DEST, [DEST+4*strideq]
+movq            xmm3, [DEST+0*strideq]
+movq            xmm4, [DEST+1*strideq]
+movq              %3, [DEST+2*strideq]
+movq            xmm5, [DEST+      r3q]
+punpcklbw       xmm3, xmm0
+punpcklbw       xmm4, xmm0
+punpcklbw         %3, xmm0
+punpcklbw       xmm5, xmm0
+paddsw          xmm3, %6
+paddsw          xmm4, [%1 + 5*16]
+paddsw            %3, [%1 + 6*16]
+paddsw          xmm5, %4
+packuswb        xmm3, xmm4
+packuswb          %3, xmm5
+movq            [DEST+0*strideq], xmm3
+movhps          [DEST+1*strideq], xmm3
+movq            [DEST+2*strideq], %3
+movhps          [DEST+      r3q], %3
+%else
+; l1:TAN3=m13  l2:m3  l5:REG0=m8 l6=m5
+; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
+movq            xmm2, [DEST+0*strideq]
+movq            xmm4, [DEST+1*strideq]
+movq           xmm12, [DEST+2*strideq]
+movq           xmm11, [DEST+      r3q]
+punpcklbw       xmm2, xmm0
+punpcklbw       xmm4, xmm0
+punpcklbw      xmm12, xmm0
+punpcklbw      xmm11, xmm0
+paddsw          xmm2, %3
+paddsw          xmm4, TAN3
+paddsw         xmm12, xmm3
+paddsw         xmm11, %5
+packuswb        xmm2, xmm4
+packuswb       xmm12, xmm11
+movq            [DEST+0*strideq], xmm2
+movhps          [DEST+1*strideq], xmm2
+movq            [DEST+2*strideq], xmm12
+movhps          [DEST+      r3q], xmm12
+lea             DEST, [DEST+4*strideq]
+movq            xmm2, [DEST+0*strideq]
+movq            xmm4, [DEST+1*strideq]
+movq           xmm12, [DEST+2*strideq]
+movq           xmm11, [DEST+      r3q]
+punpcklbw       xmm2, xmm0
+punpcklbw       xmm4, xmm0
+punpcklbw      xmm12, xmm0
+punpcklbw      xmm11, xmm0
+paddsw          xmm2, %6
+paddsw          xmm4, REG0
+paddsw         xmm12, xmm5
+paddsw         xmm11, %4
+packuswb        xmm2, xmm4
+packuswb       xmm12, xmm11
+movq            [DEST+0*strideq], xmm2
+movhps          [DEST+1*strideq], xmm2
+movq            [DEST+2*strideq], xmm12
+movhps          [DEST+      r3q], xmm12
+%endif
 %endif
 %endmacro
 
@@ -623,6 +708,7 @@ cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
 INIT_XMM sse2
 IDCT_SSE2 0
 IDCT_SSE2 1
+IDCT_SSE2 2
 
 %if ARCH_X86_32
 
diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c
index 2530d7a..57f6ed6 100644
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@ -27,12 +27,7 @@
 #include "xvididct.h"
 
 void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);
-
-static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
-{
-ff_xvid_idct_sse2(block);
-ff_add_pixels_clamped(block, dest, line_size);
-}
+void ff_xvid_idct_add_sse2(uint8_t *dest, int line_size, short *block);
 
 #if ARCH_X86_32
 static void xvid_idct_mmx_put(uint8_t *dest, int line_size, short *block)
@@ -88,7 +83,7 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
 
 if (EXTERNAL_SSE2(cpu_flags)) {
 c->idct_put  = ff_xvid_idct_put_sse2;
-c->idct_add  = xvid_idct_sse2_add;
+c->idct_add  = ff_xvid_idct_add_sse2;
 c->idct      = ff_xvid_idct_sse2;