Re: [FFmpeg-devel] [PATCH 1/4] x86: xvid: port SSE2 idct to yasm

2015-03-12 Thread Christophe Gisquet
Hi,

2015-03-11 3:46 GMT+01:00 James Almer <jamr...@gmail.com>:
> Should be YASM-OBJS, and moved to the end of the file.
> Also, related to the build failure Michael mentioned for the second patch,
> this is missing an inline -> external change in libavcodec/x86/dct-test.c

Here you are.

Passes fate-xvid-idct and dct-test builds and runs on both Win32 and Win64.

-- 
Christophe
From 86da5a1f111f9f36318daa906c3245d6b883feb3 Mon Sep 17 00:00:00 2001
From: Christophe Gisquet <christophe.gisq...@gmail.com>
Date: Tue, 10 Mar 2015 23:11:51 +0000
Subject: [PATCH 1/4] x86: xvid_idct: port SSE2 iDCT to yasm

The main difference consists in renaming properly labels, and
letting yasm select the gprs for skipping 1D transforms.
---
 libavcodec/x86/Makefile|   4 +-
 libavcodec/x86/dct-test.c  |   4 +-
 libavcodec/x86/xvididct.asm| 379 ++
 libavcodec/x86/xvididct_init.c |  18 +-
 libavcodec/x86/xvididct_sse2.c | 406 -
 5 files changed, 398 insertions(+), 413 deletions(-)
 create mode 100644 libavcodec/x86/xvididct.asm
 delete mode 100644 libavcodec/x86/xvididct_sse2.c

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 6b9164a..f46c7d5 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -73,8 +73,7 @@ MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o
 MMX-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct.o
 
 # decoders/encoders
-MMX-OBJS-$(CONFIG_MPEG4_DECODER)   += x86/xvididct_mmx.o\
-  x86/xvididct_sse2.o
+MMX-OBJS-$(CONFIG_MPEG4_DECODER)   += x86/xvididct_mmx.o
 MMX-OBJS-$(CONFIG_SNOW_DECODER)+= x86/snowdsp.o
 MMX-OBJS-$(CONFIG_SNOW_ENCODER)+= x86/snowdsp.o
 MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
@@ -141,6 +140,7 @@ YASM-OBJS-$(CONFIG_HEVC_DECODER)   += x86/hevc_mc.o \
   x86/hevc_res_add.o\
   x86/hevc_sao.o
 YASM-OBJS-$(CONFIG_MLP_DECODER)+= x86/mlpdsp.o
+YASM-OBJS-$(CONFIG_MPEG4_DECODER)  += x86/xvididct.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)+= x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
diff --git a/libavcodec/x86/dct-test.c b/libavcodec/x86/dct-test.c
index 3414cb0..e14ce9a 100644
--- a/libavcodec/x86/dct-test.c
+++ b/libavcodec/x86/dct-test.c
@@ -67,9 +67,9 @@ static const struct algo idct_tab_arch[] = {
 #if HAVE_MMXEXT_INLINE
     { "XVID-MMXEXT", ff_xvid_idct_mmxext, FF_IDCT_PERM_NONE,   AV_CPU_FLAG_MMXEXT, 1 },
 #endif
-#if HAVE_SSE2_INLINE
+#if HAVE_SSE2_EXTERNAL
     { "XVID-SSE2",   ff_xvid_idct_sse2,   FF_IDCT_PERM_SSE2,   AV_CPU_FLAG_SSE2,   1 },
-#if ARCH_X86_64 && HAVE_YASM
+#if ARCH_X86_64
     { "PR-SSE2",     ff_prores_idct_put_10_sse2_wrap, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
 #endif
 #endif
diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
new file mode 100644
index 0000000..d16db34
--- /dev/null
+++ b/libavcodec/x86/xvididct.asm
@@ -0,0 +1,379 @@
+; XVID MPEG-4 VIDEO CODEC
+; - SSE2 inverse discrete cosine transform -
+;
+; Copyright(C) 2003 Pascal Massimino <s...@planet-d.net>
+;
+; Conversion to gcc syntax with modifications
+; by Alexander Strange <astra...@ithinksw.com>
+;
+; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
+;
+; This file is part of FFmpeg.
+;
+; Vertical pass is an implementation of the scheme:
+;  Loeffler C., Ligtenberg A., and Moschytz C.S.:
+;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
+;  Proc. ICASSP 1989, 988-991.
+;
+; Horizontal pass is a double 4x4 vector/matrix multiplication,
+; (see also Intel's Application Note 922:
+;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+;  Copyright (C) 1999 Intel Corporation)
+;
+; More details at http://skal.planet-d.net/coding/dct.html
+;
+; FFmpeg is free software; you can redistribute it and/or
+; modify it under the terms of the GNU Lesser General Public
+; License as published by the Free Software Foundation; either
+; version 2.1 of the License, or (at your option) any later version.
+;
+; FFmpeg is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; Lesser General Public License for more details.
+;
+; You should have received a copy of the GNU Lesser General Public License
+; along with FFmpeg; if not, write to the Free Software Foundation,
+; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+tan1:   times 8 dw 13036
+tan2:   times 8 dw 27146
+tan3:   times 8 dw 43790
+sqrt2:  times 8 dw 23170
+
+iTab1:  dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d
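+dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61
[... the rest of the patch is truncated in this archived copy; the complete tables and code are in the original 2015-03-10 posting further down ...]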

Re: [FFmpeg-devel] [PATCH 1/4] x86: xvid: port SSE2 idct to yasm

2015-03-12 Thread Michael Niedermayer
On Thu, Mar 12, 2015 at 08:09:35PM +0100, Christophe Gisquet wrote:
> Hi,
>
> 2015-03-11 3:46 GMT+01:00 James Almer <jamr...@gmail.com>:
> > Should be YASM-OBJS, and moved to the end of the file.
> > Also, related to the build failure Michael mentioned for the second patch,
> > this is missing an inline -> external change in libavcodec/x86/dct-test.c
>
> Here you are.
>
> Passes fate-xvid-idct and dct-test builds and runs on both Win32 and Win64.
>
> -- 
> Christophe

>  b/libavcodec/x86/Makefile|4 
>  b/libavcodec/x86/dct-test.c  |4 
>  b/libavcodec/x86/xvididct.asm|  379 
>  b/libavcodec/x86/xvididct_init.c |   18 +
>  libavcodec/x86/xvididct_sse2.c   |  406 
> ---
>  5 files changed, 398 insertions(+), 413 deletions(-)
> f0d22fc5a505e06184d1c88c3632c1d357d0f576  
> 0001-x86-xvid_idct-port-SSE2-iDCT-to-yasm.patch
> From 86da5a1f111f9f36318daa906c3245d6b883feb3 Mon Sep 17 00:00:00 2001
> From: Christophe Gisquet <christophe.gisq...@gmail.com>
> Date: Tue, 10 Mar 2015 23:11:51 +0000
> Subject: [PATCH 1/4] x86: xvid_idct: port SSE2 iDCT to yasm

applied

thanks

[...]

-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

In a rich man's house there is no place to spit but his face.
-- Diogenes of Sinope


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 1/4] x86: xvid: port SSE2 idct to yasm

2015-03-10 Thread James Almer
On 10/03/15 8:11 PM, Christophe Gisquet wrote:
> The main difference consists in renaming properly labels, and
> letting yasm select the gprs for skipping 1D transforms.
> ---
>  libavcodec/x86/Makefile|   2 +-
>  libavcodec/x86/xvididct.asm| 379 ++
>  libavcodec/x86/xvididct_init.c |  18 +-
>  libavcodec/x86/xvididct_sse2.c | 406 
> -
>  4 files changed, 395 insertions(+), 410 deletions(-)
>  create mode 100644 libavcodec/x86/xvididct.asm
>  delete mode 100644 libavcodec/x86/xvididct_sse2.c
>
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 6b9164a..276df44 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -74,7 +74,7 @@ MMX-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct.o
>  
>  # decoders/encoders
>  MMX-OBJS-$(CONFIG_MPEG4_DECODER)   += x86/xvididct_mmx.o\
> -  x86/xvididct_sse2.o
> +  x86/xvididct.o

Should be YASM-OBJS, and moved to the end of the file.
Also, related to the build failure Michael mentioned for the second patch,
this is missing an inline -> external change in libavcodec/x86/dct-test.c

As you said the patch is pretty much straightforward, so if fate passes and
the output is the same then it should be ok.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 1/4] x86: xvid: port SSE2 idct to yasm

2015-03-10 Thread Christophe Gisquet
The main difference consists in renaming properly labels, and
letting yasm select the gprs for skipping 1D transforms.
---
 libavcodec/x86/Makefile|   2 +-
 libavcodec/x86/xvididct.asm| 379 ++
 libavcodec/x86/xvididct_init.c |  18 +-
 libavcodec/x86/xvididct_sse2.c | 406 -
 4 files changed, 395 insertions(+), 410 deletions(-)
 create mode 100644 libavcodec/x86/xvididct.asm
 delete mode 100644 libavcodec/x86/xvididct_sse2.c

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 6b9164a..276df44 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -74,7 +74,7 @@ MMX-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct.o
 
 # decoders/encoders
 MMX-OBJS-$(CONFIG_MPEG4_DECODER)   += x86/xvididct_mmx.o\
-  x86/xvididct_sse2.o
+  x86/xvididct.o
 MMX-OBJS-$(CONFIG_SNOW_DECODER)+= x86/snowdsp.o
 MMX-OBJS-$(CONFIG_SNOW_ENCODER)+= x86/snowdsp.o
 MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
new file mode 100644
index 0000000..d16db34
--- /dev/null
+++ b/libavcodec/x86/xvididct.asm
@@ -0,0 +1,379 @@
+; XVID MPEG-4 VIDEO CODEC
+; - SSE2 inverse discrete cosine transform -
+;
+; Copyright(C) 2003 Pascal Massimino <s...@planet-d.net>
+;
+; Conversion to gcc syntax with modifications
+; by Alexander Strange <astra...@ithinksw.com>
+;
+; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
+;
+; This file is part of FFmpeg.
+;
+; Vertical pass is an implementation of the scheme:
+;  Loeffler C., Ligtenberg A., and Moschytz C.S.:
+;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
+;  Proc. ICASSP 1989, 988-991.
+;
+; Horizontal pass is a double 4x4 vector/matrix multiplication,
+; (see also Intel's Application Note 922:
+;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+;  Copyright (C) 1999 Intel Corporation)
+;
+; More details at http://skal.planet-d.net/coding/dct.html
+;
+; FFmpeg is free software; you can redistribute it and/or
+; modify it under the terms of the GNU Lesser General Public
+; License as published by the Free Software Foundation; either
+; version 2.1 of the License, or (at your option) any later version.
+;
+; FFmpeg is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; Lesser General Public License for more details.
+;
+; You should have received a copy of the GNU Lesser General Public License
+; along with FFmpeg; if not, write to the Free Software Foundation,
+; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+tan1:   times 8 dw 13036
+tan2:   times 8 dw 27146
+tan3:   times 8 dw 43790
+sqrt2:  times 8 dw 23170
+
+iTab1:  dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d
+dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61
+dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7
+dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
+iTab2:  dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5
+dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04
+dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41
+dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
+iTab3:  dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf
+dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf
+dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d
+dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
+iTab4:  dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
+dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac
+dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
+dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
+
+walkenIdctRounders: times 4 dd 65536
+times 4 dd  3597
+times 4 dd  2260
+times 4 dd  1203
+times 4 dd   120
+times 4 dd   512
+
+pb_127: times 8 db 127
+
+SECTION .text
+
+; Temporary storage before the column pass
+%define ROW1 xmm6
+%define ROW3 xmm4
+%define ROW5 xmm5
+%define ROW7 xmm7
+
+%macro CLEAR_ODD 1
+pxor  %1, %1
+%endmacro
+%macro PUT_ODD 1
+pshufhw   %1, xmm2, 0x1B
+%endmacro
+
+%macro MOV32 2
+%if ARCH_X86_32
+movdqa%2, %1
+%endif
+%endmacro
+
+%macro CLEAR_EVEN 1
+%if ARCH_X86_64
+CLEAR_ODD %1
+%endif
+%endmacro
+
+%macro PUT_EVEN 1
+%if ARCH_X86_64
+PUT_ODD   %1
+%else
+pshufhw xmm2, xmm2, 0x1B
+movdqa%1, xmm2