Re: [FFmpeg-devel] [PATCH 1/4] x86: xvid: port SSE2 idct to yasm
Hi, 2015-03-11 3:46 GMT+01:00 James Almer jamr...@gmail.com: Should be YASM-OBJS, and moved to the end of the file. Also, related to the build failure Michael mentioned for the second patch, this is missing an inline - external change in libavcodec/x86/dct-test.c Here you are. Passes fate-xvid-idct and dct-test builds and runs on both Win32 and Win64. -- Christophe From 86da5a1f111f9f36318daa906c3245d6b883feb3 Mon Sep 17 00:00:00 2001 From: Christophe Gisquet christophe.gisq...@gmail.com Date: Tue, 10 Mar 2015 23:11:51 + Subject: [PATCH 1/4] x86: xvid_idct: port SSE2 iDCT to yasm The main difference consists in renaming properly labels, and letting yasm select the gprs for skipping 1D transforms. --- libavcodec/x86/Makefile| 4 +- libavcodec/x86/dct-test.c | 4 +- libavcodec/x86/xvididct.asm| 379 ++ libavcodec/x86/xvididct_init.c | 18 +- libavcodec/x86/xvididct_sse2.c | 406 - 5 files changed, 398 insertions(+), 413 deletions(-) create mode 100644 libavcodec/x86/xvididct.asm delete mode 100644 libavcodec/x86/xvididct_sse2.c diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 6b9164a..f46c7d5 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -73,8 +73,7 @@ MMX-OBJS-$(CONFIG_FDCTDSP) += x86/fdct.o MMX-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct.o # decoders/encoders -MMX-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_mmx.o\ - x86/xvididct_sse2.o +MMX-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_mmx.o MMX-OBJS-$(CONFIG_SNOW_DECODER)+= x86/snowdsp.o MMX-OBJS-$(CONFIG_SNOW_ENCODER)+= x86/snowdsp.o MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o @@ -141,6 +140,7 @@ YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_mc.o \ x86/hevc_res_add.o\ x86/hevc_sao.o YASM-OBJS-$(CONFIG_MLP_DECODER)+= x86/mlpdsp.o +YASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o YASM-OBJS-$(CONFIG_PNG_DECODER)+= x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o diff --git 
a/libavcodec/x86/dct-test.c b/libavcodec/x86/dct-test.c index 3414cb0..e14ce9a 100644 --- a/libavcodec/x86/dct-test.c +++ b/libavcodec/x86/dct-test.c @@ -67,9 +67,9 @@ static const struct algo idct_tab_arch[] = { #if HAVE_MMXEXT_INLINE { XVID-MMXEXT, ff_xvid_idct_mmxext, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMXEXT, 1 }, #endif -#if HAVE_SSE2_INLINE +#if HAVE_SSE2_EXTERNAL { XVID-SSE2, ff_xvid_idct_sse2, FF_IDCT_PERM_SSE2, AV_CPU_FLAG_SSE2, 1 }, -#if ARCH_X86_64 HAVE_YASM +#if ARCH_X86_64 { PR-SSE2, ff_prores_idct_put_10_sse2_wrap, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 }, #endif #endif diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm new file mode 100644 index 000..d16db34 --- /dev/null +++ b/libavcodec/x86/xvididct.asm @@ -0,0 +1,379 @@ +; XVID MPEG-4 VIDEO CODEC +; - SSE2 inverse discrete cosine transform - +; +; Copyright(C) 2003 Pascal Massimino s...@planet-d.net +; +; Conversion to gcc syntax with modifications +; by Alexander Strange astra...@ithinksw.com +; +; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid. +; +; This file is part of FFmpeg. +; +; Vertical pass is an implementation of the scheme: +; Loeffler C., Ligtenberg A., and Moschytz C.S.: +; Practical Fast 1D DCT Algorithm with Eleven Multiplications, +; Proc. ICASSP 1989, 988-991. +; +; Horizontal pass is a double 4x4 vector/matrix multiplication, +; (see also Intel's Application Note 922: +; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm +; Copyright (C) 1999 Intel Corporation) +; +; More details at http://skal.planet-d.net/coding/dct.html +; +; FFmpeg is free software; you can redistribute it and/or +; modify it under the terms of the GNU Lesser General Public +; License as published by the Free Software Foundation; either +; version 2.1 of the License, or (at your option) any later version. 
+; +; FFmpeg is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; Lesser General Public License for more details. +; +; You should have received a copy of the GNU Lesser General Public License +; along with FFmpeg; if not, write to the Free Software Foundation, +; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +%include libavutil/x86/x86util.asm + +SECTION_RODATA +tan1: times 8 dw 13036 +tan2: times 8 dw 27146 +tan3: times 8 dw 43790 +sqrt2: times 8 dw 23170 + +iTab1: dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d +dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000,
Re: [FFmpeg-devel] [PATCH 1/4] x86: xvid: port SSE2 idct to yasm
On Thu, Mar 12, 2015 at 08:09:35PM +0100, Christophe Gisquet wrote: Hi, 2015-03-11 3:46 GMT+01:00 James Almer jamr...@gmail.com: Should be YASM-OBJS, and moved to the end of the file. Also, related to the build failure Michael mentioned for the second patch, this is missing an inline -> external change in libavcodec/x86/dct-test.c. Here you are. Passes fate-xvid-idct and dct-test builds and runs on both Win32 and Win64. -- Christophe b/libavcodec/x86/Makefile|4 b/libavcodec/x86/dct-test.c |4 b/libavcodec/x86/xvididct.asm| 379 b/libavcodec/x86/xvididct_init.c | 18 + libavcodec/x86/xvididct_sse2.c | 406 --- 5 files changed, 398 insertions(+), 413 deletions(-) f0d22fc5a505e06184d1c88c3632c1d357d0f576 0001-x86-xvid_idct-port-SSE2-iDCT-to-yasm.patch From 86da5a1f111f9f36318daa906c3245d6b883feb3 Mon Sep 17 00:00:00 2001 From: Christophe Gisquet christophe.gisq...@gmail.com Date: Tue, 10 Mar 2015 23:11:51 + Subject: [PATCH 1/4] x86: xvid_idct: port SSE2 iDCT to yasm applied thanks [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB In a rich man's house there is no place to spit but his face. -- Diogenes of Sinope signature.asc Description: Digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 1/4] x86: xvid: port SSE2 idct to yasm
On 10/03/15 8:11 PM, Christophe Gisquet wrote: The main difference consists in renaming properly labels, and letting yasm select the gprs for skipping 1D transforms. --- libavcodec/x86/Makefile| 2 +- libavcodec/x86/xvididct.asm| 379 ++ libavcodec/x86/xvididct_init.c | 18 +- libavcodec/x86/xvididct_sse2.c | 406 - 4 files changed, 395 insertions(+), 410 deletions(-) create mode 100644 libavcodec/x86/xvididct.asm delete mode 100644 libavcodec/x86/xvididct_sse2.c diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 6b9164a..276df44 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -74,7 +74,7 @@ MMX-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct.o # decoders/encoders MMX-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_mmx.o\ - x86/xvididct_sse2.o + x86/xvididct.o Should be YASM-OBJS, and moved to the end of the file. Also, related to the build failure Michael mentioned for the second patch, this is missing an inline -> external change in libavcodec/x86/dct-test.c. As you said the patch is pretty much straightforward, so if fate passes and the output is the same then it should be ok. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 1/4] x86: xvid: port SSE2 idct to yasm
The main difference consists in renaming properly labels, and letting yasm select the gprs for skipping 1D transforms. --- libavcodec/x86/Makefile| 2 +- libavcodec/x86/xvididct.asm| 379 ++ libavcodec/x86/xvididct_init.c | 18 +- libavcodec/x86/xvididct_sse2.c | 406 - 4 files changed, 395 insertions(+), 410 deletions(-) create mode 100644 libavcodec/x86/xvididct.asm delete mode 100644 libavcodec/x86/xvididct_sse2.c diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 6b9164a..276df44 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -74,7 +74,7 @@ MMX-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct.o # decoders/encoders MMX-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_mmx.o\ - x86/xvididct_sse2.o + x86/xvididct.o MMX-OBJS-$(CONFIG_SNOW_DECODER)+= x86/snowdsp.o MMX-OBJS-$(CONFIG_SNOW_ENCODER)+= x86/snowdsp.o MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm new file mode 100644 index 000..d16db34 --- /dev/null +++ b/libavcodec/x86/xvididct.asm @@ -0,0 +1,379 @@ +; XVID MPEG-4 VIDEO CODEC +; - SSE2 inverse discrete cosine transform - +; +; Copyright(C) 2003 Pascal Massimino s...@planet-d.net +; +; Conversion to gcc syntax with modifications +; by Alexander Strange astra...@ithinksw.com +; +; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid. +; +; This file is part of FFmpeg. +; +; Vertical pass is an implementation of the scheme: +; Loeffler C., Ligtenberg A., and Moschytz C.S.: +; Practical Fast 1D DCT Algorithm with Eleven Multiplications, +; Proc. ICASSP 1989, 988-991. 
+; +; Horizontal pass is a double 4x4 vector/matrix multiplication, +; (see also Intel's Application Note 922: +; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm +; Copyright (C) 1999 Intel Corporation) +; +; More details at http://skal.planet-d.net/coding/dct.html +; +; FFmpeg is free software; you can redistribute it and/or +; modify it under the terms of the GNU Lesser General Public +; License as published by the Free Software Foundation; either +; version 2.1 of the License, or (at your option) any later version. +; +; FFmpeg is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; Lesser General Public License for more details. +; +; You should have received a copy of the GNU Lesser General Public License +; along with FFmpeg; if not, write to the Free Software Foundation, +; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +%include libavutil/x86/x86util.asm + +SECTION_RODATA +tan1: times 8 dw 13036 +tan2: times 8 dw 27146 +tan3: times 8 dw 43790 +sqrt2: times 8 dw 23170 + +iTab1: dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d +dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61 +dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7 +dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b +iTab2: dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5 +dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04 +dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41 +dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df +iTab3: dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf +dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf +dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d +dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04 +iTab4: dw 0x4b42, 
0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746 +dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac +dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df +dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e + +walkenIdctRounders: times 4 dd 65536 +times 4 dd 3597 +times 4 dd 2260 +times 4 dd 1203 +times 4 dd 120 +times 4 dd 512 + +pb_127: times 8 db 127 + +SECTION .text + +; Temporary storage before the column pass +%define ROW1 xmm6 +%define ROW3 xmm4 +%define ROW5 xmm5 +%define ROW7 xmm7 + +%macro CLEAR_ODD 1 +pxor %1, %1 +%endmacro +%macro PUT_ODD 1 +pshufhw %1, xmm2, 0x1B +%endmacro + +%macro MOV32 2 +%if ARCH_X86_32 +movdqa%2, %1 +%endif +%endmacro + +%macro CLEAR_EVEN 1 +%if ARCH_X86_64 +CLEAR_ODD %1 +%endif +%endmacro + +%macro PUT_EVEN 1 +%if ARCH_X86_64 +PUT_ODD %1 +%else +pshufhw xmm2, xmm2, 0x1B +movdqa%1, xmm2