Re: [FFmpeg-devel] [PATCH] huffyuvencdsp: Convert ff_diff_bytes_mmx to yasm

2015-10-20 Thread James Almer
On 10/19/2015 11:11 PM, Timothy Gu wrote:
> Heavily based upon ff_add_bytes by Christophe Gisquet.
> ---
> 
> Taken into account James' comment, and fixed x86_32. Also saves one additional
> GPR.
> 
> ---
>  libavcodec/x86/Makefile|  1 +
>  libavcodec/x86/huffyuvencdsp.asm   | 73 
> ++
>  libavcodec/x86/huffyuvencdsp_mmx.c | 37 ---
>  3 files changed, 80 insertions(+), 31 deletions(-)
>  create mode 100644 libavcodec/x86/huffyuvencdsp.asm
> 
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 4591e4b..e1b1f0c 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -115,6 +115,7 @@ YASM-OBJS-$(CONFIG_H264QPEL)   += 
> x86/h264_qpel_8bit.o  \
>  YASM-OBJS-$(CONFIG_HPELDSP)+= x86/fpel.o\
>x86/hpeldsp.o
>  YASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o
> +YASM-OBJS-$(CONFIG_HUFFYUVENCDSP)  += x86/huffyuvencdsp.o
>  YASM-OBJS-$(CONFIG_IDCTDSP)+= x86/idctdsp.o
>  YASM-OBJS-$(CONFIG_LLAUDDSP)   += x86/lossless_audiodsp.o
>  YASM-OBJS-$(CONFIG_LLVIDDSP)   += x86/lossless_videodsp.o
> diff --git a/libavcodec/x86/huffyuvencdsp.asm 
> b/libavcodec/x86/huffyuvencdsp.asm
> new file mode 100644
> index 000..e001906
> --- /dev/null
> +++ b/libavcodec/x86/huffyuvencdsp.asm
> @@ -0,0 +1,73 @@
> +;
> +;* SIMD-optimized HuffYUV encoding functions
> +;* Copyright (c) 2000, 2001 Fabrice Bellard
> +;* Copyright (c) 2002-2004 Michael Niedermayer 
> +;*
> +;* MMX optimization by Nick Kurshev 
> +;* Conversion to NASM format by Tiancheng "Timothy" Gu 
> 
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
> +;**
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +section .text
> +
> +INIT_MMX mmx
> +; void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
> +;                        intptr_t w);
> +%if ARCH_X86_32
> +cglobal diff_bytes, 3,5,2, dst, src1, src2
> +%define wq r4q
> +DECLARE_REG_TMP 3
> +mov   wq, r3mp
> +%else
> +cglobal diff_bytes, 4,5,2, dst, src1, src2, w
> +DECLARE_REG_TMP 4
> +%endif ; ARCH_X86_32
> +%define i t0q
> +mov i, wq
> +and i, -2 * mmsize
> +jz  .setup_loop2
> +add dstq, i
> +add src1q, i
> +add src2q, i
> +neg i
> +.loop:
> +mova  m0, [src1q + i]
> +mova  m1, [src1q + i + mmsize]
> +psubb m0, [src2q + i]
> +psubb m1, [src2q + i + mmsize]
> +mova  [dstq + i], m0
> +mova [mmsize + dstq + i], m1
> +add i, 2 * mmsize
> +jl .loop
> +.setup_loop2:
> +and   wq, 2 * mmsize - 1
> +jz  .end
> +add dstq, wq
> +add src1q, wq
> +add src2q, wq
> +neg   wq
> +.loop2:
> +mov  t0b, [src1q + wq]
> +sub  t0b, [src2q + wq]
> +mov  [dstq + wq], t0b
> +inc   wq
> +jl .loop2
> +.end:
> +REP_RET
> diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c 
> b/libavcodec/x86/huffyuvencdsp_mmx.c
> index ee60f4c..c5f81c8 100644
> --- a/libavcodec/x86/huffyuvencdsp_mmx.c
> +++ b/libavcodec/x86/huffyuvencdsp_mmx.c
> @@ -29,33 +29,10 @@
>  #include "libavcodec/huffyuvencdsp.h"
>  #include "libavcodec/mathops.h"
>  
> -#if HAVE_INLINE_ASM
> -
> -static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t 
> *src2,
> -   intptr_t w)
> -{
> -x86_reg i = 0;
> -
> -if (w >= 16)
> -__asm__ volatile (
> -"1: \n\t"
> -"movq  (%2, %0), %%mm0  \n\t"
> -"movq  (%1, %0), %%mm1  \n\t"
> -"psubb %%mm0, %%mm1 \n\t"
> -"movq %%mm1, (%3, %0)   \n\t"
> -"movq 8(%2, %0), 

Re: [FFmpeg-devel] [PATCH] huffyuvencdsp: Convert ff_diff_bytes_mmx to yasm

2015-10-20 Thread Timothy Gu
On Tue, Oct 20, 2015 at 2:18 PM James Almer  wrote:

> Removing this will make the INLINE_MMXEXT if statement below fail to
> compile on
> builds with inline asm disabled (msvc, etc). Even with dead code
> elimination in
> mind you'd need at least a prototype for the relevant functions.
> Just move this line below to keep the INLINE_MMXEXT if statement wrapped
> with it.
>

Good catch!


> Fate passes, so aside from the above it LGTM.
>

Pushed, thanks for the review.

Timothy
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] huffyuvencdsp: Convert ff_diff_bytes_mmx to yasm

2015-10-19 Thread Timothy Gu
Heavily based upon ff_add_bytes by Christophe Gisquet.
---

Taken into account James' comment, and fixed x86_32. Also saves one additional
GPR.

---
 libavcodec/x86/Makefile|  1 +
 libavcodec/x86/huffyuvencdsp.asm   | 73 ++
 libavcodec/x86/huffyuvencdsp_mmx.c | 37 ---
 3 files changed, 80 insertions(+), 31 deletions(-)
 create mode 100644 libavcodec/x86/huffyuvencdsp.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 4591e4b..e1b1f0c 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -115,6 +115,7 @@ YASM-OBJS-$(CONFIG_H264QPEL)   += 
x86/h264_qpel_8bit.o  \
 YASM-OBJS-$(CONFIG_HPELDSP)+= x86/fpel.o\
   x86/hpeldsp.o
 YASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o
+YASM-OBJS-$(CONFIG_HUFFYUVENCDSP)  += x86/huffyuvencdsp.o
 YASM-OBJS-$(CONFIG_IDCTDSP)+= x86/idctdsp.o
 YASM-OBJS-$(CONFIG_LLAUDDSP)   += x86/lossless_audiodsp.o
 YASM-OBJS-$(CONFIG_LLVIDDSP)   += x86/lossless_videodsp.o
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
new file mode 100644
index 000..e001906
--- /dev/null
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -0,0 +1,73 @@
+;
+;* SIMD-optimized HuffYUV encoding functions
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer 
+;*
+;* MMX optimization by Nick Kurshev 
+;* Conversion to NASM format by Tiancheng "Timothy" Gu 
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;**
+
+%include "libavutil/x86/x86util.asm"
+
+section .text
+
+INIT_MMX mmx
+; void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                        intptr_t w);
+%if ARCH_X86_32
+cglobal diff_bytes, 3,5,2, dst, src1, src2
+%define wq r4q
+DECLARE_REG_TMP 3
+mov   wq, r3mp
+%else
+cglobal diff_bytes, 4,5,2, dst, src1, src2, w
+DECLARE_REG_TMP 4
+%endif ; ARCH_X86_32
+%define i t0q
+mov i, wq
+and i, -2 * mmsize
+jz  .setup_loop2
+add dstq, i
+add src1q, i
+add src2q, i
+neg i
+.loop:
+mova  m0, [src1q + i]
+mova  m1, [src1q + i + mmsize]
+psubb m0, [src2q + i]
+psubb m1, [src2q + i + mmsize]
+mova  [dstq + i], m0
+mova [mmsize + dstq + i], m1
+add i, 2 * mmsize
+jl .loop
+.setup_loop2:
+and   wq, 2 * mmsize - 1
+jz  .end
+add dstq, wq
+add src1q, wq
+add src2q, wq
+neg   wq
+.loop2:
+mov  t0b, [src1q + wq]
+sub  t0b, [src2q + wq]
+mov  [dstq + wq], t0b
+inc   wq
+jl .loop2
+.end:
+REP_RET
diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c 
b/libavcodec/x86/huffyuvencdsp_mmx.c
index ee60f4c..c5f81c8 100644
--- a/libavcodec/x86/huffyuvencdsp_mmx.c
+++ b/libavcodec/x86/huffyuvencdsp_mmx.c
@@ -29,33 +29,10 @@
 #include "libavcodec/huffyuvencdsp.h"
 #include "libavcodec/mathops.h"
 
-#if HAVE_INLINE_ASM
-
-static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t 
*src2,
-   intptr_t w)
-{
-x86_reg i = 0;
-
-if (w >= 16)
-__asm__ volatile (
-"1: \n\t"
-"movq  (%2, %0), %%mm0  \n\t"
-"movq  (%1, %0), %%mm1  \n\t"
-"psubb %%mm0, %%mm1 \n\t"
-"movq %%mm1, (%3, %0)   \n\t"
-"movq 8(%2, %0), %%mm0  \n\t"
-"movq 8(%1, %0), %%mm1  \n\t"
-"psubb %%mm0, %%mm1 \n\t"
-"movq %%mm1, 8(%3, %0)  \n\t"
-"add $16, %0\n\t"
-"cmp %4, %0 \n\t"
-" jb 1b \n\t"
-: "+r"