Re: [libav-devel] [PATCH 2/2] hevc: x86: Add add_residual optimizations

2016-10-19 Thread Henrik Gramner
On Wed, Oct 19, 2016 at 10:18 AM, Diego Biurrun  wrote:
> +%macro ADD_RES_MMX_4_8 0
> +mova  m2, [r1]
> +mova  m4, [r1+8]
> +pxor  m3, m3
> +psubw m3, m2
> +packuswb  m2, m2
> +packuswb  m3, m3
> +pxor  m5, m5
> +psubw m5, m4
> +packuswb  m4, m4
> +packuswb  m5, m5
> +
> +movh  m0, [r0]
> +movh  m1, [r0+r2]
> +paddusb   m0, m2
> +paddusb   m1, m4
> +psubusb   m0, m3
> +psubusb   m1, m5
> +movh    [r0], m0
> +movh [r0+r2], m1
> +%endmacro

mova      m0, [r1]
mova      m2, [r1+8]
pxor      m1, m1
pxor      m3, m3
psubw     m1, m0
psubw     m3, m2
packuswb  m0, m2
packuswb  m1, m3

movd      m2, [r0]
movd      m3, [r0+r2]
punpckldq m2, m3
paddusb   m0, m2
psubusb   m0, m1
movd    [r0], m0
psrlq     m0, 32
movd [r0+r2], m0

[...]

> +cglobal hevc_add_residual_4_8, 3, 4, 6

r3 isn't used, no need to reserve it.
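
A minimal sketch of what the declaration would then look like (assuming
the body really only needs r0-r2, which seems to be the case here):

cglobal hevc_add_residual_4_8, 3, 3, 6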

[...]

> +%if cpuflag(avx)
> +psubw m3, m0, m4
> +psubw m5, m0, m6
> +%else
> +mova  m3, m0
> +mova  m5, m0
> +psubw m3, m4
> +psubw m5, m6
> +%endif

Pointless %else. x86inc will do this automatically for non-AVX when
3-arg syntax is used.
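
In other words, just the 3-arg form should be enough; x86inc expands it
to mova + psubw itself when AVX isn't available and the destination
differs from the first source:

psubw m3, m0, m4
psubw m5, m0, m6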

[...]

> +dec   r4d
> +jnz .loop

Nit: jg .loop
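
A sketch of the loop tail with that change (equivalent here since the
counter starts out positive; jg just exits cleanly instead of wrapping
around if the count ever ends up non-positive):

dec   r4d
jg    .loop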

[...]

> +cglobal hevc_add_residual_4_10,3,4, 6

r3 isn't used.

[...]

> +cglobal hevc_add_residual_8_10,3,5,6

r4 isn't used.


Re: [libav-devel] [PATCH 2/2] hevc: x86: Add add_residual optimizations

2016-10-19 Thread Luca Barbato
On 19/10/2016 10:18, Diego Biurrun wrote:
> Currently on its last run through oracle, should be good to go.

Ok if it survives.

lu


[libav-devel] [PATCH 2/2] hevc: x86: Add add_residual optimizations

2016-10-19 Thread Diego Biurrun
From: Pierre Edouard Lepere 

Initially written by Pierre Edouard Lepere, extended by James Almer.

Signed-off-by: Alexandra Hájková 
Signed-off-by: Diego Biurrun 
---

consistent macro names, indentation, cosmetics

Currently on its last run through oracle, should be good to go.

 libavcodec/x86/Makefile |   7 +-
 libavcodec/x86/hevc_add_res.asm | 385 
 libavcodec/x86/hevcdsp_init.c   |  42 +
 3 files changed, 431 insertions(+), 3 deletions(-)
 create mode 100644 libavcodec/x86/hevc_add_res.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index a38535b..094c1fa 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -115,9 +115,10 @@ YASM-OBJS-$(CONFIG_AAC_DECODER)+= x86/sbrdsp.o
 YASM-OBJS-$(CONFIG_APE_DECODER)+= x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)+= x86/dcadsp.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)  += x86/dnxhdenc.o
-YASM-OBJS-$(CONFIG_HEVC_DECODER)   += x86/hevc_deblock.o\
-  x86/hevc_mc.o \
-  x86/hevc_idct.o
+YASM-OBJS-$(CONFIG_HEVC_DECODER)   += x86/hevc_add_res.o\
+  x86/hevc_deblock.o\
+  x86/hevc_idct.o   \
+  x86/hevc_mc.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)+= x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)   += x86/rv40dsp.o
diff --git a/libavcodec/x86/hevc_add_res.asm b/libavcodec/x86/hevc_add_res.asm
new file mode 100644
index 000..b6d0c7a
--- /dev/null
+++ b/libavcodec/x86/hevc_add_res.asm
@@ -0,0 +1,385 @@
+; *
+; * Provide SIMD optimizations for add_residual functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of Libav.
+; *
+; * Libav is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * Libav is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with Libav; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; ******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+max_pixels_10:  times 16  dw ((1 << 10)-1)
+
+SECTION .text
+
+; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
+%macro ADD_RES_MMX_4_8 0
+mova  m2, [r1]
+mova  m4, [r1+8]
+pxor  m3, m3
+psubw m3, m2
+packuswb  m2, m2
+packuswb  m3, m3
+pxor  m5, m5
+psubw m5, m4
+packuswb  m4, m4
+packuswb  m5, m5
+
+movh  m0, [r0]
+movh  m1, [r0+r2]
+paddusb   m0, m2
+paddusb   m1, m4
+psubusb   m0, m3
+psubusb   m1, m5
+movh    [r0], m0
+movh [r0+r2], m1
+%endmacro
+
+
+INIT_MMX mmxext
+; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual_4_8, 3, 4, 6
+ADD_RES_MMX_4_8
+add   r1, 16
+lea   r0, [r0+r2*2]
+ADD_RES_MMX_4_8
+RET
+
+%macro ADD_RES_SSE_8_8 0
+pxor  m3, m3
+mova  m4, [r1]
+mova  m6, [r1+16]
+mova  m0, [r1+32]
+mova  m2, [r1+48]
+psubw m5, m3, m4
+psubw m7, m3, m6
+psubw m1, m3, m0
+packuswb  m4, m0
+packuswb  m5, m1
+psubw m3, m2
+packuswb  m6, m2
+packuswb  m7, m3
+
+movq  m0, [r0]
+movq  m1, [r0+r2]
+movhps  m0, [r0+r2*2]
+movhps  m1, [r0+r3]
+paddusb   m0, m4
+paddusb   m1, m6
+psubusb   m0, m5
+psubusb   m1, m7
+movq        [r0], m0
+movq     [r0+r2], m1
+movhps [r0+2*r2], m0
+movhps   [r0+r3], m1
+%endmacro
+
+%macro ADD_RES_SSE_16_32_8 3
+mova 

[libav-devel] [PATCH 2/2] hevc: x86: Add add_residual optimizations

2016-10-17 Thread Luca Barbato
From: Pierre Edouard Lepere 

Initially written by Pierre Edouard Lepere, extended by James Almer.

Signed-off-by: Alexandra Hájková 
Signed-off-by: Luca Barbato 
---
 libavcodec/x86/Makefile |   3 +-
 libavcodec/x86/hevc_add_res.asm | 391 
 libavcodec/x86/hevcdsp_init.c   |  40 
 3 files changed, 433 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/x86/hevc_add_res.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index a38535b..7574085 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -117,7 +117,8 @@ YASM-OBJS-$(CONFIG_DCA_DECODER)+= x86/dcadsp.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)  += x86/dnxhdenc.o
 YASM-OBJS-$(CONFIG_HEVC_DECODER)   += x86/hevc_deblock.o\
   x86/hevc_mc.o \
-  x86/hevc_idct.o
+  x86/hevc_idct.o   \
+  x86/hevc_add_res.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)+= x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)   += x86/rv40dsp.o
diff --git a/libavcodec/x86/hevc_add_res.asm b/libavcodec/x86/hevc_add_res.asm
new file mode 100644
index 000..0e6706b
--- /dev/null
+++ b/libavcodec/x86/hevc_add_res.asm
@@ -0,0 +1,391 @@
+; *
+; * Provide SIMD optimizations for add_residual functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of Libav.
+; *
+; * Libav is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * Libav is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with Libav; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; ******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+max_pixels_10:  times 16  dw ((1 << 10)-1)
+
+SECTION .text
+
+; the add_res macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
+%macro ADD_RES_MMX_4_8 0
+mova  m2, [r1]
+mova  m4, [r1+8]
+pxor  m3, m3
+psubw m3, m2
+packuswb  m2, m2
+packuswb  m3, m3
+pxor  m5, m5
+psubw m5, m4
+packuswb  m4, m4
+packuswb  m5, m5
+
+movh  m0, [r0 ]
+movh  m1, [r0+r2  ]
+paddusb   m0, m2
+paddusb   m1, m4
+psubusb   m0, m3
+psubusb   m1, m5
+movh   [r0 ], m0
+movh   [r0+r2  ], m1
+%endmacro
+
+
+INIT_MMX mmxext
+; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual_4_8, 3, 4, 6
+ADD_RES_MMX_4_8
+add   r1, 16
+lea   r0, [r0+r2*2]
+ADD_RES_MMX_4_8
+RET
+
+%macro ADD_RES_SSE_8_8 0
+pxor  m3, m3
+mova  m4, [r1]
+mova  m6, [r1+16]
+mova  m0, [r1+32]
+mova  m2, [r1+48]
+psubw m5, m3, m4
+psubw m7, m3, m6
+psubw m1, m3, m0
+packuswb  m4, m0
+packuswb  m5, m1
+psubw m3, m2
+packuswb  m6, m2
+packuswb  m7, m3
+
+movq    m0, [r0 ]
+movq    m1, [r0+r2  ]
+movhps  m0, [r0+r2*2]
+movhps  m1, [r0+r3  ]
+paddusb m0, m4
+paddusb m1, m6
+psubusb m0, m5
+psubusb m1, m7
+movq [r0 ], m0
+movq [r0+r2  ], m1
+movhps   [r0+2*r2], m0
+movhps   [r0+r3  ], m1
+%endmacro
+
+%macro ADD_RES_SSE_16_32_8 3
+mova xm2, [r1+%1   ]
+mova xm6, [r1+%1+16]
+%if cpuflag(avx2)
+vinserti128   m2, m2, [r1+%1+32], 1
+vinserti128   m6, m6, [r1+%1+48], 1
+%endif
+%if cpuflag(avx)
+psubw m1, m0, m2
+psubw m5, m0, m6
+%else
+mova  m1, m0
+mova