Re: [FFmpeg-devel] [PATCH v2] avcodec/arm/hevcdsp_sao : add NEON optimization for sao

2018-03-25 Thread Shengbin Meng


> On 22 Mar 2018, at 20:51, Yingming Fan  wrote:
> 
> From: Meng Wang 
> 
> Signed-off-by: Meng Wang 
> ---
> This v2 patch removes the unused code 'stride_dst /= sizeof(uint8_t);' that was
> present in v1. V1 had this code because we based it on the hevc dsp template code.
> 
> As the FFmpeg hevc decoder has no SAO NEON optimization, we add sao_band and
> sao_edge NEON code in this patch.
> I already submitted a patch called 'checkasm/hevc_sao : add hevc_sao for
> checkasm' several days ago.
> The results below were printed by the hevc_sao checkasm test on an armv7
> device (Nexus 5). From the results we can see that hevc_sao_band is sped up
> ~2x and hevc_sao_edge is sped up ~4x.
> FATE was also tested on an armv7 device and on macOS.
> 
> hevc_sao_band_8x8_8_c: 804.9
> hevc_sao_band_8x8_8_neon: 452.4
> hevc_sao_band_16x16_8_c: 2638.1
> hevc_sao_band_16x16_8_neon: 1169.9
> hevc_sao_band_32x32_8_c: 9259.9
> hevc_sao_band_32x32_8_neon: 3956.1
> hevc_sao_band_48x48_8_c: 20344.6
> hevc_sao_band_48x48_8_neon: 8649.6
> hevc_sao_band_64x64_8_c: 35684.6
> hevc_sao_band_64x64_8_neon: 15213.1
> hevc_sao_edge_8x8_8_c: 1761.6
> hevc_sao_edge_8x8_8_neon: 414.6
> hevc_sao_edge_16x16_8_c: 6844.4
> hevc_sao_edge_16x16_8_neon: 1589.9
> hevc_sao_edge_32x32_8_c: 27156.4
> hevc_sao_edge_32x32_8_neon: 6116.6
> hevc_sao_edge_48x48_8_c: 60004.6
> hevc_sao_edge_48x48_8_neon: 13686.4
> hevc_sao_edge_64x64_8_c: 106708.1
> hevc_sao_edge_64x64_8_neon: 24240.1
> 
> libavcodec/arm/Makefile|   3 +-
> libavcodec/arm/hevcdsp_init_neon.c |  59 
> libavcodec/arm/hevcdsp_sao_neon.S  | 181 +
> 3 files changed, 242 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/arm/hevcdsp_sao_neon.S
> 
> diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
> index 1eeac5449e..9c164f82ae 100644
> --- a/libavcodec/arm/Makefile
> +++ b/libavcodec/arm/Makefile
> @@ -136,7 +136,8 @@ NEON-OBJS-$(CONFIG_DCA_DECODER)+= 
> arm/synth_filter_neon.o
> NEON-OBJS-$(CONFIG_HEVC_DECODER)   += arm/hevcdsp_init_neon.o   \
>   arm/hevcdsp_deblock_neon.o\
>   arm/hevcdsp_idct_neon.o   \
> -  arm/hevcdsp_qpel_neon.o
> +  arm/hevcdsp_qpel_neon.o   \
> +  arm/hevcdsp_sao_neon.o
> NEON-OBJS-$(CONFIG_RV30_DECODER)   += arm/rv34dsp_neon.o
> NEON-OBJS-$(CONFIG_RV40_DECODER)   += arm/rv34dsp_neon.o\
>   arm/rv40dsp_neon.o
> diff --git a/libavcodec/arm/hevcdsp_init_neon.c 
> b/libavcodec/arm/hevcdsp_init_neon.c
> index a4628d2a93..af68e24f93 100644
> --- a/libavcodec/arm/hevcdsp_init_neon.c
> +++ b/libavcodec/arm/hevcdsp_init_neon.c
> @@ -21,8 +21,16 @@
> #include "libavutil/attributes.h"
> #include "libavutil/arm/cpu.h"
> #include "libavcodec/hevcdsp.h"
> +#include "libavcodec/avcodec.h"
> #include "hevcdsp_arm.h"
> 
> +void ff_hevc_sao_band_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src,
> +  ptrdiff_t stride_dst, ptrdiff_t stride_src,
> +  int16_t *sao_offset_val, int 
> sao_left_class,
> +  int width, int height);
> +void ff_hevc_sao_edge_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src, 
> ptrdiff_t stride_dst, int16_t *sao_offset_val,
> +  int eo, int width, int height);
> +
> void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int 
> _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
> void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int 
> _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
> void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int 
> *_tc, uint8_t *_no_p, uint8_t *_no_q);
> @@ -142,6 +150,47 @@ QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
> QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
> #undef QPEL_FUNC_UW
> 
> +void ff_hevc_sao_band_filter_neon_8(uint8_t *dst, uint8_t *src, ptrdiff_t 
> stride_dst, ptrdiff_t stride_src, int width, int height, int16_t 
> *offset_table);
> +
> +void ff_hevc_sao_band_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src,
> +  ptrdiff_t stride_dst, ptrdiff_t stride_src,
> +  int16_t *sao_offset_val, int 
> sao_left_class,
> +  int width, int height) {
> +uint8_t *dst = (uint8_t *)_dst;
> +uint8_t *src = (uint8_t *)_src;
These casts are also not needed: _dst and _src are already uint8_t *, since we are only handling 8-bit pixels here.

> +int16_t offset_table[32] = {0};
> +int k;
> +
> +for (k = 0; k < 4; k++) {
> +offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
> +}
> +
> +ff_hevc_sao_band_filter_neon_8(dst, src, 

[FFmpeg-devel] [PATCH v2] avcodec/arm/hevcdsp_sao : add NEON optimization for sao

2018-03-22 Thread Yingming Fan
From: Meng Wang 

Signed-off-by: Meng Wang 
---
This v2 patch removes the unused code 'stride_dst /= sizeof(uint8_t);' that was
present in v1. V1 had this code because we based it on the hevc dsp template code.

As the FFmpeg hevc decoder has no SAO NEON optimization, we add sao_band and
sao_edge NEON code in this patch.
I already submitted a patch called 'checkasm/hevc_sao : add hevc_sao for
checkasm' several days ago.
The results below were printed by the hevc_sao checkasm test on an armv7
device (Nexus 5). From the results we can see that hevc_sao_band is sped up
~2x and hevc_sao_edge is sped up ~4x.
FATE was also tested on an armv7 device and on macOS.

hevc_sao_band_8x8_8_c: 804.9
hevc_sao_band_8x8_8_neon: 452.4
hevc_sao_band_16x16_8_c: 2638.1
hevc_sao_band_16x16_8_neon: 1169.9
hevc_sao_band_32x32_8_c: 9259.9
hevc_sao_band_32x32_8_neon: 3956.1
hevc_sao_band_48x48_8_c: 20344.6
hevc_sao_band_48x48_8_neon: 8649.6
hevc_sao_band_64x64_8_c: 35684.6
hevc_sao_band_64x64_8_neon: 15213.1
hevc_sao_edge_8x8_8_c: 1761.6
hevc_sao_edge_8x8_8_neon: 414.6
hevc_sao_edge_16x16_8_c: 6844.4
hevc_sao_edge_16x16_8_neon: 1589.9
hevc_sao_edge_32x32_8_c: 27156.4
hevc_sao_edge_32x32_8_neon: 6116.6
hevc_sao_edge_48x48_8_c: 60004.6
hevc_sao_edge_48x48_8_neon: 13686.4
hevc_sao_edge_64x64_8_c: 106708.1
hevc_sao_edge_64x64_8_neon: 24240.1

 libavcodec/arm/Makefile|   3 +-
 libavcodec/arm/hevcdsp_init_neon.c |  59 
 libavcodec/arm/hevcdsp_sao_neon.S  | 181 +
 3 files changed, 242 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/arm/hevcdsp_sao_neon.S

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 1eeac5449e..9c164f82ae 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -136,7 +136,8 @@ NEON-OBJS-$(CONFIG_DCA_DECODER)+= 
arm/synth_filter_neon.o
 NEON-OBJS-$(CONFIG_HEVC_DECODER)   += arm/hevcdsp_init_neon.o   \
   arm/hevcdsp_deblock_neon.o\
   arm/hevcdsp_idct_neon.o   \
-  arm/hevcdsp_qpel_neon.o
+  arm/hevcdsp_qpel_neon.o   \
+  arm/hevcdsp_sao_neon.o
 NEON-OBJS-$(CONFIG_RV30_DECODER)   += arm/rv34dsp_neon.o
 NEON-OBJS-$(CONFIG_RV40_DECODER)   += arm/rv34dsp_neon.o\
   arm/rv40dsp_neon.o
diff --git a/libavcodec/arm/hevcdsp_init_neon.c 
b/libavcodec/arm/hevcdsp_init_neon.c
index a4628d2a93..af68e24f93 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -21,8 +21,16 @@
 #include "libavutil/attributes.h"
 #include "libavutil/arm/cpu.h"
 #include "libavcodec/hevcdsp.h"
+#include "libavcodec/avcodec.h"
 #include "hevcdsp_arm.h"
 
+void ff_hevc_sao_band_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src,
+  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+  int16_t *sao_offset_val, int sao_left_class,
+  int width, int height);
+void ff_hevc_sao_edge_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src, 
ptrdiff_t stride_dst, int16_t *sao_offset_val,
+  int eo, int width, int height);
+
 void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int 
_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
 void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int 
_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
 void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int 
*_tc, uint8_t *_no_p, uint8_t *_no_q);
@@ -142,6 +150,47 @@ QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
 QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
 #undef QPEL_FUNC_UW
 
+void ff_hevc_sao_band_filter_neon_8(uint8_t *dst, uint8_t *src, ptrdiff_t 
stride_dst, ptrdiff_t stride_src, int width, int height, int16_t *offset_table);
+
+void ff_hevc_sao_band_filter_neon_8_wrapper(uint8_t *_dst, uint8_t *_src,
+  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+  int16_t *sao_offset_val, int sao_left_class,
+  int width, int height) {
+uint8_t *dst = (uint8_t *)_dst;
+uint8_t *src = (uint8_t *)_src;
+int16_t offset_table[32] = {0};
+int k;
+
+for (k = 0; k < 4; k++) {
+offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
+}
+
+ff_hevc_sao_band_filter_neon_8(dst, src, stride_dst, stride_src, width, 
height, offset_table);
+}
+
+void ff_hevc_sao_edge_filter_neon_8(uint8_t *dst, uint8_t *src, ptrdiff_t 
stride_dst, ptrdiff_t stride_src, int width, int height,
+int a_stride, int b_stride, int16_t 
*sao_offset_val, uint8_t *edge_idx);
+
+void ff_hevc_sao_edge_filter_neon_8_wrapper(uint8_t *_dst,