Re: [FFmpeg-devel] [PATCH] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI

2022-08-15 Thread Wang, Bin
-----Original Message-----
From: ffmpeg-devel  On Behalf Of Andreas 
Rheinhardt
Sent: Friday, August 12, 2022 5:11 PM
To: ffmpeg-devel@ffmpeg.org
Subject: Re: [FFmpeg-devel] [PATCH] libavfilter/x86/vf_convolution: add sobel 
filter optimization and unit test with intel AVX512 VNNI

bin.wang-at-intel@ffmpeg.org:
> From: bwang30 
> 
> This commit enabled assembly code with intel AVX512 VNNI and added 
> unit test for sobel filter
> 
> sobel_c: 4537
> sobel_avx512icl 2470
> 
> Signed-off-by: bwang30 
> ---
>  libavfilter/convolution.h |   2 +
>  libavfilter/vf_convolution.c  |   8 ++
>  libavfilter/x86/vf_convolution.asm| 162 ++
>  libavfilter/x86/vf_convolution_init.c |  18 +++
>  tests/checkasm/Makefile   |   1 +
>  tests/checkasm/checkasm.c |   3 +
>  tests/checkasm/checkasm.h |   1 +
>  tests/checkasm/vf_convolution.c   | 116 ++
>  8 files changed, 311 insertions(+)
>  create mode 100644 tests/checkasm/vf_convolution.c
> 
> diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h 
> index 88aabe9a20..143b0fb2d9 100644
> --- a/libavfilter/convolution.h
> +++ b/libavfilter/convolution.h
> @@ -61,4 +61,6 @@ typedef struct ConvolutionContext {
>  } ConvolutionContext;
>  
>  void ff_convolution_init_x86(ConvolutionContext *s);
> +void ff_sobel_init_x86(ConvolutionContext *s);
> +int ff_filter_param_init(AVFilterContext *ctx);
>  #endif
> diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c
> index 9a9c099e6d..98aa952258 100644
> --- a/libavfilter/vf_convolution.c
> +++ b/libavfilter/vf_convolution.c
> @@ -874,6 +874,9 @@ static int param_init(AVFilterContext *ctx)
>  if (s->depth > 8)
>  for (p = 0; p < s->nb_planes; p++)
>  s->filter[p] = filter16_sobel;
> +#if CONFIG_CONVOLUTION_FILTER && ARCH_X86_64
> +ff_sobel_init_x86(s);
> +#endif
>  } else if (!strcmp(ctx->filter->name, "kirsch")) {
>  if (s->depth > 8)
>  for (p = 0; p < s->nb_planes; p++)
> @@ -887,6 +890,11 @@ static int param_init(AVFilterContext *ctx)
>  return 0;
>  }
>  
> +int ff_filter_param_init(AVFilterContext *ctx)
> +{
> +return param_init(ctx);
> +}
> +
>  static int config_input(AVFilterLink *inlink)
>  {
>  AVFilterContext *ctx = inlink->dst;
> diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
> index 754d4d1064..59c807b218 100644
> --- a/libavfilter/x86/vf_convolution.asm
> +++ b/libavfilter/x86/vf_convolution.asm
> @@ -22,6 +22,10 @@
>  
>  SECTION_RODATA
>  half:   dd 0.5
> +data_p1: dd  1
> +data_n1: dd -1
> +data_p2: dd  2
> +data_n2: dd -2
>  
>  SECTION .text
>  
> @@ -154,3 +158,161 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c
>  INIT_XMM sse4
>  FILTER_3X3
>  %endif
> +
> +
> +%macro SOBEL_MUL_16 3
> +movd xmm2, [%2]
> +VPBROADCASTD m2, xmm2
> +movdqu xmm3, [c%1q + xq]
> +vpmovzxbd m3, xmm3
> +vpdpbusd  m%3, m3, m2
> +%endmacro
> +
> +%macro SOBEL_ADD_16 2
> +movdqu xmm3, [c%1q + xq]
> +vpmovzxbd m3, xmm3
> +vpaddd  m%2, m3
> +%endmacro
> +
> +
> +%macro SOBEL_MUL 2
> +movzx ptrd, byte [c%1q + xq]
> +imul  ptrd, [%2]
> +add   rd, ptrd
> +%endmacro
> +
> +%macro SOBEL_ADD 1
> +movzx ptrd, byte [c%1q + xq]
> +add   rd, ptrd
> +%endmacro
> +
> +; void filter_sobel_avx512(uint8_t *dst, int width,
> +;  float scale, float delta, const int *const matrix,
> +;  const uint8_t *c[], int peak, int radius,
> +;  int dstride, int stride)
> +%macro FILTER_SOBEL 0
> +%if UNIX64
> +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
> +%else
> +cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
> +%endif
> +%if WIN64
> +SWAP xmm0, xmm2
> +SWAP xmm1, xmm3
> +mov  r2q, matrixmp
> +mov  r3q, ptrmp
> +DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
> +%endif
> +movsxdifnidn widthq, widthd
> +VBROADCASTSS m0, xmm0
> +VBROADCASTSS m1, xmm1
> +pxor  m6, m6
> +mov   c0q, [ptrq + 0*gprsize]
> +mov   c1q, [ptrq + 1*gprsize]
> +mov   c2q, [ptrq + 2*gprsize]
> +mov   c3q, [ptrq + 3*gprsize]
> +mov   c4q, [ptrq + 4*gprsize]
> +mov   c5q, [ptrq + 5*gprsize]
> +mov   c6q, [ptrq + 6*gprsize]
> +mov   c7q, [pt

Re: [FFmpeg-devel] [PATCH] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI

2022-08-12 Thread Andreas Rheinhardt
bin.wang-at-intel@ffmpeg.org:
> From: bwang30 
> 
> This commit enabled assembly code with intel AVX512 VNNI and added unit test 
> for sobel filter
> 
> sobel_c: 4537
> sobel_avx512icl 2470
> 
> Signed-off-by: bwang30 
> ---
>  libavfilter/convolution.h |   2 +
>  libavfilter/vf_convolution.c  |   8 ++
>  libavfilter/x86/vf_convolution.asm| 162 ++
>  libavfilter/x86/vf_convolution_init.c |  18 +++
>  tests/checkasm/Makefile   |   1 +
>  tests/checkasm/checkasm.c |   3 +
>  tests/checkasm/checkasm.h |   1 +
>  tests/checkasm/vf_convolution.c   | 116 ++
>  8 files changed, 311 insertions(+)
>  create mode 100644 tests/checkasm/vf_convolution.c
> 
> diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h
> index 88aabe9a20..143b0fb2d9 100644
> --- a/libavfilter/convolution.h
> +++ b/libavfilter/convolution.h
> @@ -61,4 +61,6 @@ typedef struct ConvolutionContext {
>  } ConvolutionContext;
>  
>  void ff_convolution_init_x86(ConvolutionContext *s);
> +void ff_sobel_init_x86(ConvolutionContext *s);
> +int ff_filter_param_init(AVFilterContext *ctx);
>  #endif
> diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c
> index 9a9c099e6d..98aa952258 100644
> --- a/libavfilter/vf_convolution.c
> +++ b/libavfilter/vf_convolution.c
> @@ -874,6 +874,9 @@ static int param_init(AVFilterContext *ctx)
>  if (s->depth > 8)
>  for (p = 0; p < s->nb_planes; p++)
>  s->filter[p] = filter16_sobel;
> +#if CONFIG_CONVOLUTION_FILTER && ARCH_X86_64
> +ff_sobel_init_x86(s);
> +#endif
>  } else if (!strcmp(ctx->filter->name, "kirsch")) {
>  if (s->depth > 8)
>  for (p = 0; p < s->nb_planes; p++)
> @@ -887,6 +890,11 @@ static int param_init(AVFilterContext *ctx)
>  return 0;
>  }
>  
> +int ff_filter_param_init(AVFilterContext *ctx)
> +{
> +return param_init(ctx);
> +}
> +
>  static int config_input(AVFilterLink *inlink)
>  {
>  AVFilterContext *ctx = inlink->dst;
> diff --git a/libavfilter/x86/vf_convolution.asm 
> b/libavfilter/x86/vf_convolution.asm
> index 754d4d1064..59c807b218 100644
> --- a/libavfilter/x86/vf_convolution.asm
> +++ b/libavfilter/x86/vf_convolution.asm
> @@ -22,6 +22,10 @@
>  
>  SECTION_RODATA
>  half:   dd 0.5
> +data_p1: dd  1
> +data_n1: dd -1
> +data_p2: dd  2
> +data_n2: dd -2
>  
>  SECTION .text
>  
> @@ -154,3 +158,161 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, 
> matrix, ptr, c0, c1, c2, c
>  INIT_XMM sse4
>  FILTER_3X3
>  %endif
> +
> +
> +%macro SOBEL_MUL_16 3
> +movd xmm2, [%2]
> +VPBROADCASTD m2, xmm2
> +movdqu xmm3, [c%1q + xq]
> +vpmovzxbd m3, xmm3
> +vpdpbusd  m%3, m3, m2
> +%endmacro
> +
> +%macro SOBEL_ADD_16 2
> +movdqu xmm3, [c%1q + xq]
> +vpmovzxbd m3, xmm3
> +vpaddd  m%2, m3
> +%endmacro
> +
> +
> +%macro SOBEL_MUL 2
> +movzx ptrd, byte [c%1q + xq]
> +imul  ptrd, [%2]
> +add   rd, ptrd
> +%endmacro
> +
> +%macro SOBEL_ADD 1
> +movzx ptrd, byte [c%1q + xq]
> +add   rd, ptrd
> +%endmacro
> +
> +; void filter_sobel_avx512(uint8_t *dst, int width,
> +;  float scale, float delta, const int *const matrix,
> +;  const uint8_t *c[], int peak, int radius,
> +;  int dstride, int stride)
> +%macro FILTER_SOBEL 0
> +%if UNIX64
> +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, 
> c5, c6, c7, c8, r, x
> +%else
> +cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, 
> c2, c3, c4, c5, c6, c7, c8, r, x
> +%endif
> +%if WIN64
> +SWAP xmm0, xmm2
> +SWAP xmm1, xmm3
> +mov  r2q, matrixmp
> +mov  r3q, ptrmp
> +DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, 
> r, x
> +%endif
> +movsxdifnidn widthq, widthd
> +VBROADCASTSS m0, xmm0
> +VBROADCASTSS m1, xmm1
> +pxor  m6, m6
> +mov   c0q, [ptrq + 0*gprsize]
> +mov   c1q, [ptrq + 1*gprsize]
> +mov   c2q, [ptrq + 2*gprsize]
> +mov   c3q, [ptrq + 3*gprsize]
> +mov   c4q, [ptrq + 4*gprsize]
> +mov   c5q, [ptrq + 5*gprsize]
> +mov   c6q, [ptrq + 6*gprsize]
> +mov   c7q, [ptrq + 7*gprsize]
> +mov   c8q, [ptrq + 8*gprsize]
> +
> +xor   xq, xq
> +cmp   widthq, mmsize/4
> +jl .loop2
> +
> +mov   rq, widthq
> +and   rq, mmsize/4-1
> +sub   widthq, rq
> +
> +.loop1:
> +pxor m4, m4 
> +pxor m5, m5 
> +
> +;Gx
> +SOBEL_MUL_16 0, data_n1, 4
> +SOBEL_MUL_16 1, data_n2, 4
> +SOBEL_MUL_16 2, data_n1, 4
> +SOBEL_ADD_16 6, 4
> +SOBEL_MUL_16 7, data_p2, 4
> +SOBEL_ADD_16 8, 4
> +
> +cvtdq2ps  m4, m4
> +mulps m4, m4
> +
> +;Gy
> +SOBEL_MUL_16 0, data_n1, 5
> +SOBEL_ADD_16 2, 5
> +SOBEL_MUL_16 3, data_n2, 5
> +SOBEL_MUL_16 5, data_p2, 5
> +SOBEL_MUL_16 6, data_n1, 5
> +SOBEL_ADD_16 8, 5
> 

[FFmpeg-devel] [PATCH] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI

2022-08-12 Thread bin . wang-at-intel . com
From: bwang30 

This commit enables assembly code using Intel AVX512 VNNI and adds a unit test
for the sobel filter

sobel_c: 4537
sobel_avx512icl: 2470

Signed-off-by: bwang30 
---
 libavfilter/convolution.h |   2 +
 libavfilter/vf_convolution.c  |   8 ++
 libavfilter/x86/vf_convolution.asm| 162 ++
 libavfilter/x86/vf_convolution_init.c |  18 +++
 tests/checkasm/Makefile   |   1 +
 tests/checkasm/checkasm.c |   3 +
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/vf_convolution.c   | 116 ++
 8 files changed, 311 insertions(+)
 create mode 100644 tests/checkasm/vf_convolution.c

diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h
index 88aabe9a20..143b0fb2d9 100644
--- a/libavfilter/convolution.h
+++ b/libavfilter/convolution.h
@@ -61,4 +61,6 @@ typedef struct ConvolutionContext {
 } ConvolutionContext;
 
 void ff_convolution_init_x86(ConvolutionContext *s);
+void ff_sobel_init_x86(ConvolutionContext *s);
+int ff_filter_param_init(AVFilterContext *ctx);
 #endif
diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c
index 9a9c099e6d..98aa952258 100644
--- a/libavfilter/vf_convolution.c
+++ b/libavfilter/vf_convolution.c
@@ -874,6 +874,9 @@ static int param_init(AVFilterContext *ctx)
 if (s->depth > 8)
 for (p = 0; p < s->nb_planes; p++)
 s->filter[p] = filter16_sobel;
+#if CONFIG_CONVOLUTION_FILTER && ARCH_X86_64
+ff_sobel_init_x86(s);
+#endif
 } else if (!strcmp(ctx->filter->name, "kirsch")) {
 if (s->depth > 8)
 for (p = 0; p < s->nb_planes; p++)
@@ -887,6 +890,11 @@ static int param_init(AVFilterContext *ctx)
 return 0;
 }
 
+int ff_filter_param_init(AVFilterContext *ctx)
+{
+return param_init(ctx);
+}
+
 static int config_input(AVFilterLink *inlink)
 {
 AVFilterContext *ctx = inlink->dst;
diff --git a/libavfilter/x86/vf_convolution.asm 
b/libavfilter/x86/vf_convolution.asm
index 754d4d1064..59c807b218 100644
--- a/libavfilter/x86/vf_convolution.asm
+++ b/libavfilter/x86/vf_convolution.asm
@@ -22,6 +22,10 @@
 
 SECTION_RODATA
 half:   dd 0.5
+data_p1: dd  1
+data_n1: dd -1
+data_p2: dd  2
+data_n2: dd -2
 
 SECTION .text
 
@@ -154,3 +158,161 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, 
matrix, ptr, c0, c1, c2, c
 INIT_XMM sse4
 FILTER_3X3
 %endif
+
+
+%macro SOBEL_MUL_16 3
+movd xmm2, [%2]
+VPBROADCASTD m2, xmm2
+movdqu xmm3, [c%1q + xq]
+vpmovzxbd m3, xmm3
+vpdpbusd  m%3, m3, m2
+%endmacro
+
+%macro SOBEL_ADD_16 2
+movdqu xmm3, [c%1q + xq]
+vpmovzxbd m3, xmm3
+vpaddd  m%2, m3
+%endmacro
+
+
+%macro SOBEL_MUL 2
+movzx ptrd, byte [c%1q + xq]
+imul  ptrd, [%2]
+add   rd, ptrd
+%endmacro
+
+%macro SOBEL_ADD 1
+movzx ptrd, byte [c%1q + xq]
+add   rd, ptrd
+%endmacro
+
+; void filter_sobel_avx512(uint8_t *dst, int width,
+;  float scale, float delta, const int *const matrix,
+;  const uint8_t *c[], int peak, int radius,
+;  int dstride, int stride)
+%macro FILTER_SOBEL 0
+%if UNIX64
+cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, 
c5, c6, c7, c8, r, x
+%else
+cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, 
c2, c3, c4, c5, c6, c7, c8, r, x
+%endif
+%if WIN64
+SWAP xmm0, xmm2
+SWAP xmm1, xmm3
+mov  r2q, matrixmp
+mov  r3q, ptrmp
+DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, 
r, x
+%endif
+movsxdifnidn widthq, widthd
+VBROADCASTSS m0, xmm0
+VBROADCASTSS m1, xmm1
+pxor  m6, m6
+mov   c0q, [ptrq + 0*gprsize]
+mov   c1q, [ptrq + 1*gprsize]
+mov   c2q, [ptrq + 2*gprsize]
+mov   c3q, [ptrq + 3*gprsize]
+mov   c4q, [ptrq + 4*gprsize]
+mov   c5q, [ptrq + 5*gprsize]
+mov   c6q, [ptrq + 6*gprsize]
+mov   c7q, [ptrq + 7*gprsize]
+mov   c8q, [ptrq + 8*gprsize]
+
+xor   xq, xq
+cmp   widthq, mmsize/4
+jl .loop2
+
+mov   rq, widthq
+and   rq, mmsize/4-1
+sub   widthq, rq
+
+.loop1:
+pxor m4, m4 
+pxor m5, m5 
+
+;Gx
+SOBEL_MUL_16 0, data_n1, 4
+SOBEL_MUL_16 1, data_n2, 4
+SOBEL_MUL_16 2, data_n1, 4
+SOBEL_ADD_16 6, 4
+SOBEL_MUL_16 7, data_p2, 4
+SOBEL_ADD_16 8, 4
+
+cvtdq2ps  m4, m4
+mulps m4, m4
+
+;Gy
+SOBEL_MUL_16 0, data_n1, 5
+SOBEL_ADD_16 2, 5
+SOBEL_MUL_16 3, data_n2, 5
+SOBEL_MUL_16 5, data_p2, 5
+SOBEL_MUL_16 6, data_n1, 5
+SOBEL_ADD_16 8, 5
+
+cvtdq2psm5, m5
+VFMADD231PS m4, m5, m5
+
+sqrtpsm4, m4
+mulps m4, m0   ; sum *= scale
+addps m4, m1   ; sum += delta
+cvttps2dq m4, m4
+vpmovusdb xmm4, m4
+movdqu[dstq + xq], xmm4
+
+add xq, mmsize/4
+cmp xq, widthq
+jl .loop1
+
+add widthq, rq
+cmp xq, widthq
+jge .end
+
+.loop2:
+xor