Re: [FFmpeg-devel] [PATCH] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI
-Original Message- From: ffmpeg-devel On Behalf Of Andreas Rheinhardt Sent: Friday, August 12, 2022 5:11 PM To: ffmpeg-devel@ffmpeg.org Subject: Re: [FFmpeg-devel] [PATCH] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI bin.wang-at-intel@ffmpeg.org: > From: bwang30 > > This commit enabled assembly code with intel AVX512 VNNI and added > unit test for sobel filter > > sobel_c: 4537 > sobel_avx512icl 2470 > > Signed-off-by: bwang30 > --- > libavfilter/convolution.h | 2 + > libavfilter/vf_convolution.c | 8 ++ > libavfilter/x86/vf_convolution.asm| 162 ++ > libavfilter/x86/vf_convolution_init.c | 18 +++ > tests/checkasm/Makefile | 1 + > tests/checkasm/checkasm.c | 3 + > tests/checkasm/checkasm.h | 1 + > tests/checkasm/vf_convolution.c | 116 ++ > 8 files changed, 311 insertions(+) > create mode 100644 tests/checkasm/vf_convolution.c > > diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h > index 88aabe9a20..143b0fb2d9 100644 > --- a/libavfilter/convolution.h > +++ b/libavfilter/convolution.h > @@ -61,4 +61,6 @@ typedef struct ConvolutionContext { } > ConvolutionContext; > > void ff_convolution_init_x86(ConvolutionContext *s); > +void ff_sobel_init_x86(ConvolutionContext *s); int > +ff_filter_param_init(AVFilterContext *ctx); > #endif > diff --git a/libavfilter/vf_convolution.c > b/libavfilter/vf_convolution.c index 9a9c099e6d..98aa952258 100644 > --- a/libavfilter/vf_convolution.c > +++ b/libavfilter/vf_convolution.c > @@ -874,6 +874,9 @@ static int param_init(AVFilterContext *ctx) > if (s->depth > 8) > for (p = 0; p < s->nb_planes; p++) > s->filter[p] = filter16_sobel; > +#if CONFIG_CONVOLUTION_FILTER && ARCH_X86_64 > +ff_sobel_init_x86(s); > +#endif > } else if (!strcmp(ctx->filter->name, "kirsch")) { > if (s->depth > 8) > for (p = 0; p < s->nb_planes; p++) @@ -887,6 +890,11 @@ > static int param_init(AVFilterContext *ctx) > return 0; > } > > +int ff_filter_param_init(AVFilterContext *ctx) { > +return 
param_init(ctx); > +} > + > static int config_input(AVFilterLink *inlink) { > AVFilterContext *ctx = inlink->dst; diff --git > a/libavfilter/x86/vf_convolution.asm > b/libavfilter/x86/vf_convolution.asm > index 754d4d1064..59c807b218 100644 > --- a/libavfilter/x86/vf_convolution.asm > +++ b/libavfilter/x86/vf_convolution.asm > @@ -22,6 +22,10 @@ > > SECTION_RODATA > half: dd 0.5 > +data_p1: dd 1 > +data_n1: dd -1 > +data_p2: dd 2 > +data_n2: dd -2 > > SECTION .text > > @@ -154,3 +158,161 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, > bias, matrix, ptr, c0, c1, c2, c INIT_XMM sse4 > FILTER_3X3 > %endif > + > + > +%macro SOBEL_MUL_16 3 > +movd xmm2, [%2] > +VPBROADCASTD m2, xmm2 > +movdqu xmm3, [c%1q + xq] > +vpmovzxbd m3, xmm3 > +vpdpbusd m%3, m3, m2 > +%endmacro > + > +%macro SOBEL_ADD_16 2 > +movdqu xmm3, [c%1q + xq] > +vpmovzxbd m3, xmm3 > +vpaddd m%2, m3 > +%endmacro > + > + > +%macro SOBEL_MUL 2 > +movzx ptrd, byte [c%1q + xq] > +imul ptrd, [%2] > +add rd, ptrd > +%endmacro > + > +%macro SOBEL_ADD 1 > +movzx ptrd, byte [c%1q + xq] > +add rd, ptrd > +%endmacro > + > +; void filter_sobel_avx512(uint8_t *dst, int width, > +; float scale, float delta, const int *const matrix, > +; const uint8_t *c[], int peak, int radius, > +; int dstride, int stride) > +%macro FILTER_SOBEL 0 > +%if UNIX64 > +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, > +c3, c4, c5, c6, c7, c8, r, x %else cglobal filter_sobel, 4, 15, 7, > +dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, > +c8, r, x %endif %if WIN64 > +SWAP xmm0, xmm2 > +SWAP xmm1, xmm3 > +mov r2q, matrixmp > +mov r3q, ptrmp > +DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, > +c7, c8, r, x %endif > +movsxdifnidn widthq, widthd > +VBROADCASTSS m0, xmm0 > +VBROADCASTSS m1, xmm1 > +pxor m6, m6 > +mov c0q, [ptrq + 0*gprsize] > +mov c1q, [ptrq + 1*gprsize] > +mov c2q, [ptrq + 2*gprsize] > +mov c3q, [ptrq + 3*gprsize] > +mov c4q, [ptrq + 4*gprsize] > +mov c5q, [ptrq + 
5*gprsize] > +mov c6q, [ptrq + 6*gprsize] > +mov c7q, [pt
Re: [FFmpeg-devel] [PATCH] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI
bin.wang-at-intel@ffmpeg.org: > From: bwang30 > > This commit enabled assembly code with intel AVX512 VNNI and added unit test > for sobel filter > > sobel_c: 4537 > sobel_avx512icl 2470 > > Signed-off-by: bwang30 > --- > libavfilter/convolution.h | 2 + > libavfilter/vf_convolution.c | 8 ++ > libavfilter/x86/vf_convolution.asm| 162 ++ > libavfilter/x86/vf_convolution_init.c | 18 +++ > tests/checkasm/Makefile | 1 + > tests/checkasm/checkasm.c | 3 + > tests/checkasm/checkasm.h | 1 + > tests/checkasm/vf_convolution.c | 116 ++ > 8 files changed, 311 insertions(+) > create mode 100644 tests/checkasm/vf_convolution.c > > diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h > index 88aabe9a20..143b0fb2d9 100644 > --- a/libavfilter/convolution.h > +++ b/libavfilter/convolution.h > @@ -61,4 +61,6 @@ typedef struct ConvolutionContext { > } ConvolutionContext; > > void ff_convolution_init_x86(ConvolutionContext *s); > +void ff_sobel_init_x86(ConvolutionContext *s); > +int ff_filter_param_init(AVFilterContext *ctx); > #endif > diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c > index 9a9c099e6d..98aa952258 100644 > --- a/libavfilter/vf_convolution.c > +++ b/libavfilter/vf_convolution.c > @@ -874,6 +874,9 @@ static int param_init(AVFilterContext *ctx) > if (s->depth > 8) > for (p = 0; p < s->nb_planes; p++) > s->filter[p] = filter16_sobel; > +#if CONFIG_CONVOLUTION_FILTER && ARCH_X86_64 > +ff_sobel_init_x86(s); > +#endif > } else if (!strcmp(ctx->filter->name, "kirsch")) { > if (s->depth > 8) > for (p = 0; p < s->nb_planes; p++) > @@ -887,6 +890,11 @@ static int param_init(AVFilterContext *ctx) > return 0; > } > > +int ff_filter_param_init(AVFilterContext *ctx) > +{ > +return param_init(ctx); > +} > + > static int config_input(AVFilterLink *inlink) > { > AVFilterContext *ctx = inlink->dst; > diff --git a/libavfilter/x86/vf_convolution.asm > b/libavfilter/x86/vf_convolution.asm > index 754d4d1064..59c807b218 100644 > --- 
a/libavfilter/x86/vf_convolution.asm > +++ b/libavfilter/x86/vf_convolution.asm > @@ -22,6 +22,10 @@ > > SECTION_RODATA > half: dd 0.5 > +data_p1: dd 1 > +data_n1: dd -1 > +data_p2: dd 2 > +data_n2: dd -2 > > SECTION .text > > @@ -154,3 +158,161 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, > matrix, ptr, c0, c1, c2, c > INIT_XMM sse4 > FILTER_3X3 > %endif > + > + > +%macro SOBEL_MUL_16 3 > +movd xmm2, [%2] > +VPBROADCASTD m2, xmm2 > +movdqu xmm3, [c%1q + xq] > +vpmovzxbd m3, xmm3 > +vpdpbusd m%3, m3, m2 > +%endmacro > + > +%macro SOBEL_ADD_16 2 > +movdqu xmm3, [c%1q + xq] > +vpmovzxbd m3, xmm3 > +vpaddd m%2, m3 > +%endmacro > + > + > +%macro SOBEL_MUL 2 > +movzx ptrd, byte [c%1q + xq] > +imul ptrd, [%2] > +add rd, ptrd > +%endmacro > + > +%macro SOBEL_ADD 1 > +movzx ptrd, byte [c%1q + xq] > +add rd, ptrd > +%endmacro > + > +; void filter_sobel_avx512(uint8_t *dst, int width, > +; float scale, float delta, const int *const matrix, > +; const uint8_t *c[], int peak, int radius, > +; int dstride, int stride) > +%macro FILTER_SOBEL 0 > +%if UNIX64 > +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, > c5, c6, c7, c8, r, x > +%else > +cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, > c2, c3, c4, c5, c6, c7, c8, r, x > +%endif > +%if WIN64 > +SWAP xmm0, xmm2 > +SWAP xmm1, xmm3 > +mov r2q, matrixmp > +mov r3q, ptrmp > +DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, > r, x > +%endif > +movsxdifnidn widthq, widthd > +VBROADCASTSS m0, xmm0 > +VBROADCASTSS m1, xmm1 > +pxor m6, m6 > +mov c0q, [ptrq + 0*gprsize] > +mov c1q, [ptrq + 1*gprsize] > +mov c2q, [ptrq + 2*gprsize] > +mov c3q, [ptrq + 3*gprsize] > +mov c4q, [ptrq + 4*gprsize] > +mov c5q, [ptrq + 5*gprsize] > +mov c6q, [ptrq + 6*gprsize] > +mov c7q, [ptrq + 7*gprsize] > +mov c8q, [ptrq + 8*gprsize] > + > +xor xq, xq > +cmp widthq, mmsize/4 > +jl .loop2 > + > +mov rq, widthq > +and rq, mmsize/4-1 > +sub widthq, rq > + > +.loop1: 
> +pxor m4, m4 > +pxor m5, m5 > + > +;Gx > +SOBEL_MUL_16 0, data_n1, 4 > +SOBEL_MUL_16 1, data_n2, 4 > +SOBEL_MUL_16 2, data_n1, 4 > +SOBEL_ADD_16 6, 4 > +SOBEL_MUL_16 7, data_p2, 4 > +SOBEL_ADD_16 8, 4 > + > +cvtdq2ps m4, m4 > +mulps m4, m4 > + > +;Gy > +SOBEL_MUL_16 0, data_n1, 5 > +SOBEL_ADD_16 2, 5 > +SOBEL_MUL_16 3, data_n2, 5 > +SOBEL_MUL_16 5, data_p2, 5 > +SOBEL_MUL_16 6, data_n1, 5 > +SOBEL_ADD_16 8, 5 >
[FFmpeg-devel] [PATCH] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI
From: bwang30 This commit enabled assembly code with intel AVX512 VNNI and added unit test for sobel filter sobel_c: 4537 sobel_avx512icl 2470 Signed-off-by: bwang30 --- libavfilter/convolution.h | 2 + libavfilter/vf_convolution.c | 8 ++ libavfilter/x86/vf_convolution.asm| 162 ++ libavfilter/x86/vf_convolution_init.c | 18 +++ tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 + tests/checkasm/checkasm.h | 1 + tests/checkasm/vf_convolution.c | 116 ++ 8 files changed, 311 insertions(+) create mode 100644 tests/checkasm/vf_convolution.c diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h index 88aabe9a20..143b0fb2d9 100644 --- a/libavfilter/convolution.h +++ b/libavfilter/convolution.h @@ -61,4 +61,6 @@ typedef struct ConvolutionContext { } ConvolutionContext; void ff_convolution_init_x86(ConvolutionContext *s); +void ff_sobel_init_x86(ConvolutionContext *s); +int ff_filter_param_init(AVFilterContext *ctx); #endif diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c index 9a9c099e6d..98aa952258 100644 --- a/libavfilter/vf_convolution.c +++ b/libavfilter/vf_convolution.c @@ -874,6 +874,9 @@ static int param_init(AVFilterContext *ctx) if (s->depth > 8) for (p = 0; p < s->nb_planes; p++) s->filter[p] = filter16_sobel; +#if CONFIG_CONVOLUTION_FILTER && ARCH_X86_64 +ff_sobel_init_x86(s); +#endif } else if (!strcmp(ctx->filter->name, "kirsch")) { if (s->depth > 8) for (p = 0; p < s->nb_planes; p++) @@ -887,6 +890,11 @@ static int param_init(AVFilterContext *ctx) return 0; } +int ff_filter_param_init(AVFilterContext *ctx) +{ +return param_init(ctx); +} + static int config_input(AVFilterLink *inlink) { AVFilterContext *ctx = inlink->dst; diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm index 754d4d1064..59c807b218 100644 --- a/libavfilter/x86/vf_convolution.asm +++ b/libavfilter/x86/vf_convolution.asm @@ -22,6 +22,10 @@ SECTION_RODATA half: dd 0.5 +data_p1: dd 1 +data_n1: dd -1 +data_p2: dd 2 
+data_n2: dd -2 SECTION .text @@ -154,3 +158,161 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c INIT_XMM sse4 FILTER_3X3 %endif + + +%macro SOBEL_MUL_16 3 +movd xmm2, [%2] +VPBROADCASTD m2, xmm2 +movdqu xmm3, [c%1q + xq] +vpmovzxbd m3, xmm3 +vpdpbusd m%3, m3, m2 +%endmacro + +%macro SOBEL_ADD_16 2 +movdqu xmm3, [c%1q + xq] +vpmovzxbd m3, xmm3 +vpaddd m%2, m3 +%endmacro + + +%macro SOBEL_MUL 2 +movzx ptrd, byte [c%1q + xq] +imul ptrd, [%2] +add rd, ptrd +%endmacro + +%macro SOBEL_ADD 1 +movzx ptrd, byte [c%1q + xq] +add rd, ptrd +%endmacro + +; void filter_sobel_avx512(uint8_t *dst, int width, +; float scale, float delta, const int *const matrix, +; const uint8_t *c[], int peak, int radius, +; int dstride, int stride) +%macro FILTER_SOBEL 0 +%if UNIX64 +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x +%else +cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x +%endif +%if WIN64 +SWAP xmm0, xmm2 +SWAP xmm1, xmm3 +mov r2q, matrixmp +mov r3q, ptrmp +DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x +%endif +movsxdifnidn widthq, widthd +VBROADCASTSS m0, xmm0 +VBROADCASTSS m1, xmm1 +pxor m6, m6 +mov c0q, [ptrq + 0*gprsize] +mov c1q, [ptrq + 1*gprsize] +mov c2q, [ptrq + 2*gprsize] +mov c3q, [ptrq + 3*gprsize] +mov c4q, [ptrq + 4*gprsize] +mov c5q, [ptrq + 5*gprsize] +mov c6q, [ptrq + 6*gprsize] +mov c7q, [ptrq + 7*gprsize] +mov c8q, [ptrq + 8*gprsize] + +xor xq, xq +cmp widthq, mmsize/4 +jl .loop2 + +mov rq, widthq +and rq, mmsize/4-1 +sub widthq, rq + +.loop1: +pxor m4, m4 +pxor m5, m5 + +;Gx +SOBEL_MUL_16 0, data_n1, 4 +SOBEL_MUL_16 1, data_n2, 4 +SOBEL_MUL_16 2, data_n1, 4 +SOBEL_ADD_16 6, 4 +SOBEL_MUL_16 7, data_p2, 4 +SOBEL_ADD_16 8, 4 + +cvtdq2ps m4, m4 +mulps m4, m4 + +;Gy +SOBEL_MUL_16 0, data_n1, 5 +SOBEL_ADD_16 2, 5 +SOBEL_MUL_16 3, data_n2, 5 +SOBEL_MUL_16 5, data_p2, 5 +SOBEL_MUL_16 6, 
data_n1, 5 +SOBEL_ADD_16 8, 5 + +cvtdq2ps m5, m5 +VFMADD231PS m4, m5, m5 + +sqrtps m4, m4 +mulps m4, m0 ; sum *= scale +addps m4, m1 ; sum += delta +cvttps2dq m4, m4 +vpmovusdb xmm4, m4 +movdqu [dstq + xq], xmm4 + +add xq, mmsize/4 +cmp xq, widthq +jl .loop1 + +add widthq, rq +cmp xq, widthq +jge .end + +.loop2: +xor