Re: [FFmpeg-devel] [PATCH 3/3] avfilter: add avx2 filter_line function for bwdif

2023-03-13 Thread James Darnley

On 3/11/23 17:14, Thomas Mundt wrote:


+%if mmsize == 32
+vpbroadcastd m12, DWORD clip_maxm



I get a green pattern at bit depths > 8.
Looks good with:
vpbroadcastw m12, WORD clip_maxm

+%else

  movd    m12, DWORD clip_maxm
  SPLATW  m12, m12, 0
+%endif


Of course it should be a word broadcast!

But why doesn't my checkasm test catch it?


  bwdif->filter_line = ff_bwdif_filter_line_sse2;
  if (EXTERNAL_SSSE3(cpu_flags))
  bwdif->filter_line = ff_bwdif_filter_line_ssse3;
+if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
+bwdif->filter_line = ff_bwdif_filter_line_avx2;
  } else if (bit_depth <= 12) {
  if (EXTERNAL_SSE2(cpu_flags))
  bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
  if (EXTERNAL_SSSE3(cpu_flags))
  bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
+if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
+bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
  }
  }


I was intending to only modify/write the 8-bit function so this is a 
mistake.


Thanks.  I'll be back with a version 2.

[re-sending to list]
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 3/3] avfilter: add avx2 filter_line function for bwdif

2023-03-11 Thread Thomas Mundt
Hi James,

Am Mo., 20. Feb. 2023 um 20:59 Uhr schrieb James Darnley:

> 2.24x faster (1925±1.3 vs. 859±2.2 decicycles) compared with ssse3
> ---
>  libavfilter/x86/vf_bwdif.asm| 29 -
>  libavfilter/x86/vf_bwdif_init.c | 12 
>  2 files changed, 36 insertions(+), 5 deletions(-)
>
> diff --git a/libavfilter/x86/vf_bwdif.asm b/libavfilter/x86/vf_bwdif.asm
> index 0b453da53b..5cc61435fd 100644
> --- a/libavfilter/x86/vf_bwdif.asm
> +++ b/libavfilter/x86/vf_bwdif.asm
> @@ -26,18 +26,22 @@
>
>  %include "libavutil/x86/x86util.asm"
>
> -SECTION_RODATA
> +SECTION_RODATA 32
>
> -pw_coefhf:  times 4 dw  1016, 5570
> -pw_coefhf1: times 8 dw -3801
> -pw_coefsp:  times 4 dw  5077, -981
> -pw_splfdif: times 4 dw  -768,  768
> +pw_coefhf:  times 8 dw  1016, 5570
> +pw_coefhf1: times 16 dw -3801
> +pw_coefsp:  times 8 dw  5077, -981
> +pw_splfdif: times 8 dw  -768,  768
>
>  SECTION .text
>
>  %macro LOAD8 2
> +%if mmsize == 32
> +pmovzxbw %1, %2
> +%else
>  movh %1, %2
>  punpcklbw%1, m7
> +%endif
>  %endmacro
>
>  %macro LOAD12 2
> @@ -45,8 +49,14 @@ SECTION .text
>  %endmacro
>
>  %macro DISP8 0
> +%if mmsize == 32
> +vextracti128  xm1,m2, 1
> +packuswb  xm2,   xm1
> +movu [dstq], xm2
> +%else
>  packuswb m2, m2
>  movh [dstq], m2
> +%endif
>  %endmacro
>
>  %macro DISP12 0
> @@ -244,8 +254,12 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst,
> prev, cur, next, w, \
>prefs, mrefs, prefs2,
> mrefs2, \
>prefs3, mrefs3, prefs4, \
>mrefs4, parity, clip_max
> +%if mmsize == 32
> +vpbroadcastd m12, DWORD clip_maxm
>

I get a green pattern at bit depths > 8.
Looks good with:
vpbroadcastw m12, WORD clip_maxm

> +%else
>  movd    m12, DWORD clip_maxm
>  SPLATW  m12, m12, 0
> +%endif
>  %else
>  cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
>prefs, mrefs, prefs2,
> mrefs2, \
> @@ -264,3 +278,8 @@ INIT_XMM ssse3
>  BWDIF
>  INIT_XMM sse2
>  BWDIF
> +
> +%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
> +INIT_YMM avx2
> +BWDIF
> +%endif
> diff --git a/libavfilter/x86/vf_bwdif_init.c
> b/libavfilter/x86/vf_bwdif_init.c
> index ba7bc40c3d..f833318c10 100644
> --- a/libavfilter/x86/vf_bwdif_init.c
> +++ b/libavfilter/x86/vf_bwdif_init.c
> @@ -32,6 +32,10 @@ void ff_bwdif_filter_line_ssse3(void *dst, void *prev,
> void *cur, void *next,
>  int w, int prefs, int mrefs, int prefs2,
>  int mrefs2, int prefs3, int mrefs3, int
> prefs4,
>  int mrefs4, int parity, int clip_max);
> +void ff_bwdif_filter_line_avx2(void *dst, void *prev, void *cur, void
> *next,
> +   int w, int prefs, int mrefs, int prefs2,
> +   int mrefs2, int prefs3, int mrefs3, int
> prefs4,
> +   int mrefs4, int parity, int clip_max);
>
>  void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur,
> void *next,
>   int w, int prefs, int mrefs, int
> prefs2,
> @@ -41,6 +45,10 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void
> *prev, void *cur, void *ne
>int w, int prefs, int mrefs, int
> prefs2,
>int mrefs2, int prefs3, int mrefs3,
> int prefs4,
>int mrefs4, int parity, int
> clip_max);
> +void ff_bwdif_filter_line_12bit_avx2(void *dst, void *prev, void *cur,
> void *next,
> + int w, int prefs, int mrefs, int
> prefs2,
> + int mrefs2, int prefs3, int mrefs3,
> int prefs4,
> + int mrefs4, int parity, int
> clip_max);
>
>  av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
>  {
> @@ -51,10 +59,14 @@ av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif,
> int bit_depth)
>  bwdif->filter_line = ff_bwdif_filter_line_sse2;
>  if (EXTERNAL_SSSE3(cpu_flags))
>  bwdif->filter_line = ff_bwdif_filter_line_ssse3;
> +if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
> +bwdif->filter_line = ff_bwdif_filter_line_avx2;
>  } else if (bit_depth <= 12) {
>  if (EXTERNAL_SSE2(cpu_flags))
>  bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
>  if (EXTERNAL_SSSE3(cpu_flags))
>  bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
> +if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
> +bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
>  }
>  }
> --
> 2.39.1
___

[FFmpeg-devel] [PATCH 3/3] avfilter: add avx2 filter_line function for bwdif

2023-02-20 Thread James Darnley
2.24x faster (1925±1.3 vs. 859±2.2 decicycles) compared with ssse3
---
 libavfilter/x86/vf_bwdif.asm| 29 -
 libavfilter/x86/vf_bwdif_init.c | 12 
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/libavfilter/x86/vf_bwdif.asm b/libavfilter/x86/vf_bwdif.asm
index 0b453da53b..5cc61435fd 100644
--- a/libavfilter/x86/vf_bwdif.asm
+++ b/libavfilter/x86/vf_bwdif.asm
@@ -26,18 +26,22 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
-pw_coefhf:  times 4 dw  1016, 5570
-pw_coefhf1: times 8 dw -3801
-pw_coefsp:  times 4 dw  5077, -981
-pw_splfdif: times 4 dw  -768,  768
+pw_coefhf:  times 8 dw  1016, 5570
+pw_coefhf1: times 16 dw -3801
+pw_coefsp:  times 8 dw  5077, -981
+pw_splfdif: times 8 dw  -768,  768
 
 SECTION .text
 
 %macro LOAD8 2
+%if mmsize == 32
+pmovzxbw %1, %2
+%else
 movh %1, %2
 punpcklbw%1, m7
+%endif
 %endmacro
 
 %macro LOAD12 2
@@ -45,8 +49,14 @@ SECTION .text
 %endmacro
 
 %macro DISP8 0
+%if mmsize == 32
+vextracti128  xm1,m2, 1
+packuswb  xm2,   xm1
+movu [dstq], xm2
+%else
 packuswb m2, m2
 movh [dstq], m2
+%endif
 %endmacro
 
 %macro DISP12 0
@@ -244,8 +254,12 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, 
cur, next, w, \
   prefs, mrefs, prefs2, mrefs2, \
   prefs3, mrefs3, prefs4, \
   mrefs4, parity, clip_max
+%if mmsize == 32
+vpbroadcastd m12, DWORD clip_maxm
+%else
 movd    m12, DWORD clip_maxm
 SPLATW  m12, m12, 0
+%endif
 %else
 cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
   prefs, mrefs, prefs2, mrefs2, \
@@ -264,3 +278,8 @@ INIT_XMM ssse3
 BWDIF
 INIT_XMM sse2
 BWDIF
+
+%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
+INIT_YMM avx2
+BWDIF
+%endif
diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c
index ba7bc40c3d..f833318c10 100644
--- a/libavfilter/x86/vf_bwdif_init.c
+++ b/libavfilter/x86/vf_bwdif_init.c
@@ -32,6 +32,10 @@ void ff_bwdif_filter_line_ssse3(void *dst, void *prev, void 
*cur, void *next,
 int w, int prefs, int mrefs, int prefs2,
 int mrefs2, int prefs3, int mrefs3, int prefs4,
 int mrefs4, int parity, int clip_max);
+void ff_bwdif_filter_line_avx2(void *dst, void *prev, void *cur, void *next,
+   int w, int prefs, int mrefs, int prefs2,
+   int mrefs2, int prefs3, int mrefs3, int prefs4,
+   int mrefs4, int parity, int clip_max);
 
 void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void 
*next,
  int w, int prefs, int mrefs, int prefs2,
@@ -41,6 +45,10 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, 
void *cur, void *ne
   int w, int prefs, int mrefs, int prefs2,
   int mrefs2, int prefs3, int mrefs3, int 
prefs4,
   int mrefs4, int parity, int clip_max);
+void ff_bwdif_filter_line_12bit_avx2(void *dst, void *prev, void *cur, void 
*next,
+ int w, int prefs, int mrefs, int prefs2,
+ int mrefs2, int prefs3, int mrefs3, int 
prefs4,
+ int mrefs4, int parity, int clip_max);
 
 av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
 {
@@ -51,10 +59,14 @@ av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int 
bit_depth)
 bwdif->filter_line = ff_bwdif_filter_line_sse2;
 if (EXTERNAL_SSSE3(cpu_flags))
 bwdif->filter_line = ff_bwdif_filter_line_ssse3;
+if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
+bwdif->filter_line = ff_bwdif_filter_line_avx2;
 } else if (bit_depth <= 12) {
 if (EXTERNAL_SSE2(cpu_flags))
 bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
 if (EXTERNAL_SSSE3(cpu_flags))
 bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
+if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
+bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
 }
 }
-- 
2.39.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".