Re: [FFmpeg-devel] [PATCH 3/3] avfilter: add avx2 filter_line function for bwdif

2023-03-13 Thread James Darnley

On 3/11/23 17:14, Thomas Mundt wrote:


+%if mmsize == 32
+vpbroadcastd m12, DWORD clip_maxm



I get a green pattern at bit depths > 8.
Looks good with:
vpbroadcastw m12, WORD clip_maxm

+%else

  movd    m12, DWORD clip_maxm
  SPLATW  m12, m12, 0
+%endif


Of course it should be a word broadcast!

But why doesn't my checkasm test catch it?


  bwdif->filter_line = ff_bwdif_filter_line_sse2;
  if (EXTERNAL_SSSE3(cpu_flags))
  bwdif->filter_line = ff_bwdif_filter_line_ssse3;
+if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
+bwdif->filter_line = ff_bwdif_filter_line_avx2;
  } else if (bit_depth <= 12) {
  if (EXTERNAL_SSE2(cpu_flags))
  bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
  if (EXTERNAL_SSSE3(cpu_flags))
  bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
+if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
+bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
  }
  }


I was intending to only modify/write the 8-bit function so this is a 
mistake.


Thanks.  I'll be back with a version 2.

[re-sending to list]
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 2/3] checkasm: add test for bwdif

2023-03-13 Thread James Darnley

On 3/11/23 17:18, Thomas Mundt wrote:






I'm not familiar with checkasm tests, but isn't this one limited to a bit
depth of 8?


Yes, that was the idea because I was only intending to modify the 8-bit 
function, for now.  The function pointer is the same for all depths so 
you need to initialize it with a different depth.  Judging from your 
other email I might need to write them anyway.


[re-sending to list]
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] tests: actually test yadif's 10 and 16-bit functions

2023-03-05 Thread James Darnley

On 2/20/23 14:06, James Darnley wrote:

On 2/20/23 13:49, Nicolas George wrote:

James Darnley (12023-02-20):


snip


Moving scale before yadif is right, but format= is redundant with
-pix_fmt.

Regards,



So the patch should just be moving the scale filter first?  Sure.  Any 
other comments?  I will wait a short while, then make that change and push.


I forgot about this.  Now that the repo seems to be working again after 
the HW failure I will push on Monday.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] libavfilter/x86/vf_convolution.asm- fix missing decelerator for AVX512ICL sobel

2023-02-24 Thread James Darnley

On 2/24/23 04:00, Felix LeClair wrote:

Fixes:  Compilation of Sobel with AVX512ICL
Caused: Comment left without delimiter in AVX512ICL version of SOBEL

Testing:Confirmed working on AVX512 Alderlake (AKA SPR without AMX)



diff --git a/libavfilter/x86/vf_convolution.asm 
b/libavfilter/x86/vf_convolution.asm
index 9ac9ef5d73..8b85897819 100644
--- a/libavfilter/x86/vf_convolution.asm
+++ b/libavfilter/x86/vf_convolution.asm
@@ -232,8 +232,8 @@ cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, 
matrix, ptr, c0, c1, c2,
 psubd m4, m5
 vpermb    m3, m6, m3
 mova  m5, m4
-vpdpbusd  m4, m2, [sobel_mulA] {1to16}
-vpdpbusd  m5, m3, [sobel_mulB] {1to16}
+vpdpbusd  m4, m2, [sobel_mulA]; {1to16}
+vpdpbusd  m5, m3, [sobel_mulB]; {1to16}
 
 cvtdq2ps  m4, m4

 mulps m4, m4



Fix compilation with what?

I'm not familiar with the sobel algorithm/function so I can't say whether 
the code is correct.  However those constants are only dword sized and 
that is how you do a memory broadcast with avx512(icl).  Furthermore 
testing your change on an icl system results in a failure in checkasm.


So what program and what version fails to assemble that?

[re-sending to list]
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 3/3] avfilter: add avx2 filter_line function for bwdif

2023-02-20 Thread James Darnley
2.24x faster (1925±1.3 vs. 859±2.2 decicycles) compared with ssse3
---
 libavfilter/x86/vf_bwdif.asm| 29 -
 libavfilter/x86/vf_bwdif_init.c | 12 
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/libavfilter/x86/vf_bwdif.asm b/libavfilter/x86/vf_bwdif.asm
index 0b453da53b..5cc61435fd 100644
--- a/libavfilter/x86/vf_bwdif.asm
+++ b/libavfilter/x86/vf_bwdif.asm
@@ -26,18 +26,22 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
-pw_coefhf:  times 4 dw  1016, 5570
-pw_coefhf1: times 8 dw -3801
-pw_coefsp:  times 4 dw  5077, -981
-pw_splfdif: times 4 dw  -768,  768
+pw_coefhf:  times 8 dw  1016, 5570
+pw_coefhf1: times 16 dw -3801
+pw_coefsp:  times 8 dw  5077, -981
+pw_splfdif: times 8 dw  -768,  768
 
 SECTION .text
 
 %macro LOAD8 2
+%if mmsize == 32
+pmovzxbw %1, %2
+%else
 movh %1, %2
 punpcklbw%1, m7
+%endif
 %endmacro
 
 %macro LOAD12 2
@@ -45,8 +49,14 @@ SECTION .text
 %endmacro
 
 %macro DISP8 0
+%if mmsize == 32
+vextracti128  xm1,m2, 1
+packuswb  xm2,   xm1
+movu [dstq], xm2
+%else
 packuswb m2, m2
 movh [dstq], m2
+%endif
 %endmacro
 
 %macro DISP12 0
@@ -244,8 +254,12 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, 
cur, next, w, \
   prefs, mrefs, prefs2, mrefs2, \
   prefs3, mrefs3, prefs4, \
   mrefs4, parity, clip_max
+%if mmsize == 32
+vpbroadcastd m12, DWORD clip_maxm
+%else
 movdm12, DWORD clip_maxm
 SPLATW  m12, m12, 0
+%endif
 %else
 cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
   prefs, mrefs, prefs2, mrefs2, \
@@ -264,3 +278,8 @@ INIT_XMM ssse3
 BWDIF
 INIT_XMM sse2
 BWDIF
+
+%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
+INIT_YMM avx2
+BWDIF
+%endif
diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c
index ba7bc40c3d..f833318c10 100644
--- a/libavfilter/x86/vf_bwdif_init.c
+++ b/libavfilter/x86/vf_bwdif_init.c
@@ -32,6 +32,10 @@ void ff_bwdif_filter_line_ssse3(void *dst, void *prev, void 
*cur, void *next,
 int w, int prefs, int mrefs, int prefs2,
 int mrefs2, int prefs3, int mrefs3, int prefs4,
 int mrefs4, int parity, int clip_max);
+void ff_bwdif_filter_line_avx2(void *dst, void *prev, void *cur, void *next,
+   int w, int prefs, int mrefs, int prefs2,
+   int mrefs2, int prefs3, int mrefs3, int prefs4,
+   int mrefs4, int parity, int clip_max);
 
 void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void 
*next,
  int w, int prefs, int mrefs, int prefs2,
@@ -41,6 +45,10 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, 
void *cur, void *ne
   int w, int prefs, int mrefs, int prefs2,
   int mrefs2, int prefs3, int mrefs3, int 
prefs4,
   int mrefs4, int parity, int clip_max);
+void ff_bwdif_filter_line_12bit_avx2(void *dst, void *prev, void *cur, void 
*next,
+ int w, int prefs, int mrefs, int prefs2,
+ int mrefs2, int prefs3, int mrefs3, int 
prefs4,
+ int mrefs4, int parity, int clip_max);
 
 av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
 {
@@ -51,10 +59,14 @@ av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int 
bit_depth)
 bwdif->filter_line = ff_bwdif_filter_line_sse2;
 if (EXTERNAL_SSSE3(cpu_flags))
 bwdif->filter_line = ff_bwdif_filter_line_ssse3;
+if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
+bwdif->filter_line = ff_bwdif_filter_line_avx2;
 } else if (bit_depth <= 12) {
 if (EXTERNAL_SSE2(cpu_flags))
 bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
 if (EXTERNAL_SSSE3(cpu_flags))
 bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
+if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
+bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
 }
 }
-- 
2.39.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 2/3] checkasm: add test for bwdif

2023-02-20 Thread James Darnley
---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/vf_bwdif.c | 70 +++
 tests/fate/checkasm.mak   |  1 +
 5 files changed, 76 insertions(+)
 create mode 100644 tests/checkasm/vf_bwdif.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index a6f06c7007..b6a43f181f 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -40,6 +40,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC)  += $(AVCODECOBJS-yes)
 # libavfilter tests
 AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o
 AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o
+AVFILTEROBJS-$(CONFIG_BWDIF_FILTER)  += vf_bwdif.o
 AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
 AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o
 AVFILTEROBJS-$(CONFIG_GBLUR_FILTER)  += vf_gblur.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index e96d84a7da..5e729cf0e0 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -179,6 +179,9 @@ static const struct {
 #if CONFIG_BLEND_FILTER
 { "vf_blend", checkasm_check_blend },
 #endif
+#if CONFIG_BWDIF_FILTER
+{ "vf_bwdif", checkasm_check_vf_bwdif },
+#endif
 #if CONFIG_COLORSPACE_FILTER
 { "vf_colorspace", checkasm_check_colorspace },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 8744a81218..e9e73c6fa0 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -82,6 +82,7 @@ void checkasm_check_utvideodsp(void);
 void checkasm_check_v210dec(void);
 void checkasm_check_v210enc(void);
 void checkasm_check_vc1dsp(void);
+void checkasm_check_vf_bwdif(void);
 void checkasm_check_vf_eq(void);
 void checkasm_check_vf_gblur(void);
 void checkasm_check_vf_hflip(void);
diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c
new file mode 100644
index 00..e27f9b7494
--- /dev/null
+++ b/tests/checkasm/vf_bwdif.c
@@ -0,0 +1,70 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/internal.h"
+#include "libavfilter/bwdif.h"
+
+#define WIDTH 256
+
+#define randomize_buffers(buf0, buf1, mask, count) \
+for (size_t i; i < count; i++) \
+buf0[i] = buf1[i] = rnd() & mask
+
+void checkasm_check_vf_bwdif(void)
+{
+BWDIFContext ctx_8, ctx_10, ctx_16;
+
+ff_bwdif_init_filter_line(&ctx_8, 8);
+ff_bwdif_init_filter_line(&ctx_10, 10);
+ff_bwdif_init_filter_line(&ctx_16, 16);
+
+if (check_func(ctx_8.filter_line, "bwdif8")) {
+uint8_t prev0[9*WIDTH], prev1[9*WIDTH];
+uint8_t next0[9*WIDTH], next1[9*WIDTH];
+uint8_t cur0[9*WIDTH], cur1[9*WIDTH];
+uint8_t dst0[WIDTH], dst1[WIDTH];
+
+declare_func(void, void *dst, void *prev, void *cur, void *next,
+int w, int prefs, int mrefs, int prefs2, int mrefs2,
+int prefs3, int mrefs3, int prefs4, int mrefs4,
+int parity, int clip_max);
+
+randomize_buffers(prev0, prev1, 0xff, 9*WIDTH);
+randomize_buffers(next0, next1, 0xff, 9*WIDTH);
+randomize_buffers(cur0, cur1, 0xff, 9*WIDTH);
+
+call_ref(dst0, prev0 + 4*WIDTH, cur0 + 4*WIDTH, next0 + 4*WIDTH, WIDTH,
+WIDTH, -WIDTH, 2*WIDTH, -2*WIDTH, 3*WIDTH, -3*WIDTH, 4*WIDTH, 
-4*WIDTH,
+0, 0xff);
+call_new(dst1, prev1 + 4*WIDTH, cur1 + 4*WIDTH, next1 + 4*WIDTH, WIDTH,
+WIDTH, -WIDTH, 2*WIDTH, -2*WIDTH, 3*WIDTH, -3*WIDTH, 4*WIDTH, 
-4*WIDTH,
+0, 0xff);
+
+if (memcmp(dst0, dst1, WIDTH)
+|| memcmp(prev0, prev1, sizeof prev0)
+|| memcmp(next0, next1, sizeof next0)
+|| memcmp(cur0, cur1, sizeof cur0))
+fail();
+bench_new(dst1, prev1 + 4*WIDTH, cur1 + 4*WIDTH, next1 + 4*WIDTH, 
WIDTH,
+WIDTH, -WIDTH, 2*WIDTH, -2*WIDTH, 3*WIDTH, -3*WIDTH, 4*WIDTH, 
-4*WIDTH,
+0, 0xff);
+}
+report("bwdif8");
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index a4e95541f5..6a7d4a1226 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -37,6 +37,7 @@ FATE_CHECKASM

[FFmpeg-devel] [PATCH 1/3] avfilter: move bwdif's filter_line init into a dedicated function

2023-02-20 Thread James Darnley
---
 libavfilter/bwdif.h |  3 ++-
 libavfilter/vf_bwdif.c  | 13 +
 libavfilter/x86/vf_bwdif_init.c |  4 +---
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
index 889ff772ed..5749345f78 100644
--- a/libavfilter/bwdif.h
+++ b/libavfilter/bwdif.h
@@ -37,6 +37,7 @@ typedef struct BWDIFContext {
 int parity, int clip_max, int spat);
 } BWDIFContext;
 
-void ff_bwdif_init_x86(BWDIFContext *bwdif);
+void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
+void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth);
 
 #endif /* AVFILTER_BWDIF_H */
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index 65c617ebb3..34e8c5e234 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -340,7 +340,14 @@ static int config_props(AVFilterLink *link)
 
 yadif->csp = av_pix_fmt_desc_get(link->format);
 yadif->filter = filter;
-if (yadif->csp->comp[0].depth > 8) {
+ff_bwdif_init_filter_line(s, yadif->csp->comp[0].depth);
+
+return 0;
+}
+
+av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
+{
+if (bit_depth > 8) {
 s->filter_intra = filter_intra_16bit;
 s->filter_line  = filter_line_c_16bit;
 s->filter_edge  = filter_edge_16bit;
@@ -351,10 +358,8 @@ static int config_props(AVFilterLink *link)
 }
 
 #if ARCH_X86
-ff_bwdif_init_x86(s);
+ff_bwdif_init_x86(s, bit_depth);
 #endif
-
-return 0;
 }
 
 
diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c
index e24e5cd9b1..ba7bc40c3d 100644
--- a/libavfilter/x86/vf_bwdif_init.c
+++ b/libavfilter/x86/vf_bwdif_init.c
@@ -42,11 +42,9 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, 
void *cur, void *ne
   int mrefs2, int prefs3, int mrefs3, int 
prefs4,
   int mrefs4, int parity, int clip_max);
 
-av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif)
+av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
 {
-YADIFContext *yadif = &bwdif->yadif;
 int cpu_flags = av_get_cpu_flags();
-int bit_depth = (!yadif->csp) ? 8 : yadif->csp->comp[0].depth;
 
 if (bit_depth <= 8) {
 if (EXTERNAL_SSE2(cpu_flags))
-- 
2.39.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] tests: actually test yadif's 10 and 16-bit functions

2023-02-20 Thread James Darnley

On 2/20/23 13:49, Nicolas George wrote:

James Darnley (12023-02-20):


-fate-filter-yadif10: CMD = framecrc -flags bitexact -idct simple -i 
$(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt 
yuv420p10le -frames:v 30 -vf yadif=0,scale
-fate-filter-yadif16: CMD = framecrc -flags bitexact -idct simple -i 
$(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt 
yuv420p16le -frames:v 30 -vf yadif=0,scale
+fate-filter-yadif10: CMD = framecrc -flags bitexact -idct simple -i 
$(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt 
yuv420p10le -frames:v 30 -vf scale,format=yuv420p10le,yadif=0
+fate-filter-yadif16: CMD = framecrc -flags bitexact -idct simple -i 
$(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt 
yuv420p16le -frames:v 30 -vf scale,format=yuv420p16le,yadif=0


Moving scale before yadif is right, but format= is redundant with
-pix_fmt.

Regards,



So the patch should just be moving the scale filter first?  Sure.  Any 
other comments?  I will wait a short while, then make that change and push.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function

2023-02-20 Thread James Darnley

On 2/10/23 14:06, James Darnley wrote:

snip
This patch set is broken.  The checkasm test is incomplete.  This avx2 
function has some bug that only manifests when the strides (prefs and mrefs) 
have opposite signs (one positive and one negative).  That situation is 
what happens with real usage.  I fixed my checkasm test which also shows it.


Consider this patch set retracted until I can fix it.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] tests: actually test yadif's 10 and 16-bit functions

2023-02-20 Thread James Darnley
---
 tests/fate/filter-video.mak   |  4 +--
 tests/ref/fate/filter-yadif10 | 60 +--
 tests/ref/fate/filter-yadif16 | 60 +--
 3 files changed, 62 insertions(+), 62 deletions(-)

diff --git a/tests/fate/filter-video.mak b/tests/fate/filter-video.mak
index 63873a7a07..65965d8518 100644
--- a/tests/fate/filter-video.mak
+++ b/tests/fate/filter-video.mak
@@ -16,8 +16,8 @@ fate-filter-yadif-mode0: CMD = framecrc -flags bitexact -idct 
simple -i $(TARGET
 fate-filter-yadif-mode1: CMD = framecrc -flags bitexact -idct simple -i 
$(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -frames:v 59 -vf yadif=1
 
 FATE_YADIF-$(call FILTERDEMDEC, YADIF SCALE, MPEGTS, MPEG2VIDEO) += 
fate-filter-yadif10 fate-filter-yadif16
-fate-filter-yadif10: CMD = framecrc -flags bitexact -idct simple -i 
$(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt 
yuv420p10le -frames:v 30 -vf yadif=0,scale
-fate-filter-yadif16: CMD = framecrc -flags bitexact -idct simple -i 
$(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt 
yuv420p16le -frames:v 30 -vf yadif=0,scale
+fate-filter-yadif10: CMD = framecrc -flags bitexact -idct simple -i 
$(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt 
yuv420p10le -frames:v 30 -vf scale,format=yuv420p10le,yadif=0
+fate-filter-yadif16: CMD = framecrc -flags bitexact -idct simple -i 
$(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt 
yuv420p16le -frames:v 30 -vf scale,format=yuv420p16le,yadif=0
 
 FATE_FILTER_SAMPLES-yes += $(FATE_YADIF-yes)
 
diff --git a/tests/ref/fate/filter-yadif10 b/tests/ref/fate/filter-yadif10
index 28e799fc1f..1a8063fee9 100644
--- a/tests/ref/fate/filter-yadif10
+++ b/tests/ref/fate/filter-yadif10
@@ -3,33 +3,33 @@
 #codec_id 0: rawvideo
 #dimensions 0: 720x576
 #sar 0: 16/15
-0,  9,  9,1,  1244160, 0xe0c2231b
-0, 10, 10,1,  1244160, 0xdc7caa43
-0, 11, 11,1,  1244160, 0x52c4dfbf
-0, 12, 12,1,  1244160, 0x7c577f07
-0, 13, 13,1,  1244160, 0x5b6ad7ce
-0, 14, 14,1,  1244160, 0x6f15ce76
-0, 15, 15,1,  1244160, 0xf120034a
-0, 16, 16,1,  1244160, 0x9c65ba64
-0, 17, 17,1,  1244160, 0x883b237e
-0, 18, 18,1,  1244160, 0xb8292e0d
-0, 19, 19,1,  1244160, 0xbc392721
-0, 20, 20,1,  1244160, 0x7cd82ec9
-0, 21, 21,1,  1244160, 0x167325eb
-0, 22, 22,1,  1244160, 0x49bafa73
-0, 23, 23,1,  1244160, 0xe1ff6dbf
-0, 24, 24,1,  1244160, 0x85f710b6
-0, 25, 25,1,  1244160, 0xd1fd4cdb
-0, 26, 26,1,  1244160, 0xafee03c5
-0, 27, 27,1,  1244160, 0x566be070
-0, 28, 28,1,  1244160, 0xb6abbd01
-0, 29, 29,1,  1244160, 0xa98f38fd
-0, 30, 30,1,  1244160, 0x00f4736b
-0, 31, 31,1,  1244160, 0x6b0f9dd2
-0, 32, 32,1,  1244160, 0x15810b92
-0, 33, 33,1,  1244160, 0x0b516465
-0, 34, 34,1,  1244160, 0x927d15e6
-0, 35, 35,1,  1244160, 0xd102f2bf
-0, 36, 36,1,  1244160, 0xdd8b3b20
-0, 37, 37,1,  1244160, 0x229ac529
-0, 38, 38,1,  1244160, 0xf844e0a2
+0,  9,  9,1,  1244160, 0x67910b3d
+0, 10, 10,1,  1244160, 0xdbb80927
+0, 11, 11,1,  1244160, 0xd5d4f27a
+0, 12, 12,1,  1244160, 0xde270630
+0, 13, 13,1,  1244160, 0xe57833cc
+0, 14, 14,1,  1244160, 0xc806eabd
+0, 15, 15,1,  1244160, 0xe041958a
+0, 16, 16,1,  1244160, 0x0007fdc7
+0, 17, 17,1,  1244160, 0xed25afda
+0, 18, 18,1,  1244160, 0x43f8e068
+0, 19, 19,1,  1244160, 0xd95b763a
+0, 20, 20,1,  1244160, 0xf99cacdb
+0, 21, 21,1,  1244160, 0x3c33ec50
+0, 22, 22,1,  1244160, 0xf5260151
+0, 23, 23,1,  1244160, 0x88e9f2e9
+0, 24, 24,1,  1244160, 0x104cfe20
+0, 25, 25,1,  1244160, 0x804d6a33
+0, 26, 26,1,  1244160, 0x8c668008
+0, 27, 27,1,  1244160, 0x63cf270a
+0, 28, 28,1,  1244160, 0xc526e89a
+0, 29, 29,1,  1244160, 0xe318e4d4
+0, 30, 30,1,  1244160, 0x7c6b63a3
+0, 31, 31,1,  1244160, 0x40deffd

[FFmpeg-devel] [PATCH 1/3] avfilter: move yadif's filter_line init into a dedicated function

2023-02-10 Thread James Darnley
---
 libavfilter/vf_yadif.c  | 13 +
 libavfilter/x86/vf_yadif_init.c |  4 +---
 libavfilter/yadif.h |  3 ++-
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
index afa4d1d53d..1f9434f961 100644
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@@ -303,7 +303,14 @@ static int config_output(AVFilterLink *outlink)
 
 s->csp = av_pix_fmt_desc_get(outlink->format);
 s->filter = filter;
-if (s->csp->comp[0].depth > 8) {
+ff_yadif_init_filter_line(s, s->csp->comp[0].depth);
+
+return 0;
+}
+
+av_cold void ff_yadif_init_filter_line(YADIFContext *s, int bit_depth)
+{
+if (bit_depth > 8) {
 s->filter_line  = filter_line_c_16bit;
 s->filter_edges = filter_edges_16bit;
 } else {
@@ -312,10 +319,8 @@ static int config_output(AVFilterLink *outlink)
 }
 
 #if ARCH_X86
-ff_yadif_init_x86(s);
+ff_yadif_init_x86(s, bit_depth);
 #endif
-
-return 0;
 }
 
 
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index 257c3f9199..d648f0f835 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -47,11 +47,9 @@ void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, 
void *cur,
   void *next, int w, int prefs,
   int mrefs, int parity, int mode);
 
-av_cold void ff_yadif_init_x86(YADIFContext *yadif)
+av_cold void ff_yadif_init_x86(YADIFContext *yadif, int bit_depth)
 {
 int cpu_flags = av_get_cpu_flags();
-int bit_depth = (!yadif->csp) ? 8
-  : yadif->csp->comp[0].depth;
 
 if (bit_depth >= 15) {
 if (EXTERNAL_SSE2(cpu_flags))
diff --git a/libavfilter/yadif.h b/libavfilter/yadif.h
index c928911b35..5d8309b403 100644
--- a/libavfilter/yadif.h
+++ b/libavfilter/yadif.h
@@ -86,7 +86,8 @@ typedef struct YADIFContext {
 int current_field;  ///< YADIFCurrentField
 } YADIFContext;
 
-void ff_yadif_init_x86(YADIFContext *yadif);
+void ff_yadif_init_filter_line(YADIFContext *s, int bit_depth);
+void ff_yadif_init_x86(YADIFContext *yadif, int bit_depth);
 
 int ff_yadif_filter_frame(AVFilterLink *link, AVFrame *frame);
 
-- 
2.39.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function

2023-02-10 Thread James Darnley
Zen 2 (Ryzen 7 3700X):
1.73x faster (3603±586.3 vs. 2082±317.1 decicycles) compared with ssse3

Using an SD y4m file speed increases from ~ 3600 fps to ~4700.
---
 libavfilter/x86/vf_yadif.asm| 83 +++--
 libavfilter/x86/vf_yadif_init.c |  4 ++
 2 files changed, 62 insertions(+), 25 deletions(-)

diff --git a/libavfilter/x86/vf_yadif.asm b/libavfilter/x86/vf_yadif.asm
index 809cebdd3f..571febfca3 100644
--- a/libavfilter/x86/vf_yadif.asm
+++ b/libavfilter/x86/vf_yadif.asm
@@ -25,11 +25,30 @@
 
 SECTION_RODATA
 
-pb_1: times 16 db 1
-pw_1: times  8 dw 1
+pb_1: times 32 db 1
+pw_1: times 16 dw 1
 
 SECTION .text
 
+%unmacro RSHIFT 2
+
+%macro RSHIFT 2
+%if mmsize == 32
+vextracti128 xm7, %1, 1
+palignr xmm %+ %1, xm7, xmm %+ %1, 2
+%else
+psrldq %1, %2
+%endif
+%endmacro
+
+%macro UNPACK 1
+%if mmsize == 32
+pmovzxbw %1, xmm %+ %1
+%else
+punpcklbw %1, m7
+%endif
+%endmacro
+
 %macro CHECK 2
 movu  m2, [curq+t1+%1]
 movu  m3, [curq+t0+%2]
@@ -40,7 +59,7 @@ SECTION .text
 pand  m4, [pb_1]
 psubusb   m5, m4
 RSHIFTm5, 1
-punpcklbw m5, m7
+UNPACKm5
 mova  m4, m2
 psubusb   m2, m3
 psubusb   m3, m4
@@ -49,9 +68,9 @@ SECTION .text
 mova  m4, m2
 RSHIFTm3, 1
 RSHIFTm4, 2
-punpcklbw m2, m7
-punpcklbw m3, m7
-punpcklbw m4, m7
+UNPACKm2
+UNPACKm3
+UNPACKm4
 paddw m2, m3
 paddw m2, m4
 %endmacro
@@ -81,13 +100,19 @@ SECTION .text
 %endmacro
 
 %macro LOAD 2
-movh  %1, %2
-punpcklbw %1, m7
+%if mmsize == 32
+pmovzxbw %1, %2
+%else
+movh  %1, %2
+punpcklbw %1, m7
+%endif
 %endmacro
 
 %macro FILTER 3
 .loop%1:
-pxor m7, m7
+%if mmsize != 32
+pxor m7, m7
+%endif
 LOAD m0, [curq+t1]
 LOAD m1, [curq+t0]
 LOAD m2, [%2]
@@ -95,9 +120,9 @@ SECTION .text
 mova m4, m3
 paddwm3, m2
 psrawm3, 1
-mova   [rsp+ 0], m0
-mova   [rsp+16], m3
-mova   [rsp+32], m1
+mova   [rsp+0*mmsize], m0
+mova   [rsp+1*mmsize], m3
+mova   [rsp+2*mmsize], m1
 psubwm2, m4
 ABS1 m2, m4
 LOAD m3, [prevq+t1]
@@ -119,7 +144,7 @@ SECTION .text
 paddwm3, m4
 psrlwm3, 1
 pmaxsw   m2, m3
-mova   [rsp+48], m2
+mova   [rsp+3*mmsize], m2
 
 paddwm1, m0
 paddwm0, m0
@@ -134,9 +159,9 @@ SECTION .text
 psubusb  m3, m4
 pmaxub   m2, m3
 mova m3, m2
-psrldq   m3, 2
-punpcklbwm2, m7
-punpcklbwm3, m7
+RSHIFT   m3, 2
+UNPACK   m2
+UNPACK   m3
 paddwm0, m2
 paddwm0, m3
 psubwm0, [pw_1]
@@ -150,7 +175,7 @@ SECTION .text
 CHECK 1, -3
 CHECK2
 
-mova m6, [rsp+48]
+mova m6, [rsp+3*mmsize]
 cmp   DWORD r8m, 2
 jge .end%1
 LOAD m2, [%2+t1*2]
@@ -161,9 +186,9 @@ SECTION .text
 paddwm3, m5
 psrlwm2, 1
 psrlwm3, 1
-mova m4, [rsp+ 0]
-mova m5, [rsp+16]
-mova m7, [rsp+32]
+mova m4, [rsp+0*mmsize]
+mova m5, [rsp+1*mmsize]
+mova m7, [rsp+2*mmsize]
 psubwm2, m4
 psubwm3, m7
 mova m0, m5
@@ -182,15 +207,21 @@ SECTION .text
 pmaxsw   m6, m4
 
 .end%1:
-mova m2, [rsp+16]
+mova m2, [rsp+1*mmsize]
 mova m3, m2
 psubwm2, m6
 paddwm3, m6
 pmaxsw   m1, m2
 pminsw   m1, m3
-packuswb m1, m1
 
-movh [dstq], m1
+%if mmsize == 32
+vextracti128 xm4, ym1, 1
+packuswb xm1, xm4
+movu [dstq], xm1
+%else
+packuswb m1, m1
+movh [dstq], m1
+%endif
 adddstq, mmsize/2
 add   prevq, mmsize/2
 addcurq, mmsize/2
@@ -201,10 +232,10 @@ SECTION .text
 
 %macro YADIF 0
 %if ARCH_X86_32
-cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \
+cglobal yadif_filter_line, 4, 6, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \
 mrefs, parity, mode
 %else
-cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
+cglobal yadif_filter_line, 4, 7, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \
 mrefs, parity, mode
 %endif
 %if ARCH_X86_32
@@ -233,3 +264,5 @@ INIT_XMM ssse3
 YADIF
 INIT_XMM sse2
 YADIF
+INIT_YMM avx2
+YADIF
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index d648f0f835..48858dc295 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -29,6 +29,8 @@ void ff_yadif_filter_line_sse2(void *dst, void *prev, void 
*cur,
 void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cu

[FFmpeg-devel] [PATCH 2/3] checkasm: add test for yadif

2023-02-10 Thread James Darnley
---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/vf_yadif.c | 62 +++
 4 files changed, 67 insertions(+)
 create mode 100644 tests/checkasm/vf_yadif.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index a6f06c7007..fc65bdc77d 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -47,6 +47,7 @@ AVFILTEROBJS-$(CONFIG_HFLIP_FILTER)  += vf_hflip.o
 AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER)  += vf_threshold.o
 AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER)+= vf_nlmeans.o
 AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)  += vf_convolution.o
+AVFILTEROBJS-$(CONFIG_YADIF_FILTER)  += vf_yadif.o
 
 CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index e96d84a7da..2bb72cf839 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -200,6 +200,9 @@ static const struct {
 #if CONFIG_SOBEL_FILTER
 { "vf_sobel", checkasm_check_vf_sobel },
 #endif
+#if CONFIG_YADIF_FILTER
+{ "vf_yadif", checkasm_check_vf_yadif },
+#endif
 #endif
 #if CONFIG_SWSCALE
 { "sw_gbrp", checkasm_check_sw_gbrp },
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 8744a81218..0b9a83b5b5 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -87,6 +87,7 @@ void checkasm_check_vf_gblur(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
 void checkasm_check_vf_sobel(void);
+void checkasm_check_vf_yadif(void);
 void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
 void checkasm_check_videodsp(void);
diff --git a/tests/checkasm/vf_yadif.c b/tests/checkasm/vf_yadif.c
new file mode 100644
index 00..cb58519c23
--- /dev/null
+++ b/tests/checkasm/vf_yadif.c
@@ -0,0 +1,62 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/internal.h"
+#include "libavfilter/yadif.h"
+
+#define WIDTH 256
+
+#define randomize_buffers(buf0, buf1, mask, count) \
+for (size_t i; i < count; i++) \
+buf0[i] = buf1[i] = rnd() & mask
+
+void checkasm_check_vf_yadif(void)
+{
+YADIFContext ctx_8, ctx_10, ctx_16;
+
+ff_yadif_init_filter_line(&ctx_8, 8);
+ff_yadif_init_filter_line(&ctx_10, 10);
+ff_yadif_init_filter_line(&ctx_16, 16);
+
+if (check_func(ctx_8.filter_line, "yadif8")) {
+uint8_t prev0[5*WIDTH + STRIDE_ALIGN], prev1[5*WIDTH + STRIDE_ALIGN];
+uint8_t next0[5*WIDTH + STRIDE_ALIGN], next1[5*WIDTH + STRIDE_ALIGN];
+uint8_t cur0[5*WIDTH + STRIDE_ALIGN], cur1[5*WIDTH + STRIDE_ALIGN];
+uint8_t dst0[WIDTH + STRIDE_ALIGN], dst1[WIDTH + STRIDE_ALIGN];
+
+declare_func(void, void *dst, void *prev, void *cur, void *next,
+int w, int prefs, int mrefs, int parity, int mode);
+
+randomize_buffers(prev0, prev1, 0xff, 5*WIDTH + STRIDE_ALIGN);
+randomize_buffers(next0, next1, 0xff, 5*WIDTH + STRIDE_ALIGN);
+randomize_buffers(cur0, cur1, 0xff, 5*WIDTH + STRIDE_ALIGN);
+
+call_ref(dst0, prev0, cur0, next0, WIDTH, WIDTH, WIDTH, 0, 1);
+call_new(dst1, prev1, cur1, next1, WIDTH, WIDTH, WIDTH, 0, 1);
+
+if (memcmp(dst0, dst1, WIDTH)
+|| memcmp(prev0, prev1, sizeof prev0)
+|| memcmp(next0, next1, sizeof next0)
+|| memcmp(cur0, cur1, sizeof cur0))
+fail();
+bench_new(dst1, prev1, cur1, next1, WIDTH, WIDTH, WIDTH, 0, 1);
+}
+report("yadif8");
+}
-- 
2.39.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [RFC PATCH 2/2] avcodec/x86: add avx512icl function for v210dec

2022-12-15 Thread James Darnley
Ice Lake (Xeon Silver 4316): 2.01x faster (1147±36.8 vs. 571±38.2 decicycles) 
compared with avx2
---

I think I can merge this with the existing macro without it being too ugly.
That might allow a plain avx512 version too but I can't say if that would be any
faster.

 libavcodec/x86/v210-init.c | 10 ++-
 libavcodec/x86/v210.asm| 60 +-
 tests/checkasm/v210dec.c   | 12 
 3 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
index 5db1fef98c..8b3677b8aa 100644
--- a/libavcodec/x86/v210-init.c
+++ b/libavcodec/x86/v210-init.c
@@ -17,7 +17,7 @@
  */
 
 #include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
 #include "libavcodec/v210dec.h"
 
 extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, 
uint16_t *y, uint16_t *u, uint16_t *v, int width);
@@ -28,6 +28,8 @@ extern void ff_v210_planar_unpack_aligned_ssse3(const 
uint32_t *src, uint16_t *y
 extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t 
*y, uint16_t *u, uint16_t *v, int width);
 extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t 
*y, uint16_t *u, uint16_t *v, int width);
 
+extern void ff_v210_planar_unpack_avx512icl(const uint32_t *src, uint16_t *y, 
uint16_t *u, uint16_t *v, int width);
+
 av_cold void ff_v210_x86_init(V210DecContext *s)
 {
 #if HAVE_X86ASM
@@ -42,6 +44,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
 
 if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
 s->unpack_frame = ff_v210_planar_unpack_aligned_avx2;
+
+if (EXTERNAL_AVX512ICL(cpu_flags))
+s->unpack_frame = ff_v210_planar_unpack_avx512icl;
 }
 else {
 if (cpu_flags & AV_CPU_FLAG_SSSE3)
@@ -52,6 +57,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
 
 if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
 s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2;
+
+if (EXTERNAL_AVX512ICL(cpu_flags))
+s->unpack_frame = ff_v210_planar_unpack_avx512icl;
 }
 #endif
 }
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
index 600a4ddc5f..f247737ed0 100644
--- a/libavcodec/x86/v210.asm
+++ b/libavcodec/x86/v210.asm
@@ -22,7 +22,21 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA 32
+SECTION_RODATA 64
+
+perm_y:
+db  0,1,   4,5,   6,7,   8,9,  12,13, 14,15, 16,17, 20,21
+db 22,23, 24,25, 28,29, 30,31, 32,33, 36,37, 38,39, 40,41
+db 44,45, 46,47, 48,49, 52,53, 54,55, 56,57, 60,61, 62,63
+times 16 db 0xff ; align to 64
+
+perm_uv:
+db  0,1,   4,5,  10,11, 16,17, 20,21, 26,27, 32,33, 36,37
+db 42,43, 48,49, 52,53, 58,59
+times 8 db 0xff ; align to 32
+db  2,3,   8,9,  12,13, 18,19, 24,25, 28,29, 34,35, 40,41
+db 44,45, 50,51, 56,57, 60,61
+times 8 db 0xff ; align to 32
 
 ; for AVX2 version only
 v210_luma_permute: dd 0,1,2,4,5,6,7,7  ; 32-byte alignment required
@@ -34,6 +48,9 @@ v210_mult: dw 64,4,64,4,64,4,64,4
 v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
 v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
 
+shift: times 4 dw 6, 2
+kmask: dw 0x5555, 0xaaaa
+
 SECTION .text
 
 %macro v210_planar_unpack 1
@@ -127,3 +144,44 @@ v210_planar_unpack aligned
 INIT_YMM avx2
 v210_planar_unpack aligned
 %endif
+
+%if HAVE_AVX512ICL_EXTERNAL
+
+INIT_ZMM avx512icl
+
+cglobal v210_planar_unpack, 5, 5, 6, src, y, u, v, w
+movsxdifnidn wq, wd
+lea    yq, [yq+2*wq]
+add    uq, wq
+add    vq, wq
+neg    wq
+
+kmovw k1, [kmask]   ; odd dword mask
+kmovw k2, [kmask+2] ; even dword mask
+
+VBROADCASTI128 m0, [shift]
+mova   m1, [perm_y]
+mova   m2, [perm_uv]
+
+.loop:
+movu    m3, [srcq]
+vpsllvw m4, m3, m0
+pslld   m5, m3, 12
+psrlw   m4, 6
+psrld   m5, 22
+
+vpblendmd m3{k1}, m4, m5
+vpermb    m3, m1, m3 ; could use vpcompressw
+movu  [yq+2*wq], m3
+
+vpblendmd m5{k2}, m4, m5
+vpermb    m5, m2, m5
+movu  [uq+wq], ym5
+vextracti32x8 [vq+wq], zm5, 1
+
+add srcq, mmsize
+add wq, (mmsize*3)/8
+jl  .loop
+RET
+
+%endif
diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c
index 6aef519cc5..93993bae71 100644
--- a/tests/checkasm/v210dec.c
+++ b/tests/checkasm/v210dec.c
@@ -54,12 +54,12 @@ void checkasm_check_v210dec(void)
 if (check_func(h.unpack_frame, "v210_unpack")) {
 uint32_t src0[NUM_SAMPLES/3];
 uint32_t src1[NUM_SAMPLES/3];
-uint16_t y0[NUM_SAMPLES/2];
-uint16_t y1[NUM_SAMPLES/2];
-uint16_t u0[NUM_SAMPLES/4];
-uint16_t u1[NUM_SAMPLES/4];
-uint16_t v0[NUM_SAMPLES/4];
-uint16_t v1[NUM_SAMPLES/4];
+uint16_t y0[NUM_SAMPLES/2 + 15];
+uint16_t y1[NUM_SAMPLES/2 + 15];
+uint16_t u0[NUM_SAMPLES/4 + 7];
+   

[FFmpeg-devel] [PATCH 1/2] avcodec/x86/v210: add some comments to the improved avx2 function

2022-12-15 Thread James Darnley
---
 libavcodec/x86/v210.asm | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
index 3b9e0761df..600a4ddc5f 100644
--- a/libavcodec/x86/v210.asm
+++ b/libavcodec/x86/v210.asm
@@ -65,18 +65,18 @@ cglobal v210_planar_unpack_%1, 5, 5, 6 + 2 * cpuflag(avx2), 
src, y, u, v, w
 mova   m0, [srcq]
 %endif
 
-pmullw m1, m0, m3
-pslld  m0, 12
-psrlw  m1, 6   ; yB yA u5 v4 y8 y7 v3 u3 y5 y4 u2 v1 
y2 y1 v0 u0
-psrld  m0, 22  ; 00 v5 00 y9 00 u4 00 y6 00 v2 00 y3 
00 u1 00 y0
+pmullw m1, m0, m3 ; shifts the 1st and 3rd sample of each dword into the 
high 10 bits of each word
+pslld  m0, 12 ; shifts the 2nd sample of each dword into the high 10 
bits of each dword
+psrlw  m1, 6  ; shifts the 1st and 3rd samples back into the low 10 
bits
+psrld  m0, 22 ; shifts the 2nd sample back into the low 10 bits of 
each dword
 
 %if cpuflag(avx2)
-vpblendd m2, m1, m0, 0x55  ; yB yA 00 y9 y8 y7 00 y6 y5 y4 00 y3 
y2 y1 00 y0
+vpblendd m2, m1, m0, 0x55 ; merge the odd dwords from m0 and even from m1 
; yB yA 00 y9 y8 y7 00 y6 y5 y4 00 y3 y2 y1 00 y0
 pshufb m2, m4  ; 00 00 yB yA y9 y8 y7 y6 00 00 y5 y4 
y3 y2 y1 y0
 vpermd m2, m6, m2  ; 00 00 00 00 yB yA y9 y8 y7 y6 y5 y4 
y3 y2 y1 y0
 movu   [yq+2*wq], m2
 
-vpblendd m1, m1, m0, 0xaa  ; 00 v5 u5 v4 00 u4 v3 u3 00 v2 u2 v1 
00 u1 v0 u0
+vpblendd m1, m1, m0, 0xaa ; merge the even dwords from m0 and odd from m1 
; 00 v5 u5 v4 00 u4 v3 u3 00 v2 u2 v1 00 u1 v0 u0
 pshufb m1, m5  ; 00 v5 v4 v3 00 u5 u4 u3 00 v2 v1 v0 
00 u2 u1 u0
 vpermq m1, m1, 0xd8; 00 v5 v4 v3 00 v2 v1 v0 00 u5 u4 u3 
00 u2 u1 u0
 pshufb m1, m7  ; 00 00 v5 v4 v3 v2 v1 v0 00 00 u5 u4 
u3 u2 u1 u0
-- 
2.38.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] configure: support lsan as toolchain

2022-12-15 Thread James Darnley

On 12/7/22 17:08, James Darnley wrote:

---
  configure | 5 +
  1 file changed, 5 insertions(+)

diff --git a/configure b/configure
index f4eedfc207..eaa5ef6b20 100755
--- a/configure
+++ b/configure
@@ -4315,6 +4315,11 @@ case "$toolchain" in
  add_cflags  -fsanitize=address
  add_ldflags -fsanitize=address
  ;;
+*-lsan)
+cc_default="${toolchain%-lsan}"
+add_cflags  -fsanitize=leak
+add_ldflags -fsanitize=leak
+;;
  *-msan)
  cc_default="${toolchain%-msan}"
  add_cflags  -fsanitize=memory -fsanitize-memory-track-origins


ping

Any objections to this?
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] configure: support lsan as toolchain

2022-12-07 Thread James Darnley
---
 configure | 5 +
 1 file changed, 5 insertions(+)

diff --git a/configure b/configure
index f4eedfc207..eaa5ef6b20 100755
--- a/configure
+++ b/configure
@@ -4315,6 +4315,11 @@ case "$toolchain" in
 add_cflags  -fsanitize=address
 add_ldflags -fsanitize=address
 ;;
+*-lsan)
+cc_default="${toolchain%-lsan}"
+add_cflags  -fsanitize=leak
+add_ldflags -fsanitize=leak
+;;
 *-msan)
 cc_default="${toolchain%-msan}"
 add_cflags  -fsanitize=memory -fsanitize-memory-track-origins
-- 
2.38.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2 5/5] avcodec/x86/v210enc: remove unneeded instruction

2022-11-25 Thread James Darnley
---
 libavcodec/x86/v210enc.asm | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index d3639cd440..daf5f2ab81 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -331,7 +331,6 @@ cglobal v210_planar_pack_8, 5, 5, 7+notcpuflag(avx512icl), 
y, u, v, dst, width
 vpternlogd m0, m1, m6, 0xd8 ; C?B:A ; merge and mask out bad bits 
from B
 %else
 pand   m1, m6, m1
-pandn  m0, m6, m0
 porm0, m0, m1
 %endif
 
-- 
2.38.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2 4/5] avcodec/x86/v210enc: expand and correct comments

2022-11-25 Thread James Darnley
---
 libavcodec/x86/v210enc.asm | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index 552164a8be..d3639cd440 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -314,7 +314,7 @@ cglobal v210_planar_pack_8, 5, 5, 7+notcpuflag(avx512icl), 
y, u, v, dst, width
 movu ym1, [yq + 2*widthq]
 vinserti32x4  m1, [uq + 1*widthq], 2
 vinserti32x4  m1, [vq + 1*widthq], 3
-vpermb    m1, m2, m1 ; uyv0 yuy0 vyu0 yvy0
+vpermb    m1, m2, m1 ; uyvx yuyx vyux yvyx
 %else
 movq xm0, [uq + 1*widthq] ; uuuu uuxx
 movq xm1, [vq + 1*widthq] ; vvvv vvxx
@@ -325,10 +325,10 @@ cglobal v210_planar_pack_8, 5, 5, 
7+notcpuflag(avx512icl), y, u, v, dst, width
 %endif
 CLIPUB   m1, m4, m5
 
-pmaddubsw  m0, m1, m3
-pslld  m1,  4
+pmaddubsw  m0, m1, m3 ; shift high and low samples of each dword and 
mask out other bits
+pslld  m1,  4 ; shift center sample of each dword
 %if cpuflag(avx512)
-vpternlogd m0, m1, m6, 0xd8 ; C?B:A
+vpternlogd m0, m1, m6, 0xd8 ; C?B:A ; merge and mask out bad bits 
from B
 %else
 pand   m1, m6, m1
 pandn  m0, m6, m0
-- 
2.38.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2 3/5] avcodec/v210enc: add new 10-bit function for avx512 avx512icl

2022-11-25 Thread James Darnley
avx512 on Skylake-X (Xeon D-2123IT):
1.19x faster (970±91.2 vs. 817±104.4 decicycles) compared with avx2

avx512icl on Ice Lake (Xeon Silver 4316):
2.52x faster (1350±5.3 vs. 535±9.5 decicycles) compared with avx2
---
 libavcodec/x86/v210enc.asm| 99 +++
 libavcodec/x86/v210enc_init.c | 12 +
 2 files changed, 111 insertions(+)

diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index c2ad3d72c0..552164a8be 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -56,6 +56,36 @@ v210enc_8_permd: dd 0,1,4,5, 1,2,5,6
 v210enc_8_mult: db 4, 0, 64, 0
 v210enc_8_mask: dd 255<<12
 
+icl_perm_y: ; vpermb does not set bytes to zero when the high bit is set 
unlike pshufb
+%assign i 0
+%rep 8
+db -1,i+0,i+1,-1 , i+2,i+3,i+4,i+5
+%assign i i+6
+%endrep
+
+icl_perm_uv: ; vpermb does not set bytes to zero when the high bit is set 
unlike pshufb
+%assign i 0
+%rep 4
+db i+0,i+1,i+32,i+33 , -1,i+2,i+3,-1 , i+34,i+35,i+4,i+5 , -1,i+36,i+37,-1
+%assign i i+6
+%endrep
+
+icl_perm_y_kmask:  times 8 db 0b1111_0110
+icl_perm_uv_kmask: times 8 db 0b0110_1111
+
+icl_shift_y:  times 10 dw 2,0,4
+  times 4 db 0 ; padding to 64 bytes
+icl_shift_uv: times 5 dw 0,2,4
+  times 2 db 0 ; padding to 32 bytes
+  times 5 dw 4,0,2
+  times 2 db 0 ; padding to 32 bytes
+
+v210enc_10_permd_y:  dd 0,1,2,-1 , 3,4,5,-1
+v210enc_10_shufb_y:  db -1,0,1,-1 , 2,3,4,5 , -1,6,7,-1 , 8,9,10,11
+v210enc_10_permd_uv: dd 0,1,4,5 , 1,2,5,6
+v210enc_10_shufb_uv: db 0,1, 8, 9 , -1,2,3,-1 , 10,11,4,5 , -1,12,13,-1
+ db 2,3,10,11 , -1,4,5,-1 , 12,13,6,7 , -1,14,15,-1
+
 SECTION .text
 
 %macro v210_planar_pack_10 0
@@ -113,6 +143,75 @@ INIT_YMM avx2
 v210_planar_pack_10
 %endif
 
+%macro v210_planar_pack_10_new 0
+
+cglobal v210_planar_pack_10, 5, 5, 8+2*notcpuflag(avx512icl), y, u, v, dst, 
width
+lea yq, [yq+2*widthq]
+add uq, widthq
+add vq, widthq
+neg widthq
+
+%if cpuflag(avx512icl)
+movu  m6, [icl_perm_y]
+movu  m7, [icl_perm_uv]
+kmovq k1, [icl_perm_y_kmask]
+kmovq k2, [icl_perm_uv_kmask]
+%else
+movu   m6, [v210enc_10_permd_y]
+VBROADCASTI128 m7, [v210enc_10_shufb_y]
+movu   m8, [v210enc_10_permd_uv]
+movu   m9, [v210enc_10_shufb_uv]
+%endif
+movu  m2, [icl_shift_y]
+movu  m3, [icl_shift_uv]
+VBROADCASTI128 m4, [v210_enc_min_10] ; only ymm sized
+VBROADCASTI128 m5, [v210_enc_max_10] ; only ymm sized
+
+.loop:
+movu m0, [yq + widthq*2]
+%if cpuflag(avx512icl)
+movu ym1, [uq + widthq*1]
+vinserti32x8 zm1, [vq + widthq*1], 1
+%else
+movu xm1, [uq + widthq*1]
+vinserti128  ym1, [vq + widthq*1], 1
+%endif
+CLIPW m0, m4, m5
+CLIPW m1, m4, m5
+
+vpsllvw m0, m2
+vpsllvw m1, m3
+%if cpuflag(avx512icl)
+vpermb  m0{k1}{z}, m6, m0 ; make space for uv where the k-mask 
sets to zero
+vpermb  m1{k2}{z}, m7, m1 ; interleave uv and make space for y 
where the k-mask sets to zero
+%else
+vpermd m0, m6, m0
+pshufb m0, m7
+vpermd m1, m8, m1
+pshufb m1, m9
+%endif
+por m0, m1
+
+movu  [dstq], m0
+add dstq, mmsize
+add   widthq, (mmsize*3)/8
+jl .loop
+RET
+
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX512_EXTERNAL
+INIT_YMM avx512
+v210_planar_pack_10_new
+%endif
+%endif
+
+%if HAVE_AVX512ICL_EXTERNAL
+INIT_ZMM avx512icl
+v210_planar_pack_10_new
+%endif
+
 %macro v210_planar_pack_8 0
 
 ; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, 
uint8_t *dst, ptrdiff_t width)
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
index 6e9f8c6e61..44f22ca7fe 100644
--- a/libavcodec/x86/v210enc_init.c
+++ b/libavcodec/x86/v210enc_init.c
@@ -37,6 +37,12 @@ void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const 
uint16_t *u,
 void ff_v210_planar_pack_10_avx2(const uint16_t *y, const uint16_t *u,
  const uint16_t *v, uint8_t *dst,
  ptrdiff_t width);
+void ff_v210_planar_pack_10_avx512(const uint16_t *y, const uint16_t *u,
+   const uint16_t *v, uint8_t *dst,
+   ptrdiff_t width);
+void ff_v210_planar_pack_10_avx512icl(const uint16_t *y, const uint16_t *u,
+  const uint16_t *v, uint8_t *dst,
+  ptrdiff_t width);
 
 av_cold void ff_v210enc_init_x86(V210EncContext *s)
 {
@@ -60,10 +66,16 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s)
 if (EXTERNAL_AVX512(cpu_flags)) {
 s->sample_factor_8  = 2;
 s->pack_line_8  = ff_v210_planar_pack_8_avx512;
+#

[FFmpeg-devel] [PATCH v2 2/5] avcodec/x86/v210enc: replace register use with named register

2022-11-25 Thread James Darnley
---
 libavcodec/x86/v210enc.asm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index afac238ede..c2ad3d72c0 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -62,7 +62,7 @@ SECTION .text
 
 ; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, 
uint8_t *dst, ptrdiff_t width)
 cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
-lea r0, [yq+2*widthq]
+lea yq, [yq+2*widthq]
 add uq, widthq
 add vq, widthq
 neg widthq
-- 
2.38.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2 1/5] checkasm/v210enc: test the entire width of 10-bit planar input arrays

2022-11-25 Thread James Darnley
---
 tests/checkasm/v210enc.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/checkasm/v210enc.c b/tests/checkasm/v210enc.c
index 9942e08137..9fb8321c25 100644
--- a/tests/checkasm/v210enc.c
+++ b/tests/checkasm/v210enc.c
@@ -72,8 +72,10 @@
 randomize_buffers(mask);   
\
 call_ref(y0 + y_offset, u0 + uv_offset, v0 + uv_offset, dst0, 
width);  \
 call_new(y1 + y_offset, u1 + uv_offset, v1 + uv_offset, dst1, 
width);  \
-if (memcmp(y0, y1, BUF_SIZE) || memcmp(u0, u1, BUF_SIZE / 2) ||
\
-memcmp(v0, v1, BUF_SIZE / 2) || memcmp(dst0, dst1, width * 8 / 
3)) \
+if (memcmp(y0, y1, BUF_SIZE * sizeof(type))
\
+|| memcmp(u0, u1, BUF_SIZE * sizeof(type) / 2) 
\
+|| memcmp(v0, v1, BUF_SIZE * sizeof(type) / 2) 
\
+|| memcmp(dst0, dst1, width * 8 / 3))  
\
 fail();
\
 bench_new(y1 + y_offset, u1 + uv_offset, v1 + uv_offset, dst1, 
width); \
 }  
\
-- 
2.38.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl

2022-11-21 Thread James Darnley

ARCH_X86_64 is always defined. So checks of this type need to check with #if.


Thanks.  I forgot the ffmpeg convention there.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl

2022-11-21 Thread James Darnley
avx512 on Skylake-X (Xeon D-2123IT):
1.19x faster (970±91.2 vs. 817±104.4 decicycles) compared with avx2

avx512icl on Ice Lake (Xeon Silver 4316):
2.52x faster (1350±5.3 vs. 535±9.5 decicycles) compared with avx2
---
 libavcodec/x86/v210enc.asm| 99 +++
 libavcodec/x86/v210enc_init.c | 12 +
 2 files changed, 111 insertions(+)

diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index c2ad3d72c0..9cee954619 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -56,6 +56,36 @@ v210enc_8_permd: dd 0,1,4,5, 1,2,5,6
 v210enc_8_mult: db 4, 0, 64, 0
 v210enc_8_mask: dd 255<<12
 
+icl_perm_y: ; vpermb does not set bytes to zero when the high bit is set 
unlike pshufb
+%assign i 0
+%rep 8
+db -1,i+0,i+1,-1 , i+2,i+3,i+4,i+5
+%assign i i+6
+%endrep
+
+icl_perm_uv: ; vpermb does not set bytes to zero when the high bit is set 
unlike pshufb
+%assign i 0
+%rep 4
+db i+0,i+1,i+32,i+33 , -1,i+2,i+3,-1 , i+34,i+35,i+4,i+5 , -1,i+36,i+37,-1
+%assign i i+6
+%endrep
+
+icl_perm_y_kmask:  times 8 db 0b1111_0110
+icl_perm_uv_kmask: times 8 db 0b0110_1111
+
+icl_shift_y:  times 10 dw 2,0,4
+  times 4 db 0 ; padding to 64 bytes
+icl_shift_uv: times 5 dw 0,2,4
+  times 2 db 0 ; padding to 32 bytes
+  times 5 dw 4,0,2
+  times 2 db 0 ; padding to 32 bytes
+
+v210enc_10_permd_y:  dd 0,1,2,-1 , 3,4,5,-1
+v210enc_10_shufb_y:  db -1,0,1,-1 , 2,3,4,5 , -1,6,7,-1 , 8,9,10,11
+v210enc_10_permd_uv: dd 0,1,4,5 , 1,2,5,6
+v210enc_10_shufb_uv: db 0,1, 8, 9 , -1,2,3,-1 , 10,11,4,5 , -1,12,13,-1
+ db 2,3,10,11 , -1,4,5,-1 , 12,13,6,7 , -1,14,15,-1
+
 SECTION .text
 
 %macro v210_planar_pack_10 0
@@ -113,6 +143,75 @@ INIT_YMM avx2
 v210_planar_pack_10
 %endif
 
+%macro v210_planar_pack_10_new 0
+
+cglobal v210_planar_pack_10, 5, 5, 8+2*notcpuflag(avx512icl), y, u, v, dst, 
width
+lea yq, [yq+2*widthq]
+add uq, widthq
+add vq, widthq
+neg widthq
+
+%if cpuflag(avx512icl)
+movu  m6, [icl_perm_y]
+movu  m7, [icl_perm_uv]
+kmovq k1, [icl_perm_y_kmask]
+kmovq k2, [icl_perm_uv_kmask]
+%else
+movu   m6, [v210enc_10_permd_y]
+VBROADCASTI128 m7, [v210enc_10_shufb_y]
+movu   m8, [v210enc_10_permd_uv]
+movu   m9, [v210enc_10_shufb_uv]
+%endif
+movu  m2, [icl_shift_y]
+movu  m3, [icl_shift_uv]
+VBROADCASTI128 m4, [v210_enc_min_10] ; only ymm sized
+VBROADCASTI128 m5, [v210_enc_max_10] ; only ymm sized
+
+.loop:
+movu m0, [yq + widthq*2]
+%if cpuflag(avx512icl)
+movu ym1, [uq + widthq*1]
+vinserti32x8 zm1, [vq + widthq*1], 1
+%else
+movu xm1, [uq + widthq*1]
+vinserti128  ym1, [vq + widthq*1], 1
+%endif
+CLIPW m0, m4, m5
+CLIPW m1, m4, m5
+
+vpsllvw m0, m2
+vpsllvw m1, m3
+%if cpuflag(avx512icl)
+vpermb  m0{k1}{z}, m6, m0
+vpermb  m1{k2}{z}, m7, m1
+%else
+vpermd m0, m6, m0
+pshufb m0, m7
+vpermd m1, m8, m1
+pshufb m1, m9
+%endif
+por m0, m1
+
+movu  [dstq], m0
+add dstq, mmsize
+add   widthq, (mmsize*3)/8
+jl .loop
+RET
+
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX512_EXTERNAL
+INIT_YMM avx512
+v210_planar_pack_10_new
+%endif
+%endif
+
+%if HAVE_AVX512ICL_EXTERNAL
+INIT_ZMM avx512icl
+v210_planar_pack_10_new
+%endif
+
 %macro v210_planar_pack_8 0
 
 ; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, 
uint8_t *dst, ptrdiff_t width)
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
index 6e9f8c6e61..5d1ebcb893 100644
--- a/libavcodec/x86/v210enc_init.c
+++ b/libavcodec/x86/v210enc_init.c
@@ -37,6 +37,12 @@ void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const 
uint16_t *u,
 void ff_v210_planar_pack_10_avx2(const uint16_t *y, const uint16_t *u,
  const uint16_t *v, uint8_t *dst,
  ptrdiff_t width);
+void ff_v210_planar_pack_10_avx512(const uint16_t *y, const uint16_t *u,
+   const uint16_t *v, uint8_t *dst,
+   ptrdiff_t width);
+void ff_v210_planar_pack_10_avx512icl(const uint16_t *y, const uint16_t *u,
+  const uint16_t *v, uint8_t *dst,
+  ptrdiff_t width);
 
 av_cold void ff_v210enc_init_x86(V210EncContext *s)
 {
@@ -60,10 +66,16 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s)
 if (EXTERNAL_AVX512(cpu_flags)) {
 s->sample_factor_8  = 2;
 s->pack_line_8  = ff_v210_planar_pack_8_avx512;
+#ifdef ARCH_X86_64
+s->sample_factor_10  = 2;
+s->pack_line_10  = ff_v210_planar_pack_10_avx512;
+#e

[FFmpeg-devel] [PATCH 2/3] avcodec/x86/v210: replace register use with named register

2022-11-21 Thread James Darnley
---
 libavcodec/x86/v210enc.asm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index afac238ede..c2ad3d72c0 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -62,7 +62,7 @@ SECTION .text
 
 ; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, 
uint8_t *dst, ptrdiff_t width)
 cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
-lea r0, [yq+2*widthq]
+lea yq, [yq+2*widthq]
 add uq, widthq
 add vq, widthq
 neg widthq
-- 
2.38.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 1/3] checkasm/v210enc: test the entire width of 10-bit planar input arrays

2022-11-21 Thread James Darnley
---
 tests/checkasm/v210enc.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/checkasm/v210enc.c b/tests/checkasm/v210enc.c
index 9942e08137..9fb8321c25 100644
--- a/tests/checkasm/v210enc.c
+++ b/tests/checkasm/v210enc.c
@@ -72,8 +72,10 @@
 randomize_buffers(mask);   
\
 call_ref(y0 + y_offset, u0 + uv_offset, v0 + uv_offset, dst0, 
width);  \
 call_new(y1 + y_offset, u1 + uv_offset, v1 + uv_offset, dst1, 
width);  \
-if (memcmp(y0, y1, BUF_SIZE) || memcmp(u0, u1, BUF_SIZE / 2) ||
\
-memcmp(v0, v1, BUF_SIZE / 2) || memcmp(dst0, dst1, width * 8 / 
3)) \
+if (memcmp(y0, y1, BUF_SIZE * sizeof(type))
\
+|| memcmp(u0, u1, BUF_SIZE * sizeof(type) / 2) 
\
+|| memcmp(v0, v1, BUF_SIZE * sizeof(type) / 2) 
\
+|| memcmp(dst0, dst1, width * 8 / 3))  
\
 fail();
\
 bench_new(y1 + y_offset, u1 + uv_offset, v1 + uv_offset, dst1, 
width); \
 }  
\
-- 
2.38.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] avcodec/v210enc: add new function for avx2 avx512 avx512icl

2022-10-31 Thread James Darnley

+%else
+pand   m1, m6, m1
+pandn  m0, m6, m0
+por    m0, m0, m1
+%endif


Isn't that pattern a vpblendb or some such ?


I think Kieran already responded to this on IRC but I will too. 
Unfortunately not.  This blend is at the bit level.  This is v210 so the 
packing has the middle sample overlapping with the bottom sample in the 
second byte.


I also want to amend my performance numbers on Broadwell.  I can confirm 
Kieran's disagreement and can reproduce the 10% speed up on it:

1676±14.6 vs 1426±20.9

I will re-check Zen and amend the commit message as necessary.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] avcodec/v210enc: add new function for avx2 avx512 avx512icl

2022-10-28 Thread James Darnley
Negligible speed difference for avx2 on Zen 2 (Ryzen 5700X) and
Broadwell (Xeon E5-2620 v4):
1690±4.3 decicycles vs. 1693±78.4
1439±31.1 decicycles vs 1429±16.7

Moderate speedup with avx512 on Skylake-X (Xeon D-2123IT):
1.22x faster (793±0.8 vs. 649±5.5 decicycles) compared with avx2

Better speedup with avx512icl on Ice Lake (Xeon Silver 4316):
1.77x faster (784±1.8 vs. 442±11.6 decicycles) compared with avx2

Co-authors:
Henrik Gramner 
Kieran Kunhya 
---
 libavcodec/x86/v210enc.asm| 80 ++-
 libavcodec/x86/v210enc_init.c | 14 ++
 2 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index 965f2bea3c..afac238ede 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -21,7 +21,7 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA 32
+SECTION_RODATA 64
 
 cextern pw_4
 %define v210_enc_min_10 pw_4
@@ -46,6 +46,16 @@ v210_enc_chroma_shuf2_8: times 2 db 
3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1
 
 v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
 
+v210enc_8_permb: db 32, 0,48,-1 ,  1,33, 2,-1 , 49, 3,34,-1 ,  4,50, 5,-1
+ db 35, 6,51,-1 ,  7,36, 8,-1 , 52, 9,37,-1 , 10,53,11,-1
+ db 38,12,54,-1 , 13,39,14,-1 , 55,15,40,-1 , 16,56,17,-1
+ db 41,18,57,-1 , 19,42,20,-1 , 58,21,43,-1 , 22,59,23,-1
+v210enc_8_shufb: db  0, 8, 1,-1 ,  9, 2,10,-1 ,  3,11, 4,-1 , 12, 5,13,-1
+ db  2,10, 3,-1 , 11, 4,12,-1 ,  5,13, 6,-1 , 14, 7,15,-1
+v210enc_8_permd: dd 0,1,4,5, 1,2,5,6
+v210enc_8_mult: db 4, 0, 64, 0
+v210enc_8_mask: dd 255<<12
+
 SECTION .text
 
 %macro v210_planar_pack_10 0
@@ -178,7 +188,73 @@ INIT_XMM avx
 v210_planar_pack_8
 %endif
 
+%macro v210_planar_pack_8_new 0
+
+cglobal v210_planar_pack_8, 5, 5, 7+notcpuflag(avx512icl), y, u, v, dst, width
+add yq, widthq
+shr widthq, 1
+add uq, widthq
+add vq, widthq
+neg widthq
+
+%if cpuflag(avx512icl)
+mova m2, [v210enc_8_permb]
+%else
+mova m2, [v210enc_8_permd]
+%endif
+vpbroadcastd   m3, [v210enc_8_mult]
+VBROADCASTI128 m4, [v210_enc_min_8] ; only ymm sized
+VBROADCASTI128 m5, [v210_enc_max_8] ; only ymm sized
+vpbroadcastd   m6, [v210enc_8_mask]
+%if notcpuflag(avx512icl)
+movu m7, [v210enc_8_shufb]
+%endif
+
+.loop:
+%if cpuflag(avx512icl)
+movu ym1, [yq + 2*widthq]
+vinserti32x4  m1, [uq + 1*widthq], 2
+vinserti32x4  m1, [vq + 1*widthq], 3
+vpermb    m1, m2, m1 ; uyv0 yuy0 vyu0 yvy0
+%else
+movq xm0, [uq + 1*widthq] ; uuuu uuxx
+movq xm1, [vq + 1*widthq] ; vvvv vvxx
+punpcklbw xm1, xm0, xm1   ; uvuv uvuv uvuv xxxx
+vinserti128   m1, m1, [yq + 2*widthq], 1 ; uvuv uvuv uvuv xxxx 
yyyy yyyy yyyy xxxx
+vpermd    m1, m2, m1 ; uvuv uvxx yyyy yyxx 
xxuv uvuv xxyy yyyy
+pshufb    m1, m7 ; uyv0 yuy0 vyu0 yvy0
+%endif
+CLIPUB   m1, m4, m5
+
+pmaddubsw  m0, m1, m3
+pslld  m1,  4
+%if cpuflag(avx512)
+vpternlogd m0, m1, m6, 0xd8 ; C?B:A
+%else
+pand   m1, m6, m1
+pandn  m0, m6, m0
+por    m0, m0, m1
+%endif
+
+movu  [dstq], m0
+add dstq, mmsize
+add   widthq, (mmsize*3)/16
+jl .loop
+RET
+
+%endmacro
+
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-v210_planar_pack_8
+v210_planar_pack_8_new
+%endif
+
+%if HAVE_AVX512_EXTERNAL
+INIT_YMM avx512
+v210_planar_pack_8_new
+%endif
+
+%if HAVE_AVX512ICL_EXTERNAL
+INIT_ZMM avx512icl
+v210_planar_pack_8_new
 %endif
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
index 13a351dd1d..6e9f8c6e61 100644
--- a/libavcodec/x86/v210enc_init.c
+++ b/libavcodec/x86/v210enc_init.c
@@ -27,6 +27,10 @@ void ff_v210_planar_pack_8_avx(const uint8_t *y, const 
uint8_t *u,
const uint8_t *v, uint8_t *dst, ptrdiff_t 
width);
 void ff_v210_planar_pack_8_avx2(const uint8_t *y, const uint8_t *u,
 const uint8_t *v, uint8_t *dst, ptrdiff_t 
width);
+void ff_v210_planar_pack_8_avx512(const uint8_t *y, const uint8_t *u,
+const uint8_t *v, uint8_t *dst, ptrdiff_t 
width);
+void ff_v210_planar_pack_8_avx512icl(const uint8_t *y, const uint8_t *u,
+const uint8_t *v, uint8_t *dst, ptrdiff_t 
width);
 void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u,
   const uint16_t *v, uint8_t *dst,
   ptrdiff_t width);
@@ -52,4 +56,14 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s)
 s->sample_factor_10 = 2;
  

[FFmpeg-devel] [PATCH] checkasm: add a verbose check function for uint32_t data

2022-10-28 Thread James Darnley
---
 tests/checkasm/checkasm.c | 1 +
 tests/checkasm/checkasm.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 421bd096c5..c3d77cb6af 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -918,5 +918,6 @@ int checkasm_check_##type(const char *const file, const int 
line, \
 
 DEF_CHECKASM_CHECK_FUNC(uint8_t,  "%02x")
 DEF_CHECKASM_CHECK_FUNC(uint16_t, "%04x")
+DEF_CHECKASM_CHECK_FUNC(uint32_t, "%08x")
 DEF_CHECKASM_CHECK_FUNC(int16_t,  "%6d")
 DEF_CHECKASM_CHECK_FUNC(int32_t,  "%9d")
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index ee9151410e..5f68115035 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -296,6 +296,7 @@ int checkasm_check_##type(const char *const file, const int 
line, \
 
 DECL_CHECKASM_CHECK_FUNC(uint8_t);
 DECL_CHECKASM_CHECK_FUNC(uint16_t);
+DECL_CHECKASM_CHECK_FUNC(uint32_t);
 DECL_CHECKASM_CHECK_FUNC(int16_t);
 DECL_CHECKASM_CHECK_FUNC(int32_t);
 
-- 
2.38.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] avutil/tests/cpu: print the avx512icl flag

2022-10-28 Thread James Darnley
---
 libavutil/tests/cpu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavutil/tests/cpu.c b/libavutil/tests/cpu.c
index 5bec742b2b..dadadb31dc 100644
--- a/libavutil/tests/cpu.c
+++ b/libavutil/tests/cpu.c
@@ -77,6 +77,7 @@ static const struct {
 { AV_CPU_FLAG_BMI2,  "bmi2"   },
 { AV_CPU_FLAG_AESNI, "aesni"  },
 { AV_CPU_FLAG_AVX512,"avx512" },
+{ AV_CPU_FLAG_AVX512ICL, "avx512icl"  },
 { AV_CPU_FLAG_SLOW_GATHER, "slowgather" },
 #elif ARCH_LOONGARCH
 { AV_CPU_FLAG_LSX,   "lsx"},
-- 
2.38.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] mailmap: stop git lying about who I commit things as

2022-10-28 Thread James Darnley
---
 .mailmap | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.mailmap b/.mailmap
index ba072f38c8..af60290f77 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1,4 +1,3 @@
- 
  
  
  
-- 
2.38.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] RFC: v210enc optimisations and initial AVX-512

2022-10-26 Thread James Darnley
I guess it could also be scaled to ymm if you're a big Skylake fan :P 
(in which case you'd probably want to reorder the shuffle indices so  
that chroma comes first, i.e. movq [u] + movhps [v] + vinserti32x4[y])


What shuffle or permute did you have in mind when you suggested this for 
Skylake?  Without the permute I'm not sure how the change in ordering 
helps.  Aren't we stuck with data in separate lanes?  I'm probably 
missing something though.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] Discrepancy between comments for AVX512 flags

2022-08-26 Thread James Darnley
While cherry-picking some stuff for avx512 I have noticed that ffmpeg 
has a discrepancy in the comments for the two avx512 flags.


Let's start with the public header

libavutil/cpu.h
  56│ #define AV_CPU_FLAG_AVX512 0x100000 ///< AVX-512 functions: requires 
OS support even if YMM/ZMM registers aren't used
  57│ #define AV_CPU_FLAG_AVX512ICL  0x200000 ///< 
F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ


This seems to imply the first only detects ZMM support and the second 
groups all instruction sets together.  This appears to be different to 
what we imply in internal code

libavutil/x86/cpu.c
 151│ #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
libavutil/x86/x86inc.asm
 840│ %assign cpuflags_avx512(1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL 


The detection code itself has

libavutil/x86/cpu.c
 151│ #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
 152│ if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
 153│ if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 
0xd0030000) {
 154│ rval |= AV_CPU_FLAG_AVX512;
 155│ #if HAVE_AVX512ICL
 156│ if ((ebx & 0xd0200000) == 0xd0200000 && (ecx & 0x5f42) == 
0x5f42)
 157│ rval |= AV_CPU_FLAG_AVX512ICL;


If you decode the bits being checked you'll see that the base avx512 
checks ebx for F DQ CD BW VL and avx512icl checks ebx for IFMA CD BW VL 
and ecx for VBMI VBMI2 GFNI VAES VPCLMULQDQ VNNI BITALG VPOPCNTDQ.  The 
first matches what the internal comments imply.


Part of the difference is my fault and dates from when the flag was 
first added.


Has there been a discussion about which features should go with which flag?
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] avfilter/vf_subtitles: add an option to choose sub stream by language

2022-04-18 Thread James Darnley
---
 doc/filters.texi   |  5 +
 libavfilter/vf_subtitles.c | 23 ---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/doc/filters.texi b/doc/filters.texi
index a161754233..cfbc807f16 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -21160,6 +21160,11 @@ Override default style or script info parameters of 
the subtitles. It accepts a
 string containing ASS style format @code{KEY=VALUE} couples separated by ",".
 @end table
 
+@item language
+Use first stream with the given language, ISO language code. @code{subtitles}
+filter only. Requires the language metadata to be read from the file.
+@end table
+
 If the first key is not specified, it is assumed that the first value
 specifies the @option{filename}.
 
diff --git a/libavfilter/vf_subtitles.c b/libavfilter/vf_subtitles.c
index 82e140e986..95f0a940d9 100644
--- a/libavfilter/vf_subtitles.c
+++ b/libavfilter/vf_subtitles.c
@@ -54,6 +54,7 @@ typedef struct AssContext {
 char *fontsdir;
 char *charenc;
 char *force_style;
+char *language;
 int stream_index;
 int alpha;
 uint8_t rgba_map[4];
@@ -271,6 +272,7 @@ static const AVOption subtitles_options[] = {
 {"stream_index", "set stream index", OFFSET(stream_index), 
AV_OPT_TYPE_INT,{ .i64 = -1 }, -1,   INT_MAX,  FLAGS},
 {"si",   "set stream index", OFFSET(stream_index), 
AV_OPT_TYPE_INT,{ .i64 = -1 }, -1,   INT_MAX,  FLAGS},
 {"force_style",  "force subtitle style", OFFSET(force_style),  
AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS},
+{"language", "use first stream of this language", OFFSET(language), 
AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS},
 {NULL},
 };
 
@@ -340,9 +342,8 @@ static av_cold int init_subtitles(AVFilterContext *ctx)
 goto end;
 
 /* Locate subtitles stream */
-if (ass->stream_index < 0)
-ret = av_find_best_stream(fmt, AVMEDIA_TYPE_SUBTITLE, -1, -1, NULL, 0);
-else {
+/* If the user has specified a particular stream use that. */
+if (ass->stream_index >= 0) {
 ret = -1;
 if (ass->stream_index < fmt->nb_streams) {
 for (j = 0; j < fmt->nb_streams; j++) {
@@ -357,6 +358,22 @@ static av_cold int init_subtitles(AVFilterContext *ctx)
 }
 }
 
+/* Otherwise find the first stream with the given language code. */
+else if (ass->language) {
+ret = -1;
+for (j = 0; j < fmt->nb_streams; j++) {
+const AVDictionaryEntry *lang = 
av_dict_get(fmt->streams[j]->metadata, "language", NULL, 0);
+if (lang && !strcmp(lang->value, ass->language)) {
+ret = j;
+break;
+}
+}
+}
+
+/* Finally fall back to the "best" stream. */
+else
+ret = av_find_best_stream(fmt, AVMEDIA_TYPE_SUBTITLE, -1, -1, NULL, 0);
+
 if (ret < 0) {
 av_log(ctx, AV_LOG_ERROR, "Unable to locate subtitle stream in %s\n",
ass->filename);
-- 
2.35.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 1/3] avcodec/bitpacked: ,

2020-06-03 Thread James Darnley
On 2020-06-04 01:19, Michael Niedermayer wrote:
> Fixes: array end overread
> Fixes: 
> 22395/clusterfuzz-testcase-minimized-ffmpeg_AV_CODEC_ID_BITPACKED_fuzzer-5760940300828672
> 
> Found-by: continuous fuzzing process 
> https://github.com/google/oss-fuzz/tree/master/projects/ffmpeg
> Signed-off-by: Michael Niedermayer 
> ---
>  libavcodec/bitpacked.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/libavcodec/bitpacked.c b/libavcodec/bitpacked.c
> index be7d1e3629..952ba73a32 100644
> --- a/libavcodec/bitpacked.c
> +++ b/libavcodec/bitpacked.c
> @@ -147,7 +147,7 @@ AVCodec ff_bitpacked_decoder = {
>  .decode = bitpacked_decode,
>  .capabilities = AV_CODEC_CAP_EXPERIMENTAL,
>  .codec_tags = (const uint32_t []){
> -MKTAG('U', 'Y', 'V', 'Y')
> +MKTAG('U', 'Y', 'V', 'Y'),
>  FF_CODEC_TAGS_END,
>  },
>  };
> 

I think you should add to the commit title.  Something like "add missing
comma to codec tags".

Other than that this looks fine.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [FFmpeg-cvslog] pthread_frame: merge the functionality for normal decoder init and init_thread_copy

2020-06-03 Thread James Darnley
On 2020-04-10 16:53, Anton Khirnov wrote:
> ffmpeg | branch: master | Anton Khirnov  | Mon Jan  9 
> 18:04:42 2017 +0100| [1f4cf92cfbd3accbae582ac63126ed5570ddfd37] | committer: 
> Anton Khirnov
> 
> pthread_frame: merge the functionality for normal decoder init and 
> init_thread_copy
> 
> The current design, where
> - proper init is called for the first per-thread context
> - first thread's private data is copied into private data for all the
>   other threads
> - a "fixup" function is called for all the other threads to e.g.
>   allocate dynamically allocated data
> is very fragile and hard to follow, so it is abandoned. Instead, the
> same init function is used to init each per-thread context. Where
> necessary, AVCodecInternal.is_copy can be used to differentiate between
> the first thread and the other ones (e.g. for decoding the extradata
> just once).
> 
>> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=1f4cf92cfbd3accbae582ac63126ed5570ddfd37

This commit has caused unexpected behavior in one use of the API that I
encountered.

The AVCodecContexts that are used for get_buffer2 calls have different
delay values in them.  Setting 2 threads I see the value alternating
between 0 and 1 for every call.

That constant changing value, from the point of view of the thing
reading it, is what is causing the unexpected behavior.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] swscale/x86/yuv2rgb: Fix build without SSSE3

2020-02-23 Thread James Darnley
On 2020-02-23 18:58, Michael Niedermayer wrote:
> On Sun, Feb 23, 2020 at 05:03:36PM +0100, Carl Eugen Hoyos wrote:
>> Am So., 23. Feb. 2020 um 13:30 Uhr schrieb Michael Niedermayer
>> :
>>>
>>> From: Parker Ernest <@>
>>>
>>> commit fc6a5883d6af8cae0e96af84dda0ad74b360a084 breaks build on
>>> x86_64 CPUs which do not have SSSE3, e.g. AMD Phenom-II
>>
>> Does the commit break build on specific CPUs or specific toolchains?
> 
> I dont know what the testcase was the author encountered, i just posted
> this here as the author wanted me to post it for him.
> but a simple
> make distclean ; ./configure --disable-ssse3 && make -j32
> replicates the build failure here (see below for the errors)

Okay, it breaks the build when you do --disable-ssse3.  I see that too.

It is okay to fix that any way you want.  This patch is fine by me but
please don't imply that it fixes a run time error in the commit message,
which is what I first thought.

I see a discussion has sprung up on the best way to fix it so I guess
that has to be resolved first.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] Add .mailmap

2020-02-23 Thread James Darnley
On 2020-02-23 15:12, Jean-Baptiste Kempf wrote:
> Yo,
> 
> On Sat, Feb 22, 2020, at 22:18, Josh de Kock wrote:
>> This allows for easy shortlog/log parsing, useful in determining
>> eligible members of the general assembly for the new FFmpeg voting
>> system.
> 
> I think this is a good idea.
> But are you sure all of those are in the right order? (aka preferred email is 
> shown)
> 

What is "preferred email" when you have 2 roles?  My commits on the job
get obe.tv (or are supposed to) and ones made in my own time get
gmail.com (or are supposed to).

Is it: when you screw up what email should you be shouted at on?

I guess since I probably send more discussion email from gmail.com,
maybe it is that one.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] swscale/x86/yuv2rgb: Fix build without SSSE3

2020-02-23 Thread James Darnley
On 2020-02-23 13:22, Michael Niedermayer wrote:
> From: Parker Ernest <@>
> 
> commit fc6a5883d6af8cae0e96af84dda0ad74b360a084 breaks build on
> x86_64 CPUs which do not have SSSE3, e.g. AMD Phenom-II
> 
> Signed-off-by: Michael Niedermayer 
> ---
>  libswscale/x86/yuv2rgb.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c
> index c12e88cbb5..4791e5b93a 100644
> --- a/libswscale/x86/yuv2rgb.c
> +++ b/libswscale/x86/yuv2rgb.c
> @@ -83,6 +83,7 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
>  #if HAVE_X86ASM
>  int cpu_flags = av_get_cpu_flags();
>  
> +#if HAVE_SSSE3
>  if (EXTERNAL_SSSE3(cpu_flags)) {
>  switch (c->dstFormat) {
>  case AV_PIX_FMT_RGB32:
> @@ -111,6 +112,7 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
>  return yuv420_rgb15_ssse3;
>  }
>  }
> +#endif
>  
>  if (EXTERNAL_MMXEXT(cpu_flags)) {
>  switch (c->dstFormat) {
> 

What?  Why doesn't the EXTERNAL_SSSE3 macro stop the code from
entering that branch?  The #if would only stop the section from being
compiled with --disable-ssse3.  A normal build would still enter that
branch on that CPU.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Followup: FOSDEM meeting

2020-02-22 Thread James Darnley
On 2020-02-22 13:25, Paul B Mahol wrote:
> On 2/22/20, James Darnley  wrote:
>> On 2020-02-22 11:11, Thilo Borgmann wrote:
>>> Please someone put an IRC log from the meeting there, too. James Darnley?
>>> Also the audio was streamed, somebody might remember where too exactly.
>>> Michael?
>>
>> I can post my log from the day, probably email attachment.  Should I
>> remove any of the lines from it, particularly after the meeting
>> concluded?  There was a little chat afterwards and into the early evening.
>>
> 
> Consor my entries.

> [Sat 22 18:00] <@durandal_1707> J_Darnley: no censoring allowed
That is more clear

Attached is the log for the entire day.  I don't think anything needs
removing so it is complete.

[14:10:59]  hello
[14:11:01]  https://hangouts.google.com/call/jYaO0pADYZELBBfsntHgAEEI
[14:11:13]  hullo
[14:11:40]  I can't invite, need op
[14:13:07]  ugh google wants me phone#
[14:13:12]  my
[14:14:23]  just use talky.io
[14:14:47]  I hope I'm showing up as muted since this UI isn't making me sure if I am or not (I should be)
[14:15:03]  Do you people hear us?
[14:15:20]  no audio so far
[14:15:34]  no
[14:15:35]  No 
[14:15:51]  I'm just following irc, not the hangout unfortunately
[14:16:10]  ok, james's video feed picked up
[14:16:24]  JEEB: with sound ?
[14:16:42]  neat
[14:16:56]  no sound still but I can just attempt to re-join
[14:17:19]  nope
[14:17:27]  ok, audio
[14:17:28]  yes
[14:17:31]  yeah
[14:17:32]  have audio
[14:20:13]  I'm in. idling with mic off
[14:26:55]  usually what you do is have a nomination committee that asks people in advance and then present the nominees
[14:27:53]  Can everybody hear?
[14:28:14]  I can hear
[14:28:20]  voting 1: 3d, vote 2: a week, so seems like the conn is working here :)
[14:28:21]  I can too
[14:28:23]  Atm we don’t copy into irc what is said
[14:29:08]  (v1 was IIRC people nominated who might not otherwise show up on voting list, v2 was committees, right?)
[14:29:15]  git log --since="last 36 months" --author="name" --oneline | wc -l
[14:29:16]  yes
[14:29:18]  Jeeb: Please write short summaries about what you hear
[14:29:24]  the hangout in the topic is empty btw
[14:29:31]  (mobile phone here)
[14:29:36]  BBB: https://hangouts.google.com/call/jYaO0pADYZELBBfsntHgAEEI
[14:30:16]  cehoyos: will attempt.
[14:30:22]  git log --no-merges  --since=2020-01-25T00:00:00Z --until 2020-02-01T00:00:00Z --pretty=fuller | grep '^Author:' | sed 's/<.*//' |sort | uniq -c | sort -nr
[14:31:03]  Ty
[14:31:06]  j-b noting - CoC more like a values list as opposed to specific rules. there will be a suggestion which would then be voted on
[14:33:08]  Lynne noting - various audio decoders do checks already done avcodec common utils
[14:33:17]  (if I acught that right)
[14:33:41]  i have some difficulty understanding lynne with my headphones
[14:35:24]  michaelni: the sample rate and other checks in audio decoders that are now checked internally by the API so they should be removed
[14:35:39]  you added them, I pinged you on IRC and you didn't remove them
[14:36:07]  Lynne, i dont remember abouzt the ping but yes if there are redundant checks i should remove them
[14:36:15]  ping me again until i react!
[14:36:42]  for new joiners: since the topic is out of date if you want to join muted the URL is https://hangouts.google.com/call/jYaO0pADYZELBBfsntHgAEEI
[14:36:59]  patches would not be "lost" if we move to gitlab, for example
[14:37:32]  gitlab move: I guess main part being discussed atm being merge requests
[14:37:44]  if patches are handled by say gitlab, is it possible to subscribe via rss/atom?
[14:38:01]  I think yes, you can cehck with videolan's gitlab instance
[14:38:45]  couldn't find RSS/atom right away, but they have JSON https://code.videolan.org/videolan/x264/merge_requests.json
[14:38:50]  ugh
[14:38:56]  (just giving x264 as an example)
[14:39:02]  I keep track of mxf issues over rss
[14:39:11]  which is really handy
[14:39:21]  thardin: there are atom feeds for project activity, not sure if there's one *specific* to MRs
[14:39:27]  ah
[14:39:33]  haasn: that might be enough
[14:39:46]  rss readers typically haev filters
[14:39:57]  i dont see the problem with the existing infrastructure, so i dont see why we should move to gitlab
[14:40:05]  e.g. https://code.videolan.org/videolan/dav1d.atom
[14:40:47]  I run a gitlab instance at uni, and one thing I've found with gitlab is that it's.. a big thing. like it sometimes breaks for seemingly random reasons
[14:42:16]  yes, it's a very large ruby on rails thing, which is why I would hopefully share the system with another project, like videolan
[14:42:31]  that sounds like a good idea
[14:43:01]  I upgraded our instance when the last ubuntu lts came out, which was a bit of a chore but now I don't

Re: [FFmpeg-devel] Followup: FOSDEM meeting

2020-02-22 Thread James Darnley
On 2020-02-22 11:11, Thilo Borgmann wrote:
> Please someone put an IRC log from the meeting there, too. James Darnley?
> Also the audio was streamed, somebody might remember where too exactly. 
> Michael?

I can post my log from the day, probably email attachment.  Should I
remove any of the lines from it, particularly after the meeting
concluded?  There was a little chat afterwards and into the early evening.

I didn't record the audio but it was broadcast on Google hangouts.  I
don't know whether it records.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] What new instructions would you like?

2020-02-01 Thread James Darnley
On 30/12/2019, Lauri Kasanen  wrote:
> Hi,
>
> For the Libre RISC-V project, I'm going to research the popular codecs
> and design new instructions to help speed them up. With ffmpeg being
> home to lots of asm folks for many platforms, I also want to ask your
> opinion.
>
> What new instructions would you like? Anything particular you find
> missing in existing ISAs, slow, or cumbersome?

Do you mean SIMD instructions?  I have no idea what exists in RISC-V
already or what capabilities or limitations it has, and I am going to
use x86 language and terms such as byte, word, dword, qword.

Things I have found missing in old(er) x86 instruction sets are
missing word size and signed/unsigned variants for existing
operations.  Some operations may have byte and word variants but dword
and qword might be missing, or there might be a signed version but not
an unsigned version (and vice versa).  A couple of things I had to
emulate:
* packed absolute value of dwords
* packed maximum unsigned words
* packed max and min signed dwords (I might have really wanted
unsigned for this)
* arithmetic right shift of qwords
* pack dwords to words with unsigned saturation

Shuffle instructions.  pshufb is very useful and I think I read on IRC
that arm/aarch64/neon does not have an equivalent.  (Or was that other
shuffles?)  It allows for arbitrary reordering of bytes and setting
bytes to 0.  On x86 it takes the shuffle pattern from another SIMD
register but I usually use it with a constant pattern that gets loaded
from memory.  An interesting improvement would be if you can encode 17
* 16 (or however long your vectors might be) values in an immediate
value so it doesn't require another register.

Good documentation.  The intel instruction manual has pretty good
explanation of what the instructions do.  The old instructions from
around the time of MMX and SSE had excellent diagrams, these might
have been mostly for shuffle operations.  I need to look and jog my
memory.  I think punpcklbw is an example of what I mean.  The entry in
the manual for it has a good diagram IMO.  (At least the version I am
currently looking at)

No stupid lane stuff.  AVX2 brought us a SIMD vector length extension
from 16 to 32 bytes.  Good except for the stupid lanes they were split
into making it hard to "mix" data from the low 0-15 bytes and the high
16-31 bytes.

I forgot about this email for a month.  Sorry about that.  Seeing
RISC-V in the schedule at FOSDEM reminded me about this.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [IMPORTANT] FOSDEM meeting

2020-02-01 Thread James Darnley
On 28/01/2020, Liu Steven  wrote:
>
>
>> 在 2020年1月27日,下午3:29,Jean-Baptiste Kempf  写道:
>> It will be joinable through some VideoConf tool.
> Can we join by IRC or other things on internet?
> Because these days are Spring Festival (Chinese New Year, Important
> festivals that have lasted for thousands of years),
> The more important reason is New infectious virus epidemic areas here. :(

Since I don't think it was said yet: yes, there will be participation
on IRC.  At the very least I plan to be there and will relay things
to<->from #ffmpeg-meeting on freenode.

Other people are responsible for other solutions.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH, v3, 1/7] lavu/pixfmt: add new pixel format 0yuv/y210/y410

2019-12-05 Thread James Darnley
On 2019-12-04 15:43, Linjie Fu wrote:
> Previously, media driver provided planar format(like 420 8 bit),
> but for HEVC Range Extension (422/444 8/10 bit), the decoded image
> is produced in packed format because Windows expects it.
> 
> Add some packed pixel formats for hardware decode support in VAAPI
> and QSV:
> 
> 4:2:2 10 bit: Y210
> 4:4:4  8 bit: 0YUV
> 4:4:4 10 bit: Y410
> 

> +[AV_PIX_FMT_Y410LE] = {
> +.name = "y410le",
> +.nb_components = 4,
> +.log2_chroma_w = 0,
> +.log2_chroma_h = 0,
> +.comp = {
> +{ 0, 32, 10, 0, 10, 31, 9, 11 },/* Y */
> +{ 0, 32,  0, 0, 10, 31, 9,  1 },/* U */
> +{ 0, 32, 20, 0, 10, 31, 9, 21 },/* V */
> +{ 0, 32, 30, 0,  2, 31, 1, 31 },/* A */
> +},
> +.flags = AV_PIX_FMT_FLAG_ALPHA | AV_PIX_FMT_FLAG_BITSTREAM,
> +},



> diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
> index d78e863..a163350 100644
> --- a/libavutil/pixfmt.h
> +++ b/libavutil/pixfmt.h
> @@ -348,6 +348,12 @@ enum AVPixelFormat {
>  AV_PIX_FMT_NV24,  ///< planar YUV 4:4:4, 24bpp, 1 plane for Y and 1 
> plane for the UV components, which are interleaved (first byte U and the 
> following byte V)
>  AV_PIX_FMT_NV42,  ///< as above, but U and V bytes are swapped
>  
> +AV_PIX_FMT_Y210BE,///< packed YUV 4:2:2, 32bpp, Y0 Cb Y1 Cr, 
> big-endian
> +AV_PIX_FMT_Y210LE,///< packed YUV 4:2:2, 32bpp, Y0 Cb Y1 Cr, 
> little-endian
> +AV_PIX_FMT_0YUV,  ///< packed YUV 4:4:4, 32bpp,  X  Y Cb Cr, 
> X=unused/undefined
> +AV_PIX_FMT_Y410LE,///< packed YUV 4:4:4, 32bpp, Cr  Y Cb  A, 
> little-endian
> +AV_PIX_FMT_Y410BE,///< packed YUV 4:4:4, 32bpp, Cr  Y Cb  A, 
> big-endian
> +
>  AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if 
> you want to link with shared libav* because the number of formats might 
> differ between versions
>  };
>  

I will ask again.  From
> http://ffmpeg.org/pipermail/ffmpeg-devel/2019-June/245929.html

> Why am I suspicious that at least one of those is a re-ordered v210?  I
> seem to recall that we rejected adding v210 to this list.  Either they
> don't belong in this list or they don't belong because libavcodec has a
> proper decoder (at least for v210).
> 
> This might be the thread I was remembering but March seems too recent
>> https://ffmpeg.org/pipermail/ffmpeg-devel/2019-March/241549.html
> 
> No real conclusion was reached there.
> 
> Do bit-packed formats belong in an AVPixelFormat?

Despite what was said last time I do believe this is packed.  I have
taken a little time to actually understand these magic number structs.

y410 is clearly packed like v210.  Look at those offsets: 0, 10, 20,
30.  Packed into a 32-bit word.  Flagged with AV_PIX_FMT_FLAG_BITSTREAM.

How is that any different to v210?  Can you address a single sample in
that 1 plane format without using shifts and bit-wise ands?  Isn't that
the definition of packed?  I do not mean interleaved.

Okay, y410 is a little better in that it is 444 so the sample order does
not change through a 6-word cycle.  Is that the key difference?



Do bit-packed formats belong in an AVPixelFormat?

If yes then I do not object to this patch or any others like this.

If no then why is this not rejected?



Does the AV_PIX_FMT_FLAG_BITSTREAM flag mean they do belong?  I admit I
haven't seen this before so maybe I should shut up and not send this email.




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [Contract Request] for FFmpeg libmp3lame multi-threaded feature implementation

2019-11-25 Thread James Darnley
On 2019-11-25 13:52, Chandra Nakka wrote:
> Dear FFmpeg developers,
> 
> I'm very happy to have found your details on FFmpeg website for requesting
> FFmpeg feature implementation.
> 
> Currently I'm using FFmpeg command line tool on my linux servers to process
> media files into instant mp3 audio files by using FFmpeg piping feature.
> But, currently libmp3lame encoder support single thread only for encoding
> audio stream to mp3 file. This is the great drawback for my project.
> 
> I have more than 100+ linux servers for processing audio streams to mp3
> files. Each server has 8 physical CPU cores. But, due to libmp3lame single
> thread limitation my project mp3 conversion speed becomes too lazy
> and remaining cores on servers are becomes useless.
> 
> Actually I'm a web developer. I have no idea on FFmpeg tools tech
> languages.  So, I'm looking for FFmpeg developer who can implement
> libmp3lame multi-threaded feature on FFmpeg. I'm ready to pay for this
> feature.
> 
> Looking forward to hearing from you.
> 
> Thank you,
> Chandra N.

https://www.gnu.org/software/parallel/

That'll be $1, thank you.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avutil/eval: add sgn()

2019-10-12 Thread James Darnley
On 2019-10-11 21:45, Paul B Mahol wrote:

> diff --git a/doc/utils.texi b/doc/utils.texi
> index d55dd315c3..4e2e713505 100644
> --- a/doc/utils.texi
> +++ b/doc/utils.texi
> @@ -920,6 +920,9 @@ corresponding input value will be returned.
>  @item round(expr)
>  Round the value of expression @var{expr} to the nearest integer. For 
> example, "round(1.5)" is "2.0".
>  
> +@item sgn(x)
> +Compute sign of @var{x}.
> +
>  @item sin(x)
>  Compute sine of @var{x}.
>  

Too late now but, since we have round() just above it which is 5 chars,
couldn't you have made this sign()?

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] avcodec/h264: fix draw_horiz_band with slice threads

2019-09-02 Thread James Darnley
From: Kieran Kunhya 

---
 libavcodec/h264_slice.c | 29 +++--
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c
index 5ceee107a0..fe2aa01ceb 100644
--- a/libavcodec/h264_slice.c
+++ b/libavcodec/h264_slice.c
@@ -2527,18 +2527,33 @@ static void predict_field_decoding_flag(const 
H264Context *h, H264SliceContext *
 /**
  * Draw edges and report progress for the last MB row.
  */
-static void decode_finish_row(const H264Context *h, H264SliceContext *sl)
+static void decode_finish_row(const H264Context *h, H264SliceContext *sl, int 
slice_end)
 {
 int top= 16 * (sl->mb_y  >> FIELD_PICTURE(h));
 int pic_height = 16 *  h->mb_height >> FIELD_PICTURE(h);
 int height =  16  << FRAME_MBAFF(h);
 int deblock_border = (16 + 4) << FRAME_MBAFF(h);
 
-if (sl->deblocking_filter) {
+/* Slice-threaded draw_horiz_band not useful in this situation */
+if (sl->deblocking_filter == 1) {
 if ((top + height) >= pic_height)
 height += deblock_border;
 top -= deblock_border;
 }
+else if (sl->deblocking_filter == 2) {
+int first_mb_y = sl->first_mb_addr / h->mb_width;
+
+/* Draw the whole slice if it's possible:
+ * - If the beginning of the slice is at the start of a row
+ * - If we are at the end of the slice
+ * Previous slice is guaranteed not be included. */
+if (!(sl->first_mb_addr % h->mb_width)) {
+if (slice_end) {
+top = 16 * (first_mb_y >> FIELD_PICTURE(h));
+height = (16 << FRAME_MBAFF(h)) * ((sl->mb_y+1) - first_mb_y);
+}
+}
+}
 
 if (top >= pic_height || (top + height) < 0)
 return;
@@ -2549,7 +2564,8 @@ static void decode_finish_row(const H264Context *h, 
H264SliceContext *sl)
 top= 0;
 }
 
-ff_h264_draw_horiz_band(h, sl, top, height);
+if (slice_end)
+ff_h264_draw_horiz_band(h, sl, top, height);
 
 if (h->droppable || sl->h264->slice_ctx[0].er.error_occurred)
 return;
@@ -2622,7 +2638,7 @@ static int decode_slice(struct AVCodecContext *avctx, 
void *arg)
 
 for (;;) {
 // START_TIMER
-int ret, eos;
+int ret, eos, slice_end;
 if (sl->mb_x + sl->mb_y * h->mb_width >= sl->next_slice_idx) {
 av_log(h->avctx, AV_LOG_ERROR, "Slice overlaps with next at 
%d\n",
sl->next_slice_idx);
@@ -2669,10 +2685,11 @@ static int decode_slice(struct AVCodecContext *avctx, 
void *arg)
 return AVERROR_INVALIDDATA;
 }
 
+slice_end = eos || sl->mb_y >= h->mb_height;
 if (++sl->mb_x >= h->mb_width) {
 loop_filter(h, sl, lf_x_start, sl->mb_x);
 sl->mb_x = lf_x_start = 0;
-decode_finish_row(h, sl);
+decode_finish_row(h, sl, slice_end);
 ++sl->mb_y;
 if (FIELD_OR_MBAFF_PICTURE(h)) {
 ++sl->mb_y;
@@ -2729,7 +2746,7 @@ static int decode_slice(struct AVCodecContext *avctx, 
void *arg)
 if (++sl->mb_x >= h->mb_width) {
 loop_filter(h, sl, lf_x_start, sl->mb_x);
 sl->mb_x = lf_x_start = 0;
-decode_finish_row(h, sl);
+decode_finish_row(h, sl, 0);
 ++sl->mb_y;
 if (FIELD_OR_MBAFF_PICTURE(h)) {
 ++sl->mb_y;
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] avcodec/h264: enable draw_horiz_band

2019-09-02 Thread James Darnley
---
 libavcodec/h264dec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c
index 8d1bd16a8e..b9f304936c 100644
--- a/libavcodec/h264dec.c
+++ b/libavcodec/h264dec.c
@@ -1056,7 +1056,7 @@ AVCodec ff_h264_decoder = {
 .init  = h264_decode_init,
 .close = h264_decode_end,
 .decode= h264_decode_frame,
-.capabilities  = /*AV_CODEC_CAP_DRAW_HORIZ_BAND |*/ 
AV_CODEC_CAP_DR1 |
+.capabilities  = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 |
  AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SLICE_THREADS |
  AV_CODEC_CAP_FRAME_THREADS,
 .hw_configs= (const AVCodecHWConfigInternal*[]) {
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 0/2] WIP: h264, slice threads, draw_horiz_band

2019-09-02 Thread James Darnley
Trying a combination of sliced threads, chunk decoding, and draw_horiz_band we
found that it didn't work with the current master code.  Modifying the
api-h264-slice fate test showed obvious errors with grey and green blocks and
more subtle ones that looked like misplaced macroblocks.

Kieran identified the cause and coded this quick fix.  He said that essentially
the code would give a region to draw_horiz_band which could include the previous
slice even if it hadn't been finished yet.

This corrects that problem and lets us decode exactly.  However it does cause
errors decoding B-frames in chunked mode.

Needs more work.

James Darnley (1):
  avcodec/h264: enable draw_horiz_band

Kieran Kunhya (1):
  avcodec/h264: fix draw_horiz_band with slice threads

 libavcodec/h264_slice.c | 29 +++--
 libavcodec/h264dec.c|  2 +-
 2 files changed, 24 insertions(+), 7 deletions(-)

-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 3/7] x86inc: Improve SAVE/LOAD_MM_PERMUTATION macros

2019-08-05 Thread James Darnley
From: Henrik Gramner 

Use register numbers instead of copying the full register names. This makes it
possible to change register widths in the middle of a function and keep the
mmreg permutations intact which can be useful for code that only needs larger
vectors for parts of the function in combination with macros etc.

Also change the LOAD_MM_PERMUTATION macro to use the same default name as the
SAVE macro. This simplifies swapping from ymm to xmm registers or vice versa:

SAVE_MM_PERMUTATION
INIT_XMM 
LOAD_MM_PERMUTATION
---
 libavutil/x86/x86inc.asm | 23 ++-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 39cba5db09..10b7711637 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1081,19 +1081,32 @@ INIT_XMM
 %endif
 %assign %%i 0
 %rep num_mmregs
-CAT_XDEFINE %%f, %%i, m %+ %%i
+%xdefine %%tmp m %+ %%i
+CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
 %assign %%i %%i+1
 %endrep
 %endmacro
 
-%macro LOAD_MM_PERMUTATION 1 ; name to load from
-%ifdef %1_m0
+%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
+%if %0
+%xdefine %%f %1_m
+%else
+%xdefine %%f current_function %+ _m
+%endif
+%xdefine %%tmp %%f %+ 0
+%ifnum %%tmp
+RESET_MM_PERMUTATION
 %assign %%i 0
 %rep num_mmregs
-CAT_XDEFINE m, %%i, %1_m %+ %%i
-CAT_XDEFINE nn, m %+ %%i, %%i
+%xdefine %%tmp %%f %+ %%i
+CAT_XDEFINE %%m, %%i, m %+ %%tmp
 %assign %%i %%i+1
 %endrep
+%rep num_mmregs
+%assign %%i %%i-1
+CAT_XDEFINE m, %%i, %%m %+ %%i
+CAT_XDEFINE nn, m %+ %%i, %%i
+%endrep
 %endif
 %endmacro
 
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 7/7] x86inc: Add support for GFNI instructions

2019-08-05 Thread James Darnley
From: Henrik Gramner 

---
 libavutil/x86/x86inc.asm | 30 +-
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index d1b4c982fc..8c8cc97e0c 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -820,19 +820,20 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, 
jge, jng, jnge, ja, jae,
 %assign cpuflags_sse4 (1<<10)| cpuflags_ssse3
 %assign cpuflags_sse42(1<<11)| cpuflags_sse4
 %assign cpuflags_aesni(1<<12)| cpuflags_sse42
-%assign cpuflags_avx  (1<<13)| cpuflags_sse42
-%assign cpuflags_xop  (1<<14)| cpuflags_avx
-%assign cpuflags_fma4 (1<<15)| cpuflags_avx
-%assign cpuflags_fma3 (1<<16)| cpuflags_avx
-%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt
-%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1
-%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2
-%assign cpuflags_avx512   (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
-
-%assign cpuflags_cache32  (1<<21)
-%assign cpuflags_cache64  (1<<22)
-%assign cpuflags_aligned  (1<<23) ; not a cpu feature, but a function variant
-%assign cpuflags_atom (1<<24)
+%assign cpuflags_gfni (1<<13)| cpuflags_sse42
+%assign cpuflags_avx  (1<<14)| cpuflags_sse42
+%assign cpuflags_xop  (1<<15)| cpuflags_avx
+%assign cpuflags_fma4 (1<<16)| cpuflags_avx
+%assign cpuflags_fma3 (1<<17)| cpuflags_avx
+%assign cpuflags_bmi1 (1<<18)| cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<19)| cpuflags_bmi1
+%assign cpuflags_avx2 (1<<20)| cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512   (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL
+
+%assign cpuflags_cache32  (1<<22)
+%assign cpuflags_cache64  (1<<23)
+%assign cpuflags_aligned  (1<<24) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<25)
 
 ; Returns a boolean value expressing whether or not the specified cpuflag is 
enabled.
 %definecpuflag(x) (cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 
1) >> 31) & 1)
@@ -1418,6 +1419,9 @@ AVX_INSTR divss, sse, 1, 0, 0
 AVX_INSTR dppd, sse4, 1, 1, 0
 AVX_INSTR dpps, sse4, 1, 1, 0
 AVX_INSTR extractps, sse4, 1
+AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
 AVX_INSTR haddpd, sse3, 1, 0, 0
 AVX_INSTR haddps, sse3, 1, 0, 0
 AVX_INSTR hsubpd, sse3, 1, 0, 0
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/7] x86inc: Optimize VEX instruction encoding

2019-08-05 Thread James Darnley
From: Henrik Gramner 

Most VEX-encoded instructions require an additional byte to encode when src2
is a high register (e.g. x|ymm8..15). If the instruction is commutative we
can swap src1 and src2 when doing so reduces the instruction length, e.g.

vpaddw xmm0, xmm0, xmm8 -> vpaddw xmm0, xmm8, xmm0
---
 libavutil/x86/x86inc.asm | 35 +--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index bc370a6186..39cba5db09 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1244,9 +1244,40 @@ INIT_XMM
 %elif %0 >= 9
 __instr %6, %7, %8, %9
 %elif %0 == 8
-__instr %6, %7, %8
+%if avx_enabled && %5
+%xdefine __src1 %7
+%xdefine __src2 %8
+%ifnum regnumof%7
+%ifnum regnumof%8
+%if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 
&& sizeof%8 <= 32
+; Most VEX-encoded instructions require an additional 
byte to encode when
+; src2 is a high register (e.g. m8..15). If the 
instruction is commutative
+; we can swap src1 and src2 when doing so reduces the 
instruction length.
+%xdefine __src1 %8
+%xdefine __src2 %7
+%endif
+%endif
+%endif
+__instr %6, __src1, __src2
+%else
+__instr %6, %7, %8
+%endif
 %elif %0 == 7
-__instr %6, %7
+%if avx_enabled && %5
+%xdefine __src1 %6
+%xdefine __src2 %7
+%ifnum regnumof%6
+%ifnum regnumof%7
+%if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 
&& sizeof%7 <= 32
+%xdefine __src1 %7
+%xdefine __src2 %6
+%endif
+%endif
+%endif
+__instr %6, __src1, __src2
+%else
+__instr %6, %7
+%endif
 %else
 __instr %6
 %endif
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 0/7] Import some x264asm patches from x264

2019-08-05 Thread James Darnley
Here are a few easy-to-import patches from x264.  These are all after x264
commit 4a158b00 "x86inc: Correctly set mmreg variables" which FFmpeg already
has (commit eb5f063e7c).

It does not include the following commits:
* 82721eae "x86inc: Add x86-32 PIC support macros"
* 101bd27d "x86inc: Support N_PEXT bit on Mach-O"

They would not apply cleanly because of existing differences between x264 and
FFmpeg.  The PIC one has a change to configure which would need remaking.

Henrik Gramner (7):
  x86inc: Fix VEX -> EVEX instruction conversion
  x86inc: Optimize VEX instruction encoding
  x86inc: Improve SAVE/LOAD_MM_PERMUTATION macros
  x86inc: Turn 'movsxd' into 'movifnidn' on x86-32
  x86inc: Make 'non-adjacent' default in the TAIL_CALL macro
  x86inc: Improve warnings for use of unsupported instructions
  x86inc: Add support for GFNI instructions

 libavutil/x86/x86inc.asm | 219 ---
 1 file changed, 161 insertions(+), 58 deletions(-)

-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 6/7] x86inc: Improve warnings for use of unsupported instructions

2019-08-05 Thread James Darnley
From: Henrik Gramner 

Warn when the following are used without the appropriate cpuflag:
 * YMM and ZMM registers
 * 'pextrw' with a memory operand
 * GPR instruction set extensions
---
 libavutil/x86/x86inc.asm | 120 +++
 1 file changed, 83 insertions(+), 37 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index af35fe1e4d..d1b4c982fc 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1216,8 +1216,22 @@ INIT_XMM
 %ifdef cpuname
 %if notcpuflag(%2)
 %error use of ``%1'' %2 instruction in cpuname function: 
current_function
-%elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && 
__sizeofreg > 8
+%elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
 %error use of ``%1'' sse2 instruction in cpuname function: 
current_function
+%elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
+%error use of ``%1'' avx2 instruction in cpuname function: 
current_function
+%elif __sizeofreg == 16 && notcpuflag(sse)
+%error use of ``%1'' sse instruction in cpuname function: 
current_function
+%elif __sizeofreg == 32 && notcpuflag(avx)
+%error use of ``%1'' avx instruction in cpuname function: 
current_function
+%elif __sizeofreg == 64 && notcpuflag(avx512)
+%error use of ``%1'' avx512 instruction in cpuname function: 
current_function
+%elifidn %1, pextrw ; special case because the base instruction is 
mmx2,
+%ifnid %6   ; but sse4 is required for memory operands
+%if notcpuflag(sse4)
+%error use of ``%1'' sse4 instruction in cpuname 
function: current_function
+%endif
+%endif
 %endif
 %endif
 %endif
@@ -1379,38 +1393,38 @@ AVX_INSTR cmpunordpd, sse2, 1, 0, 1
 AVX_INSTR cmpunordps, sse, 1, 0, 1
 AVX_INSTR cmpunordsd, sse2, 1, 0, 0
 AVX_INSTR cmpunordss, sse, 1, 0, 0
-AVX_INSTR comisd, sse2
-AVX_INSTR comiss, sse
-AVX_INSTR cvtdq2pd, sse2
-AVX_INSTR cvtdq2ps, sse2
-AVX_INSTR cvtpd2dq, sse2
-AVX_INSTR cvtpd2ps, sse2
-AVX_INSTR cvtps2dq, sse2
-AVX_INSTR cvtps2pd, sse2
-AVX_INSTR cvtsd2si, sse2
+AVX_INSTR comisd, sse2, 1
+AVX_INSTR comiss, sse, 1
+AVX_INSTR cvtdq2pd, sse2, 1
+AVX_INSTR cvtdq2ps, sse2, 1
+AVX_INSTR cvtpd2dq, sse2, 1
+AVX_INSTR cvtpd2ps, sse2, 1
+AVX_INSTR cvtps2dq, sse2, 1
+AVX_INSTR cvtps2pd, sse2, 1
+AVX_INSTR cvtsd2si, sse2, 1
 AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
 AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
 AVX_INSTR cvtsi2ss, sse, 1, 0, 0
 AVX_INSTR cvtss2sd, sse2, 1, 0, 0
-AVX_INSTR cvtss2si, sse
-AVX_INSTR cvttpd2dq, sse2
-AVX_INSTR cvttps2dq, sse2
-AVX_INSTR cvttsd2si, sse2
-AVX_INSTR cvttss2si, sse
+AVX_INSTR cvtss2si, sse, 1
+AVX_INSTR cvttpd2dq, sse2, 1
+AVX_INSTR cvttps2dq, sse2, 1
+AVX_INSTR cvttsd2si, sse2, 1
+AVX_INSTR cvttss2si, sse, 1
 AVX_INSTR divpd, sse2, 1, 0, 0
 AVX_INSTR divps, sse, 1, 0, 0
 AVX_INSTR divsd, sse2, 1, 0, 0
 AVX_INSTR divss, sse, 1, 0, 0
 AVX_INSTR dppd, sse4, 1, 1, 0
 AVX_INSTR dpps, sse4, 1, 1, 0
-AVX_INSTR extractps, sse4
+AVX_INSTR extractps, sse4, 1
 AVX_INSTR haddpd, sse3, 1, 0, 0
 AVX_INSTR haddps, sse3, 1, 0, 0
 AVX_INSTR hsubpd, sse3, 1, 0, 0
 AVX_INSTR hsubps, sse3, 1, 0, 0
 AVX_INSTR insertps, sse4, 1, 1, 0
 AVX_INSTR lddqu, sse3
-AVX_INSTR ldmxcsr, sse
+AVX_INSTR ldmxcsr, sse, 1
 AVX_INSTR maskmovdqu, sse2
 AVX_INSTR maxpd, sse2, 1, 0, 1
 AVX_INSTR maxps, sse, 1, 0, 1
@@ -1420,10 +1434,10 @@ AVX_INSTR minpd, sse2, 1, 0, 1
 AVX_INSTR minps, sse, 1, 0, 1
 AVX_INSTR minsd, sse2, 1, 0, 0
 AVX_INSTR minss, sse, 1, 0, 0
-AVX_INSTR movapd, sse2
-AVX_INSTR movaps, sse
+AVX_INSTR movapd, sse2, 1
+AVX_INSTR movaps, sse, 1
 AVX_INSTR movd, mmx
-AVX_INSTR movddup, sse3
+AVX_INSTR movddup, sse3, 1
 AVX_INSTR movdqa, sse2
 AVX_INSTR movdqu, sse2
 AVX_INSTR movhlps, sse, 1, 0, 0
@@ -1432,19 +1446,19 @@ AVX_INSTR movhps, sse, 1, 0, 0
 AVX_INSTR movlhps, sse, 1, 0, 0
 AVX_INSTR movlpd, sse2, 1, 0, 0
 AVX_INSTR movlps, sse, 1, 0, 0
-AVX_INSTR movmskpd, sse2
-AVX_INSTR movmskps, sse
+AVX_INSTR movmskpd, sse2, 1
+AVX_INSTR movmskps, sse, 1
 AVX_INSTR movntdq, sse2
 AVX_INSTR movntdqa, sse4
-AVX_INSTR movntpd, sse2
-AVX_INSTR movntps, sse
+AVX_INSTR movntpd, sse2, 1
+AVX_INSTR movntps, sse, 1
 AVX_INSTR movq, mmx
 AVX_INSTR movsd, sse2, 1, 0, 0
-AVX_INSTR movshdup, sse3
-AVX_INSTR movsldup, sse3
+AVX_INSTR movshdup, sse3, 1
+AVX_INSTR movsldup, sse3, 1
 AVX_INSTR movss, sse, 1, 0, 0
-AVX_INSTR movupd, sse2
-AVX_INSTR movups, sse
+AVX_INSTR movupd, sse2, 1
+AVX_INSTR movups, sse, 1
 AVX_INSTR mpsadbw, sse4, 0, 1, 0
 AVX_INSTR mulpd, sse2, 1, 0, 1
 AVX_INSTR mulps, sse, 1, 0, 1
@@ -1577,27 +1591,27 @@ AVX_INSTR punpcklwd, mmx, 0, 0, 0
 AVX_INSTR punpckldq, mmx, 0, 0, 0
 AVX_INSTR punpcklqdq, sse2, 0, 0, 0
 AVX_INSTR pxor, mmx, 0, 0, 1
-AVX_INSTR rcpps, sse
+AVX_INSTR rcpps, sse, 1
 AVX_INST

[FFmpeg-devel] [PATCH 5/7] x86inc: Make 'non-adjacent' default in the TAIL_CALL macro

2019-08-05 Thread James Darnley
From: Henrik Gramner 

---
 libavutil/x86/x86inc.asm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 04dbb6b785..af35fe1e4d 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -685,7 +685,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 
 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, 
jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
 
-%macro TAIL_CALL 2 ; callee, is_nonadjacent
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
 %if has_epilogue
 call %1
 RET
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 4/7] x86inc: Turn 'movsxd' into 'movifnidn' on x86-32

2019-08-05 Thread James Darnley
From: Henrik Gramner 

---
 libavutil/x86/x86inc.asm | 4 
 1 file changed, 4 insertions(+)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 10b7711637..04dbb6b785 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -293,6 +293,10 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 %endif
 %endmacro
 
+%if ARCH_X86_64 == 0
+%define movsxd movifnidn
+%endif
+
 %macro movsxdifnidn 2
 %ifnidn %1, %2
 movsxd %1, %2
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/7] x86inc: Fix VEX -> EVEX instruction conversion

2019-08-05 Thread James Darnley
From: Henrik Gramner 

There's an edge case that wasn't properly handled.
---
 libavutil/x86/x86inc.asm | 5 +
 1 file changed, 5 insertions(+)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 5044ee86f0..bc370a6186 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1662,6 +1662,11 @@ FMA4_INSTR fnmsub,   pd, ps, sd, ss
 %assign %%evex_required 1
 %endif
 %endif
+%ifnum regnumof%3
+%if regnumof%3 >= 16 || sizeof%3 > 32
+%assign %%evex_required 1
+%endif
+%endif
 %if %%evex_required
 %6 %%args
 %else
-- 
2.22.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Issues while encoding a ts file to m3u8

2019-08-02 Thread James Darnley
On 2019-08-02 15:55, Ramana Jajula wrote:
> Hi,
> 
> I am trying to encode my ts file m3u8 using my customised ffmpeg of version
> 4.1. I used below command to do encoding.
> 
> ffmpeg -re -threads 8 -i /videos/input.ts -vcodec libx264 -s 320x240 -b:v
> 512000 -maxrate 512000 -acodec libfdk_aac -b:a 32000 -ac 2 -ar 48000
> -force_key_frames 'expr:gte(t,n_forced*3)' -hls_flags single_file
> -hls_list_size 0 -hls_time 3 -fsize 400x222 -frames /frames/my_frames/
> -index /mpegindex/my_index.idx  -y /encoded/test/output.m3u8
> 
> My encoding was bad. The output printed to console is
>   libavutil  56. 22.100 / 56. 22.100
>   libavcodec 58. 35.100 / 58. 35.100
>   libavformat58. 20.100 / 58. 20.100
>   libavdevice58.  5.100 / 58.  5.100
>   libavfilter 7. 40.101 /  7. 40.101
>   libavresample   4.  0.  0 /  4.  0.  0
>   libswscale  5.  3.100 /  5.  3.100
>   libswresample   3.  3.100 /  3.  3.100
>   libpostproc55.  3.100 / 55.  3.100
> /videos/input.ts FPS 25.00 0
> Input #0, mpegts, from '/videos/.input.ts':
>   Duration: 00:04:05.97, start: 85837.091689, bitrate: 1769 kb/s
>   Program 1
> Stream #0:0[0x105]: Video: h264 (Main) ([27][0][0][0] / 0x001B),
> yuv420p(top first), 1920x1080 [SAR 1:1 DAR 16:9], 25 fps, 25 tbr, 90k tbn,
> 50 tbc
> Stream #0:1[0x106]: Audio: ac3 ([129][0][0][0] / 0x0081), 48000 Hz,
> stereo, fltp, 128 kb/s
> [libx264 @ 0x564a2f7cc480] VBV maxrate specified, but no bufsize, ignored
> [libx264 @ 0x564a2f7cc480] using SAR=4/3
> [libx264 @ 0x564a2f7cc480] using cpu capabilities: MMX2 SSE2Fast SSSE3
> SSE4.2
> [libx264 @ 0x564a2f7cc480] profile High, level 2.0
> [libx264 @ 0x564a2f7cc480] 264 - core 148 r2748 97eaef2 - H.264/MPEG-4 AVC
> codec - Copyleft 2003-2016 - http://www.videolan.org/x264.html - options:
> cabac=1 ref=3 debloc
> k=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1
> me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11
> fast_pskip=1 chroma_qp_offset
> =-2 threads=3 lookahead_threads=1 sliced_threads=0 nr=0 decimate=1
> interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2
> b_adapt=1 b_bias=0 direct=1 wei
> ghtb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40
> intra_refresh=0 rc_lookahead=40 rc=abr mbtree=1 bitrate=512 ratetol=1.0
> qcomp=0.60 qpmin=0 qpmax=69 qpst
> ep=4 ip_ratio=1.40 aq=1:1.00
> [hls @ 0x564a2f7ccc40] Using AVStream.codec to pass codec parameters to
> muxers is deprecated, use AVStream.codecpar instead.
> Last message repeated 1 times
> [hls @ 0x564a2f7ccc40] Opening '/encodedt/input.ts' for writing
> Output #0, hls, to '/encoded/output.m3u8':
>   Metadata:
> encoder : Lavf58.20.100
> Stream #0:0: Video: h264 (libx264), yuv420p, 320x240 [SAR 4:3 DAR
> 16:9], q=-1--1, 512 kb/s, 25 fps, 90k tbn, 25 tbc
> Metadata:
>   encoder : Lavc58.35.100 libx264
> Side data:
>   cpb: bitrate max/min/avg: 512000/0/512000 buffer size: 0 vbv_delay: -1
> Stream #0:1: Audio: aac (libfdk_aac), 48000 Hz, stereo, s16, 32 kb/s
> Metadata:
>   encoder : Lavc58.35.100 libfdk_aac
> Stream mapping:
>   Stream #0:0 -> #0:0 (h264 (native) -> h264 (libx264))
>   Stream #0:1 -> #0:1 (ac3 (native) -> aac (libfdk_aac))
> Press [q] to stop, [?] for help
> frame=   34 fps=0.1 q=0.0 size=N/A time=00:05:02.11 bitrate=N/A dup=29
> drop=0 speed=0.567x
> [hls @ 0x564a2f7ccc40] Packets poorly interleaved, failed to avoid negative
> timestamp -3360 in stream 0.0.567x
> Try -max_interleave_delta 0 as a possible workaround.
> 
> Since the encoding speed is too slow I had to cancel the encoding process.
> I killed it,
> 
> What is the reason for this slow encoding process?
> 
> PS: My input file is of 1 hour duration.
> 

1 - Wrong mailing list.  This should probably be on ffmpeg-user.

2 - What configure options did you use for ffmpeg?  Why did you remove them?

3 - What "modifications" have you made?

4 - What CPU do you have?  One without AVX is either old, or limited
(like Celerons and Pentiums)

5 - Why are you using an x264 from 2016?  Have you "modified" it too?

Next time just press 'q' to end encoding so we can see some stats.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/5] lavu/pixfmt: add Y210/AYUV/Y410 pixel formats

2019-06-28 Thread James Darnley
On 2019-06-28 03:03, Hendrik Leppkes wrote:
> On Fri, Jun 28, 2019 at 1:26 AM James Darnley  wrote:
>>
>> On 2019-06-28 04:26, Linjie Fu wrote:
>>> Previously, media driver provided planar format(like 420 8 bit), but
>>> for HEVC Range Extension (422/444 8/10 bit), the decoded image is
>>> produced in packed format.
>>>
>>> Y210/AYUV/Y410 are packed formats which are needed in HEVC Rext decoding
>>> for both VAAPI and QSV:
>>> - Y210: 422 10 BIT
>>> - AYUV: 444  8 BIT
>>> - Y410: 444 10 BIT
>>>
>>
>>
>> Why am I suspicious that at least one of those is a re-ordered v210?  I
>> seem to recall that we rejected adding v210 to this list.  Either they
>> don't belong in this list or they don't belong because libavcodec has a
>> proper decoder (at least for v210).
>>
> 
> They are not quite as bad as v210 (and not related).
> 
> Microsoft documents them here as the recommended formats to be used on 
> Windows:
> https://docs.microsoft.com/en-us/windows/desktop/medfound/recommended-8-bit-yuv-formats-for-video-rendering#444-formats-32-bits-per-pixel
> https://docs.microsoft.com/en-us/windows/desktop/medfound/10-bit-and-16-bit-yuv-video-formats
> 
> - Hendrik

Okay y410 and y210 use the highest 10 bits in each 16-bit word.  I
apologise for jumping to that conclusion.




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/5] lavu/pixfmt: add Y210/AYUV/Y410 pixel formats

2019-06-27 Thread James Darnley
On 2019-06-28 04:26, Linjie Fu wrote:
> Previously, media driver provided planar format(like 420 8 bit), but
> for HEVC Range Extension (422/444 8/10 bit), the decoded image is
> produced in packed format.
> 
> Y210/AYUV/Y410 are packed formats which are needed in HEVC Rext decoding
> for both VAAPI and QSV:
> - Y210: 422 10 BIT
> - AYUV: 444  8 BIT
> - Y410: 444 10 BIT
> 


Why am I suspicious that at least one of those is a re-ordered v210?  I
seem to recall that we rejected adding v210 to this list.  Either they
don't belong in this list or they don't belong because libavcodec has a
proper decoder (at least for v210).

This might be the thread I was remembering but March seems too recent
> https://ffmpeg.org/pipermail/ffmpeg-devel/2019-March/241549.html

No real conclusion was reached there.

Do bit-packed formats belong in an AVPixelFormat?




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avcodec: Add librav1e encoder

2019-05-28 Thread James Darnley
On 2019-05-28 22:00, Derek Buitenhuis wrote:
> On 28/05/2019 20:58, James Almer wrote:
>> I think x26* and vpx/aom call it crf? It's not in option_tables.h in any
>> case.
> 
> They do not. This is a constant quantizer mode, not constant rate factor.

IIRC either qp or cqp




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/7] libavfilter/vf_overlay.c: change the commands style for the macro defined function

2019-05-24 Thread James Darnley
On 2019-05-24 12:06, James Darnley wrote:
> On 2019-05-24 11:36, lance.lmw...@gmail.com wrote:
>> From: Limin Wang 
>>
>> ...
> 
> Why?

I see why: so you don't screw up the macros you create later.




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/7] libavfilter/vf_overlay.c: change the commands style for the macro defined function

2019-05-24 Thread James Darnley
On 2019-05-24 11:36, lance.lmw...@gmail.com wrote:
> From: Limin Wang 
> 
> ...

Why?  And these are "comments" not "commands".




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avcodec/v210dec: Fix alignment check for AVX2

2019-05-18 Thread James Darnley
On 2019-05-18 12:15, Michael Niedermayer wrote:
> On Sat, May 18, 2019 at 12:02:55PM +0200, James Darnley wrote:
>> I object to the commit message though because it isn't a "null pointer
>> dereference" but if that is the error as reported by the tool then keep
>> it as is.
> 
> yes, the tool(s) say things like "Null-dereference READ", "SEGV on unknown 
> address 0x"
> 

Hm.  It is almost certainly an aligned move on an unaligned address.

I don't care that much about the rest of the commit message; the subject
is correct which is good enough.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] avcodec/v210dec: Fix alignment check for AVX2

2019-05-18 Thread James Darnley
On 2019-05-18 09:39, Michael Niedermayer wrote:
> Fixes: "null pointer dereference"
> Fixes: 
> 14551/clusterfuzz-testcase-minimized-ffmpeg_AV_CODEC_ID_V210_fuzzer-5088609952071680
> 
> Found-by: continuous fuzzing process 
> https://github.com/google/oss-fuzz/tree/master/projects/ffmpeg
> Signed-off-by: Michael Niedermayer 
> ---
>  libavcodec/v210dec.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> index bc1e1d34ff..5a33d8c089 100644
> --- a/libavcodec/v210dec.c
> +++ b/libavcodec/v210dec.c
> @@ -104,7 +104,7 @@ static int decode_frame(AVCodecContext *avctx, void 
> *data, int *got_frame,
>  && avpkt->size - 64 >= stride * avctx->height)
>  psrc += 64;
>  
> -aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf);
> +aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f);
>  if (aligned_input != s->aligned_input) {
>  s->aligned_input = aligned_input;
>  ff_v210dec_init(s);
> 

Ah yes, that'll be needed after the recent addition of avx2.  LGTM and
sorry.

I object to the commit message though because it isn't a "null pointer
dereference" but if that is the error as reported by the tool then keep
it as is.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 0/3] v210dec checkasm test and avx2 function

2019-04-18 Thread James Darnley
On 2019-04-10 14:47, James Darnley wrote:
> I am resending this my patches because I am not sure if I sent this version in
> the past.  I split my changes into two patches because they do separate 
> things.
> 
> I also changed some tabs to spaces in Mike's AVX2 patch.
> 
> James Darnley (2):
>   avcodec/v210dec: move DSP function setting into dedicated function
>   checkasm: add test for v210dec
> 
> Michael Stoner (1):
>   libavcodec Adding ff_v210_planar_unpack AVX2
> 
>  libavcodec/v210dec.c   | 26 +
>  libavcodec/v210dec.h   |  1 +
>  libavcodec/x86/v210-init.c |  8 
>  libavcodec/x86/v210.asm| 72 +++
>  tests/checkasm/Makefile|  1 +
>  tests/checkasm/checkasm.c  |  3 ++
>  tests/checkasm/checkasm.h  |  1 +
>  tests/checkasm/v210dec.c   | 77 ++
>  8 files changed, 166 insertions(+), 23 deletions(-)
>  create mode 100644 tests/checkasm/v210dec.c
> 

Any objections to this patchset?  I have corrected the address of
Michael's patch to the address I Cced.  I hope that is the right one.


___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] libavcodec Adding ff_v210_planar_unpack AVX2

2019-04-10 Thread James Darnley
On 2019-04-10 14:47, James Darnley wrote:
> From: Michael Stoner 

Screw you, mailing list or git, whichever one of you managed to screw up
the author's address.  I will correct that, if I can.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/3] avcodec/v210dec: move DSP function setting into dedicated function

2019-04-10 Thread James Darnley
Prepare for checkasm test.
---
 libavcodec/v210dec.c | 16 ++--
 libavcodec/v210dec.h |  1 +
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index ddc5dbe8be..fd8a6b0d78 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -50,6 +50,13 @@ static void v210_planar_unpack_c(const uint32_t *src, 
uint16_t *y, uint16_t *u,
 }
 }
 
+av_cold void ff_v210dec_init(V210DecContext *s)
+{
+s->unpack_frame = v210_planar_unpack_c;
+if (ARCH_X86)
+ff_v210_x86_init(s);
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
 V210DecContext *s = avctx->priv_data;
@@ -57,10 +64,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
 avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
 avctx->bits_per_raw_sample = 10;
 
-s->unpack_frame= v210_planar_unpack_c;
-
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+s->aligned_input = 0;
+ff_v210dec_init(s);
 
 return 0;
 }
@@ -102,8 +107,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, 
int *got_frame,
 aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf);
 if (aligned_input != s->aligned_input) {
 s->aligned_input = aligned_input;
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+ff_v210dec_init(s);
 }
 
 if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
index 533afc435c..cfdb29da09 100644
--- a/libavcodec/v210dec.h
+++ b/libavcodec/v210dec.h
@@ -31,6 +31,7 @@ typedef struct {
 void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
 } V210DecContext;
 
+void ff_v210dec_init(V210DecContext *s);
 void ff_v210_x86_init(V210DecContext *s);
 
 #endif /* AVCODEC_V210DEC_H */
-- 
2.21.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 3/3] libavcodec Adding ff_v210_planar_unpack AVX2

2019-04-10 Thread James Darnley
From: Michael Stoner 

Replaced VSHUFPS with VPBLENDD to relieve port 5 bottleneck
AVX2 is 1.4x faster than AVX
---

Mike, is this still the patch you want applied?  I had to make a small
amendment to it because you had some tabs as indentation.

 libavcodec/v210dec.c   | 10 +-
 libavcodec/x86/v210-init.c |  8 +
 libavcodec/x86/v210.asm| 72 +-
 3 files changed, 73 insertions(+), 17 deletions(-)

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index fd8a6b0d78..bc1e1d34ff 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -123,7 +123,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, 
int *got_frame,
 const uint32_t *src = (const uint32_t*)psrc;
 uint32_t val;
 
-w = (avctx->width / 6) * 6;
+w = (avctx->width / 12) * 12;
 s->unpack_frame(src, y, u, v, w);
 
 y += w;
@@ -131,6 +131,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, 
int *got_frame,
 v += w >> 1;
 src += (w << 1) / 3;
 
+if (w < avctx->width - 5) {
+READ_PIXELS(u, y, v);
+READ_PIXELS(y, u, y);
+READ_PIXELS(v, y, u);
+READ_PIXELS(y, v, y);
+w += 6;
+}
+
 if (w < avctx->width - 1) {
 READ_PIXELS(u, y, v);
 
diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
index d64dbca1a8..cb9a6cbd6a 100644
--- a/libavcodec/x86/v210-init.c
+++ b/libavcodec/x86/v210-init.c
@@ -21,9 +21,11 @@
 
 extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, 
uint16_t *y, uint16_t *u, uint16_t *v, int width);
 extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t 
*y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx2(const uint32_t *src, uint16_t 
*y, uint16_t *u, uint16_t *v, int width);
 
 extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t 
*y, uint16_t *u, uint16_t *v, int width);
 extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t 
*y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t 
*y, uint16_t *u, uint16_t *v, int width);
 
 av_cold void ff_v210_x86_init(V210DecContext *s)
 {
@@ -36,6 +38,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
 
 if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
 s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+
+if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
+s->unpack_frame = ff_v210_planar_unpack_aligned_avx2;
 }
 else {
 if (cpu_flags & AV_CPU_FLAG_SSSE3)
@@ -43,6 +48,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
 
 if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
 s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+
+if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
+s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2;
 }
 #endif
 }
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
index c24c765e5b..706712313d 100644
--- a/libavcodec/x86/v210.asm
+++ b/libavcodec/x86/v210.asm
@@ -22,9 +22,14 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
+
+; for AVX2 version only
+v210_luma_permute: dd 0,1,2,4,5,6,7,7  ; 32-byte alignment required
+v210_chroma_shuf2: db 0,1,2,3,4,5,8,9,10,11,12,13,-1,-1,-1,-1
+v210_luma_shuf_avx2: db 0,1,4,5,6,7,8,9,12,13,14,15,-1,-1,-1,-1
+v210_chroma_shuf_avx2: db 0,1,4,5,10,11,-1,-1,2,3,8,9,12,13,-1,-1
 
-v210_mask: times 4 dd 0x3ff
 v210_mult: dw 64,4,64,4,64,4,64,4
 v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
 v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
@@ -34,40 +39,65 @@ SECTION .text
 %macro v210_planar_unpack 1
 
 ; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t 
*v, int width)
-cglobal v210_planar_unpack_%1, 5, 5, 7
+cglobal v210_planar_unpack_%1, 5, 5, 8
 movsxdifnidn r4, r4d
 lear1, [r1+2*r4]
 addr2, r4
 addr3, r4
 negr4
 
-mova   m3, [v210_mult]
-mova   m4, [v210_mask]
-mova   m5, [v210_luma_shuf]
-mova   m6, [v210_chroma_shuf]
+VBROADCASTI128   m3, [v210_mult]
+VBROADCASTI128   m5, [v210_chroma_shuf]
+
+%if cpuflag(avx2)
+VBROADCASTI128   m4, [v210_luma_shuf_avx2]
+VBROADCASTI128   m5, [v210_chroma_shuf_avx2]
+mova m6, [v210_luma_permute]
+VBROADCASTI128   m7, [v210_chroma_shuf2]
+%else
+VBROADCASTI128   m4, [v210_luma_shuf]
+VBROADCASTI128   m5, [v210_chroma_shuf]
+%endif
+
 .loop:
 %ifidn %1, unaligned
-movu   m0, [r0]
+movu   m0, [r0]; yB v5 yA  u5 y9 v4  y8 u4 y7  v3 y6 u3  y5 v2 y4  u2 
y3 v1  y2 u1 y1  v0 y0 u0
 %else
 mova   m0, [r0]
 %endif
 
 pmullw m1, m0, m3
-psrld  m0, 10
-psrlw  m1, 6  ; u0 v0 y1 y2 v1 u2 y4 y5
-pand   m0, m4 ; y0 __ u1 __ y3 __ v2

[FFmpeg-devel] [PATCH 0/3] v210dec checkasm test and avx2 function

2019-04-10 Thread James Darnley
I am resending my patches because I am not sure if I sent this version in
the past.  I split my changes into two patches because they do separate things.

I also changed some tabs to spaces in Mike's AVX2 patch.

James Darnley (2):
  avcodec/v210dec: move DSP function setting into dedicated function
  checkasm: add test for v210dec

Michael Stoner (1):
  libavcodec Adding ff_v210_planar_unpack AVX2

 libavcodec/v210dec.c   | 26 +
 libavcodec/v210dec.h   |  1 +
 libavcodec/x86/v210-init.c |  8 
 libavcodec/x86/v210.asm| 72 +++
 tests/checkasm/Makefile|  1 +
 tests/checkasm/checkasm.c  |  3 ++
 tests/checkasm/checkasm.h  |  1 +
 tests/checkasm/v210dec.c   | 77 ++
 8 files changed, 166 insertions(+), 23 deletions(-)
 create mode 100644 tests/checkasm/v210dec.c

-- 
2.21.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/3] checkasm: add test for v210dec

2019-04-10 Thread James Darnley
---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/v210dec.c  | 77 +++
 4 files changed, 82 insertions(+)
 create mode 100644 tests/checkasm/v210dec.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 8cc0bff2d1..886ae33167 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -25,6 +25,7 @@ AVCODECOBJS-$(CONFIG_JPEG2000_DECODER)  += jpeg2000dsp.o
 AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)   += pixblockdsp.o
 AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_idct.o 
hevc_sao.o
 AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
+AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 9eec41e3c4..bf51e00eab 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -136,6 +136,9 @@ static const struct {
 #if CONFIG_UTVIDEO_DECODER
 { "utvideodsp", checkasm_check_utvideodsp },
 #endif
+#if CONFIG_V210_DECODER
+{ "v210dec", checkasm_check_v210dec },
+#endif
 #if CONFIG_V210_ENCODER
 { "v210enc", checkasm_check_v210enc },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 9e8e879fd3..9b8d2f5419 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -69,6 +69,7 @@ void checkasm_check_sbrdsp(void);
 void checkasm_check_synth_filter(void);
 void checkasm_check_sw_rgb(void);
 void checkasm_check_utvideodsp(void);
+void checkasm_check_v210dec(void);
 void checkasm_check_v210enc(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c
new file mode 100644
index 00..7dd50a8271
--- /dev/null
+++ b/tests/checkasm/v210dec.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 James Darnley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/v210dec.h"
+
+static uint32_t get_v210(void)
+{
+uint32_t t0 = rnd() & 0x3ff,
+ t1 = rnd() & 0x3ff,
+ t2 = rnd() & 0x3ff;
+uint32_t value =  t0
+   | (t1 << 10)
+   | (t2 << 20);
+return value;
+}
+
+#define NUM_SAMPLES 2048
+
+static void randomize_buffers(uint32_t *src0, uint32_t *src1, int len)
+{
+for (int i = 0; i < len; i++) {
+uint32_t value = get_v210();
+src0[i] = value;
+src1[i] = value;
+}
+}
+
+void checkasm_check_v210dec(void)
+{
+V210DecContext h;
+
+h.aligned_input = 0;
+ff_v210dec_init(&h);
+
+if (check_func(h.unpack_frame, "v210_unpack")) {
+uint32_t src0[NUM_SAMPLES/3];
+uint32_t src1[NUM_SAMPLES/3];
+uint16_t y0[NUM_SAMPLES/2];
+uint16_t y1[NUM_SAMPLES/2];
+uint16_t u0[NUM_SAMPLES/4];
+uint16_t u1[NUM_SAMPLES/4];
+uint16_t v0[NUM_SAMPLES/4];
+uint16_t v1[NUM_SAMPLES/4];
+declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
+const int pixels = NUM_SAMPLES / 2 / 6 * 6;
+
+randomize_buffers(src0, src1, NUM_SAMPLES/3);
+call_ref(src0, y0, u0, v0, pixels);
+call_new(src1, y1, u1, v1, pixels);
+if (memcmp(src0, src1, NUM_SAMPLES/3 * sizeof src0[0])
+|| memcmp(y0, y1, pixels * sizeof y0[0])
+|| memcmp(u0, u1, pixels/2 * sizeof u0[0])
+|| memcmp(v0, v1, pixels/2 * sizeof v0[0]))
+fail();
+bench_new(src1, y1, u1, v1, pixels);
+}
+report("v210_unpack");
+}
-- 
2.21.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] libavcodec Adding ff_v210_planar_unpack AVX2

2019-03-27 Thread James Darnley
On 2019-03-26 21:22, Mike Stoner via ffmpeg-devel wrote:
> Hello,
> I’ve accounted for all feedback on this so far, I’m wondering if it is ready 
> to be pushed upstream?
> 
> Here are my results from ‘checkasm’ (lower is better):
> 
> v210_unpack_c: 1636
> v210_unpack_ssse3: 611
> v210_unpack_avx: 601
> v210_unpack_avx2: 423
> 
> I ran it 5 times and averaged the middle 3 results for each CPU target 
> (ignoring the highest and lowest time).
> 
> https://patchwork.ffmpeg.org/patch/12325/
> 
> 
> Thanks… -Mike

Sorry that I keep forgetting about this.  I will try to make some time
tomorrow to give this another look over.

I'm not sure what order this and my checkasm patch should be applied in,
which I also forgot about.

Did anyone else make comments on either patch?




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] avcodec/v210dec: move DSP function setting into dedicated function

2019-03-06 Thread James Darnley
Prepare for checkasm test.
---
 libavcodec/v210dec.c | 16 ++--
 libavcodec/v210dec.h |  1 +
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index ddc5dbe8be..fd8a6b0d78 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -50,6 +50,13 @@ static void v210_planar_unpack_c(const uint32_t *src, 
uint16_t *y, uint16_t *u,
 }
 }
 
+av_cold void ff_v210dec_init(V210DecContext *s)
+{
+s->unpack_frame = v210_planar_unpack_c;
+if (ARCH_X86)
+ff_v210_x86_init(s);
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
 V210DecContext *s = avctx->priv_data;
@@ -57,10 +64,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
 avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
 avctx->bits_per_raw_sample = 10;
 
-s->unpack_frame= v210_planar_unpack_c;
-
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+s->aligned_input = 0;
+ff_v210dec_init(s);
 
 return 0;
 }
@@ -102,8 +107,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, 
int *got_frame,
 aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf);
 if (aligned_input != s->aligned_input) {
 s->aligned_input = aligned_input;
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+ff_v210dec_init(s);
 }
 
 if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
index 533afc435c..cfdb29da09 100644
--- a/libavcodec/v210dec.h
+++ b/libavcodec/v210dec.h
@@ -31,6 +31,7 @@ typedef struct {
 void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
 } V210DecContext;
 
+void ff_v210dec_init(V210DecContext *s);
 void ff_v210_x86_init(V210DecContext *s);
 
 #endif /* AVCODEC_V210DEC_H */
-- 
2.20.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 2/2] checkasm: add test for v210dec

2019-03-06 Thread James Darnley
---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/v210dec.c  | 77 +++
 4 files changed, 82 insertions(+)
 create mode 100644 tests/checkasm/v210dec.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 47b7b06d28..70abc1a407 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -25,6 +25,7 @@ AVCODECOBJS-$(CONFIG_JPEG2000_DECODER)  += jpeg2000dsp.o
 AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)   += pixblockdsp.o
 AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_idct.o 
hevc_sao.o
 AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
+AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 9eec41e3c4..bf51e00eab 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -136,6 +136,9 @@ static const struct {
 #if CONFIG_UTVIDEO_DECODER
 { "utvideodsp", checkasm_check_utvideodsp },
 #endif
+#if CONFIG_V210_DECODER
+{ "v210dec", checkasm_check_v210dec },
+#endif
 #if CONFIG_V210_ENCODER
 { "v210enc", checkasm_check_v210enc },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 9e8e879fd3..9b8d2f5419 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -69,6 +69,7 @@ void checkasm_check_sbrdsp(void);
 void checkasm_check_synth_filter(void);
 void checkasm_check_sw_rgb(void);
 void checkasm_check_utvideodsp(void);
+void checkasm_check_v210dec(void);
 void checkasm_check_v210enc(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c
new file mode 100644
index 00..7dd50a8271
--- /dev/null
+++ b/tests/checkasm/v210dec.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 James Darnley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/v210dec.h"
+
+static uint32_t get_v210(void)
+{
+uint32_t t0 = rnd() & 0x3ff,
+ t1 = rnd() & 0x3ff,
+ t2 = rnd() & 0x3ff;
+uint32_t value =  t0
+   | (t1 << 10)
+   | (t2 << 20);
+return value;
+}
+
+#define NUM_SAMPLES 2048
+
+static void randomize_buffers(uint32_t *src0, uint32_t *src1, int len)
+{
+for (int i = 0; i < len; i++) {
+uint32_t value = get_v210();
+src0[i] = value;
+src1[i] = value;
+}
+}
+
+void checkasm_check_v210dec(void)
+{
+V210DecContext h;
+
+h.aligned_input = 0;
+ff_v210dec_init(&h);
+
+if (check_func(h.unpack_frame, "v210_unpack")) {
+uint32_t src0[NUM_SAMPLES/3];
+uint32_t src1[NUM_SAMPLES/3];
+uint16_t y0[NUM_SAMPLES/2];
+uint16_t y1[NUM_SAMPLES/2];
+uint16_t u0[NUM_SAMPLES/4];
+uint16_t u1[NUM_SAMPLES/4];
+uint16_t v0[NUM_SAMPLES/4];
+uint16_t v1[NUM_SAMPLES/4];
+declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
+const int pixels = NUM_SAMPLES / 2 / 6 * 6;
+
+randomize_buffers(src0, src1, NUM_SAMPLES/3);
+call_ref(src0, y0, u0, v0, pixels);
+call_new(src1, y1, u1, v1, pixels);
+if (memcmp(src0, src1, NUM_SAMPLES/3 * sizeof src0[0])
+|| memcmp(y0, y1, pixels * sizeof y0[0])
+|| memcmp(u0, u1, pixels/2 * sizeof u0[0])
+|| memcmp(v0, v1, pixels/2 * sizeof v0[0]))
+fail();
+bench_new(src1, y1, u1, v1, pixels);
+}
+report("v210_unpack");
+}
-- 
2.20.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avcodec/v210dec: move DSP function setting into dedicated function

2019-03-06 Thread James Darnley
Prepare for checkasm test.
---
 libavcodec/v210dec.c | 16 ++--
 libavcodec/v210dec.h |  1 +
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index ddc5dbe8be..6db662538e 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -50,6 +50,14 @@ static void v210_planar_unpack_c(const uint32_t *src, 
uint16_t *y, uint16_t *u,
 }
 }
 
+av_cold void ff_v210dec_init(V210DecContext *s)
+{
+s->unpack_frame = v210_planar_unpack_c;
+s->aligned_input = 0;
+if (ARCH_X86)
+ff_v210_x86_init(s);
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
 V210DecContext *s = avctx->priv_data;
@@ -57,10 +65,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
 avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
 avctx->bits_per_raw_sample = 10;
 
-s->unpack_frame= v210_planar_unpack_c;
-
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+ff_v210dec_init(s);
 
 return 0;
 }
@@ -102,8 +107,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, 
int *got_frame,
 aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf);
 if (aligned_input != s->aligned_input) {
 s->aligned_input = aligned_input;
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+ff_v210dec_init(s);
 }
 
 if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
index 533afc435c..cfdb29da09 100644
--- a/libavcodec/v210dec.h
+++ b/libavcodec/v210dec.h
@@ -31,6 +31,7 @@ typedef struct {
 void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
 } V210DecContext;
 
+void ff_v210dec_init(V210DecContext *s);
 void ff_v210_x86_init(V210DecContext *s);
 
 #endif /* AVCODEC_V210DEC_H */
-- 
2.20.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] checkasm: add test for v210dec

2019-03-06 Thread James Darnley
On 2019-03-06 20:31, James Darnley wrote:
> ...

Wrong patch and wrong reference.  Please ignore this.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] checkasm: add test for v210dec

2019-03-06 Thread James Darnley
---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/v210dec.c  | 76 +++
 4 files changed, 81 insertions(+)
 create mode 100644 tests/checkasm/v210dec.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 47b7b06d28..70abc1a407 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -25,6 +25,7 @@ AVCODECOBJS-$(CONFIG_JPEG2000_DECODER)  += jpeg2000dsp.o
 AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)   += pixblockdsp.o
 AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_idct.o 
hevc_sao.o
 AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
+AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 9eec41e3c4..bf51e00eab 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -136,6 +136,9 @@ static const struct {
 #if CONFIG_UTVIDEO_DECODER
 { "utvideodsp", checkasm_check_utvideodsp },
 #endif
+#if CONFIG_V210_DECODER
+{ "v210dec", checkasm_check_v210dec },
+#endif
 #if CONFIG_V210_ENCODER
 { "v210enc", checkasm_check_v210enc },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 9e8e879fd3..9b8d2f5419 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -69,6 +69,7 @@ void checkasm_check_sbrdsp(void);
 void checkasm_check_synth_filter(void);
 void checkasm_check_sw_rgb(void);
 void checkasm_check_utvideodsp(void);
+void checkasm_check_v210dec(void);
 void checkasm_check_v210enc(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c
new file mode 100644
index 00..7320ed5e37
--- /dev/null
+++ b/tests/checkasm/v210dec.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2019 James Darnley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/v210dec.h"
+
+static uint32_t get_v210(void)
+{
+uint32_t t0 = rnd() & 0x3ff,
+ t1 = rnd() & 0x3ff,
+ t2 = rnd() & 0x3ff;
+uint32_t value =  t0
+   | (t1 << 10)
+   | (t2 << 20);
+return value;
+}
+
+#define NUM_SAMPLES 2048
+
+static void randomize_buffers(uint32_t *src0, uint32_t *src1, int len)
+{
+for (int i = 0; i < len; i++) {
+uint32_t value = get_v210();
+src0[i] = value;
+src1[i] = value;
+}
+}
+
+void checkasm_check_v210dec(void)
+{
+V210DecContext h;
+
+ff_v210dec_init(&h);
+
+if (check_func(h.unpack_frame, "v210_unpack")) {
+uint32_t src0[NUM_SAMPLES/3];
+uint32_t src1[NUM_SAMPLES/3];
+uint16_t y0[NUM_SAMPLES/2];
+uint16_t y1[NUM_SAMPLES/2];
+uint16_t u0[NUM_SAMPLES/4];
+uint16_t u1[NUM_SAMPLES/4];
+uint16_t v0[NUM_SAMPLES/4];
+uint16_t v1[NUM_SAMPLES/4];
+declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
+const int pixels = NUM_SAMPLES / 2 / 6 * 6;
+
+randomize_buffers(src0, src1, NUM_SAMPLES/3);
+call_ref(src0, y0, u0, v0, pixels);
+call_new(src1, y1, u1, v1, pixels);
+if (memcmp(src0, src1, NUM_SAMPLES/3 * sizeof src0[0])
+|| memcmp(y0, y1, pixels * sizeof y0[0])
+|| memcmp(u0, u1, pixels/2 * sizeof u0[0])
+|| memcmp(v0, v1, pixels/2 * sizeof v0[0]))
+fail();
+bench_new(src1, y1, u1, v1, pixels);
+}
+report("v210_unpack");
+}
-- 
2.20.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 1/2] avcodec/v210dec: move DSP function setting into dedicated function

2019-03-06 Thread James Darnley
On 2019-03-06 10:11, Paul B Mahol wrote:
> On 3/6/19, Carl Eugen Hoyos  wrote:
>> 2019-03-04 23:58 GMT+01:00, James Darnley :
>>> Prepare for checkasm test.
>>> ---
>>>  libavcodec/v210dec.c | 13 +
>>>  libavcodec/v210dec.h |  1 +
>>>  2 files changed, 10 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
>>> index ddc5dbe8be..28cf00d320 100644
>>> --- a/libavcodec/v210dec.c
>>> +++ b/libavcodec/v210dec.c
>>> @@ -50,6 +50,14 @@ static void v210_planar_unpack_c(const uint32_t *src,
>>> uint16_t *y, uint16_t *u,
>>>  }
>>>  }
>>>
>>> +av_cold void ff_v210dec_init(V210DecContext *s)
>>> +{
>>> +s->unpack_frame = v210_planar_unpack_c;
>>
>>> +s->aligned_input = 0;
>>
>> Isn't this an unrelated change or do I misunderstand?
> 
> You misunderstand.

Maybe.

I need to initialize that member before it is used in the x86 function.
I expect valgrind or similar would catch the use.

It doesn't matter for normal use because it will be set correctly based
on the input data alignment for each frame.  Now that you mention it I
realize I forgot to change that to call the new function so I will send
a v2 later.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 2/2] checkasm: add test for v210dec

2019-03-04 Thread James Darnley
---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/v210dec.c  | 76 +++
 4 files changed, 81 insertions(+)
 create mode 100644 tests/checkasm/v210dec.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 47b7b06d28..70abc1a407 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -25,6 +25,7 @@ AVCODECOBJS-$(CONFIG_JPEG2000_DECODER)  += jpeg2000dsp.o
 AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)   += pixblockdsp.o
 AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_idct.o 
hevc_sao.o
 AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
+AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 9eec41e3c4..bf51e00eab 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -136,6 +136,9 @@ static const struct {
 #if CONFIG_UTVIDEO_DECODER
 { "utvideodsp", checkasm_check_utvideodsp },
 #endif
+#if CONFIG_V210_DECODER
+{ "v210dec", checkasm_check_v210dec },
+#endif
 #if CONFIG_V210_ENCODER
 { "v210enc", checkasm_check_v210enc },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 9e8e879fd3..9b8d2f5419 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -69,6 +69,7 @@ void checkasm_check_sbrdsp(void);
 void checkasm_check_synth_filter(void);
 void checkasm_check_sw_rgb(void);
 void checkasm_check_utvideodsp(void);
+void checkasm_check_v210dec(void);
 void checkasm_check_v210enc(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c
new file mode 100644
index 00..7320ed5e37
--- /dev/null
+++ b/tests/checkasm/v210dec.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2019 James Darnley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/v210dec.h"
+
+static uint32_t get_v210(void)
+{
+uint32_t t0 = rnd() & 0x3ff,
+ t1 = rnd() & 0x3ff,
+ t2 = rnd() & 0x3ff;
+uint32_t value =  t0
+   | (t1 << 10)
+   | (t2 << 20);
+return value;
+}
+
+#define NUM_SAMPLES 2048
+
+static void randomize_buffers(uint32_t *src0, uint32_t *src1, int len)
+{
+for (int i = 0; i < len; i++) {
+uint32_t value = get_v210();
+src0[i] = value;
+src1[i] = value;
+}
+}
+
+void checkasm_check_v210dec(void)
+{
+V210DecContext h;
+
+ff_v210dec_init(&h);
+
+if (check_func(h.unpack_frame, "v210_unpack")) {
+uint32_t src0[NUM_SAMPLES/3];
+uint32_t src1[NUM_SAMPLES/3];
+uint16_t y0[NUM_SAMPLES/2];
+uint16_t y1[NUM_SAMPLES/2];
+uint16_t u0[NUM_SAMPLES/4];
+uint16_t u1[NUM_SAMPLES/4];
+uint16_t v0[NUM_SAMPLES/4];
+uint16_t v1[NUM_SAMPLES/4];
+declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
+const int pixels = NUM_SAMPLES / 2 / 6 * 6;
+
+randomize_buffers(src0, src1, NUM_SAMPLES/3);
+call_ref(src0, y0, u0, v0, pixels);
+call_new(src1, y1, u1, v1, pixels);
+if (memcmp(src0, src1, NUM_SAMPLES/3 * sizeof src0[0])
+|| memcmp(y0, y1, pixels * sizeof y0[0])
+|| memcmp(u0, u1, pixels/2 * sizeof u0[0])
+|| memcmp(v0, v1, pixels/2 * sizeof v0[0]))
+fail();
+bench_new(src1, y1, u1, v1, pixels);
+}
+report("v210_unpack");
+}
-- 
2.20.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 1/2] avcodec/v210dec: move DSP function setting into dedicated function

2019-03-04 Thread James Darnley
Prepare for checkasm test.
---
 libavcodec/v210dec.c | 13 +
 libavcodec/v210dec.h |  1 +
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index ddc5dbe8be..28cf00d320 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -50,6 +50,14 @@ static void v210_planar_unpack_c(const uint32_t *src, 
uint16_t *y, uint16_t *u,
 }
 }
 
+av_cold void ff_v210dec_init(V210DecContext *s)
+{
+s->unpack_frame = v210_planar_unpack_c;
+s->aligned_input = 0;
+if (ARCH_X86)
+ff_v210_x86_init(s);
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
 V210DecContext *s = avctx->priv_data;
@@ -57,10 +65,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
 avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
 avctx->bits_per_raw_sample = 10;
 
-s->unpack_frame= v210_planar_unpack_c;
-
-if (HAVE_MMX)
-ff_v210_x86_init(s);
+ff_v210dec_init(s);
 
 return 0;
 }
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
index 533afc435c..cfdb29da09 100644
--- a/libavcodec/v210dec.h
+++ b/libavcodec/v210dec.h
@@ -31,6 +31,7 @@ typedef struct {
 void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, 
uint16_t *v, int width);
 } V210DecContext;
 
+void ff_v210dec_init(V210DecContext *s);
 void ff_v210_x86_init(V210DecContext *s);
 
 #endif /* AVCODEC_V210DEC_H */
-- 
2.20.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] Added ff_v210_planar_unpack_aligned_avx2

2019-03-04 Thread James Darnley
On 2019-03-01 18:41, Michael Stoner wrote:
> The AVX2 code leverages VPERMD to process 12 pixels/iteration.  This is my 
> first patch submission so any comments are greatly appreciated.
> 
> -Mike
> 
> Tested on Skylake (Win32 & Win64)
> 1920x1080 input frame
> =
> C code - 440 fps
> SSSE3  - 920 fps
> AVX- 930 fps
> AVX2   - 1040 fps
> 
> Regression tested at 1920x1080, 1280x720, and 352x288

>  .loop:
>  %ifidn %1, unaligned
> -movu   m0, [r0]
> +movu   m0, [r0]; yB v5 yA  u5 y9 v4  y8 u4 y7  v3 y6 
> u3  y5 v2 y4  u2 y3 v1  y2 u1 y1  v0 y0 u0
>  %else
>  mova   m0, [r0]
>  %endif

At first I didn't understand why you do so much seemingly unnecessary
work.  You don't change how the data is loaded into the register.  After more
in-depth reading I see now that you shuffle data around just so you can
store the data with a single move for each plane.  The chroma is below.

> +%if cpuflag(avx2)
> +vpermd m1, m6, m1  ; 00 v5 v4 v3 00 v2 v1 v0 00 u5 u4 u3 
> 00 u2 u1 u0
> +pshufb m1, m7  ; 00 00 v5 v4 v3 v2 v1 v0 00 00 u5 u4 
> u3 u2 u1 u0
> +movu   [r2+r4], xm1
> +vextracti128 [r3+r4], m1, 1
> +%else
>  movq   [r2+r4], m1
>  movhps [r3+r4], m1
> +%endif

Sounds commendable but I doubt the use of this many more shuffles gets
you much over a naive AVX2 version (where you treat the high half of ymm
like an unroll).

> +; for AVX2 version only
> +v210_luma_permute: dd 0,1,2,4,5,6,7,7
> +v210_chroma_permute: dd 0,1,4,5,2,3,6,7

Are you sure these can't be replaced with vpermq and its immediate
operand?  It really looks like the second could be.  It'll save you a
register.

> -mova   m3, [v210_mult]
> -mova   m4, [v210_mask]
> -mova   m5, [v210_luma_shuf]
> -mova   m6, [v210_chroma_shuf]
> +mova   m3, [v210_luma_shuf]
> +mova   m4, [v210_chroma_shuf1]
> +
> +%if cpuflag(avx2)
> +mova   m5, [v210_luma_permute]  ; VPERMD constant must be in a 
> register
> +mova   m6, [v210_chroma_permute]; VPERMD constant must be in a 
> register
> +mova   m7, [v210_chroma_shuf2]
> +%endif
> +
> +%if ARCH_X86_64
> +mova   m8, [v210_mult]
> +mova   m9, [v210_mask]
> +%endif
> +

It would let you clean this up a bit.

My suggestion is to make the diff minimal by keeping the existing uses
and if you still need more than 8 registers for avx2 then make it
available for x86-64 only.

Compare yours with the one I committed here
https://github.com/Upipe/upipe/blob/master/lib/upipe-v210/v210dec.asm#L45
which is just FFmpeg's cleaned up a little plus avx2.  I'm surprised
it's not already in FFmpeg.

You should do whatever is faster.




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] Added ff_v210_planar_unpack_aligned_avx2

2019-03-04 Thread James Darnley
On 2019-03-03 15:44, Martin Vignali wrote:
> Hello,
> 
> ...
> 
> Not directly related to this patch, but it can be interesting for testing
> purpose to write a checkasm test for the v210 func decoding.
> So it's more easy to check the perf for "each" cpu flags, and be sure, the
> various width cases works as expected.

I can probably do that.  I have one for v210 unpacking in a knock-off
checkasm for another project.

I will look over/review the submitted patch first.




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] Lossy GIF encoding

2019-02-15 Thread James Darnley
On 2019-02-15 10:01, Kornel wrote:
> libavcodec/gif.c in ff_gif_encoder.pix_fmts seems to passively declare types 
> of pixel formats it accepts.

If you want to experiment you can change that so it accepts rgb (also or
only).  Then you can implement and test what you want, then you can ask
about submitting it.

You can make your fancy encoding only available with rgb, or with some
option and return an error when given pal8.




signature.asc
Description: OpenPGP digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avformat/matroskaenc: add reserve free space option

2018-09-06 Thread James Darnley
On 2018-09-06 19:39, Sigríður Regína Sigurþórsdóttir wrote:
> +if (s->metadata_header_padding) {
> +if (s->metadata_header_padding == 1)
> +s->metadata_header_padding++;
> +put_ebml_void(pb, s->metadata_header_padding);
> +}

Unfortunately I was forced to make the default -1 so you want to check
that the value is greater than 0 rather than just true.

Furthermore I think you will still want to add to Changelog making a
note that the matroska muxer will now listen to metadata_header_padding.
 That may also want a micro version bump so that library users can check.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avformat/matroskaenc: add reserve free space option

2018-09-05 Thread James Darnley
On 2018-09-05 22:52, Sigríður Regína Sigurþórsdóttir wrote:
> +{"reserve_free_space", "Reserve a given amount of space at the
> beginning og the file for unspecified purpose."

I added the "metadata_header_padding" global option many years ago.  Can
you not reuse it for this purpose?  Is it not likely to be "metadata"
that another software might fill this with?

Also there is a typo in the bit I quoted.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] frame: Simplify the video allocation

2018-09-03 Thread James Darnley
On 2018-09-03 15:29, James Almer wrote:
> pass 32 - 1 to both av_image_fill_pointers() calls directly?

Please do not add a magic number where nobody will find it.  Use one of
the 3 already existing methods for knowing the alignment necessary for
assembly.

If this is unrelated, my apologies.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 1/3] diracdec: add 10-bit Haar SIMD functions

2018-07-27 Thread James Darnley
On 2018-07-27 15:05, Henrik Gramner wrote:
> On Fri, Jul 27, 2018 at 1:47 PM, James Darnley  wrote:
>> On 2018-07-26 17:29, Rostislav Pehlivanov wrote:
>>>> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
>>>> x, b2
>>>> +DECLARE_REG_TMP 2,5
>>>> +%if ARCH_X86_64
>>>> +%define tail r6d
>>>> +%else
>>>> +%define tail dword wm
>>>> +%endif
>>>> +
>>>>
>>>
>>> You can remove this whole bit, the init function only gets called if
>>> ARCH_X86_64 is true.
>>
>> Where did you get that from?  I don't require 64-bit for this.
> 
> Can't you just use 7 GPR:s on x86-32 as well?

I'm sure I've done that in the past and at least 1 platform has always
complained due to PIE or stack alignment or whatever, I think.  I went
looking for an old email but couldn't find it.

If you want me to try it I can.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 1/3] diracdec: add 10-bit Haar SIMD functions

2018-07-27 Thread James Darnley
On 2018-07-26 17:29, Rostislav Pehlivanov wrote:
> On 26 July 2018 at 12:28, James Darnley  wrote:
> +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w
>> +DECLARE_REG_TMP 4,5
>> +
>> +mova  m2, [pd_1]
>> +mov  r3d, wd
>> +and   wd, ~(mmsize/4 - 1)
>> +shl   wd, 2
>> +add  b0q, wq
>> +add  b1q, wq
>> +neg   wq
>> +
>> +ALIGN 16
>> +.loop_simd:
>> +mova m0, [b0q + wq]
>> +mova m1, [b1q + wq]
>> +paddd m3, m1, m2
>> +psrad m3, 1
>> +psubd m0, m3
>> +paddd m1, m0
>> +mova [b0q + wq], m0
>> +mova [b1q + wq], m1
>> +add wq, mmsize
>> +jl .loop_simd
>> +
>> +and  r3d, mmsize/4 - 1
>> +jz .end
>> +.loop_scalar:
>> +mov t0d, [b0q]
>> +mov t1d, [b1q]
>> +mov r2d, t1d
>> +add r2d, 1
>> +sar r2d, 1
>> +sub t0d, r2d
>> +add t1d, t0d
>> +mov [b0q], t0d
>> +mov [b1q], t1d
>> +
>> +add b0q, 4
>> +add b1q, 4
>> +sub r3d, 1
>> +jg .loop_scalar
>> +
>> +.end:
>> +RET
>> +
>> +%endmacro
>> +
>> +%macro HAAR_HORIZONTAL 0
>>
> +
>>
> 
> Could you remove this newline from every patch? All asm I've written and
> seen keep them without a newline. It made me think there's something in the
> asm which checked the value of the macro, not that the entire function is
> macro'd.

What?  I don't understand what you mean.  Do you think I have too many
blank lines between things?

> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
>> x, b2
>> +DECLARE_REG_TMP 2,5
>> +%if ARCH_X86_64
>> +%define tail r6d
>> +%else
>> +%define tail dword wm
>> +%endif
>> +
>> +mova m2, [pd_1]
>> +xor xd, xd
>> +shr wd, 1
>> +mov tail, wd
>> +lea b2q, [bq + 4*wq]
>> +
>> +ALIGN 16
>> +.loop_lo:
>> +mova m0, [bq  + 4*xq]
>> +movu m1, [b2q + 4*xq]
>> +paddd m1, m2
>> +psrad m1, 1
>> +psubd m0, m1
>> +mova [temp_q + 4*xq], m0
>> +add xd, mmsize/4
>> +cmp xd, wd
>> +jl .loop_lo
>> +
>> +xor xd, xd
>> +and wd, ~(mmsize/4 - 1)
>> +
>> +ALIGN 16
>> +.loop_hi:
>> +mova m0, [temp_q + 4*xq]
>> +movu m1, [b2q+ 4*xq]
>> +paddd m1, m0
>> +paddd m0, m2
>> +paddd m1, m2
>> +psrad m0, 1
>> +psrad m1, 1
>> +SBUTTERFLY dq, 0,1,3
>> +%if cpuflag(avx2)
>> +SBUTTERFLY dqqq, 0,1,3
>> +%endif
>> +mova [bq + 8*xq], m0
>> +mova [bq + 8*xq + mmsize], m1
>> +add xd, mmsize/4
>> +cmp xd, wd
>> +jl .loop_hi
>> +
>> +and tail, mmsize/4 - 1
>> +jz .end
>> +.loop_scalar:
>> +mov t0d, [temp_q + 4*xq]
>> +mov t1d, [b2q+ 4*xq]
>> +add t1d, t0d
>> +add t0d, 1
>> +add t1d, 1
>> +sar t0d, 1
>> +sar t1d, 1
>> +mov [bq + 8*xq], t0d
>> +mov [bq + 8*xq + 4], t1d
>> +add  xq, 1
>> +sub tail, 1
>> +jg .loop_scalar
>> +
>> +.end:
>> +REP_RET
>> +
>> +%endmacro
>> +
>> +INIT_XMM sse2
>> +HAAR_HORIZONTAL
>> +HAAR_VERTICAL
>> +
>> +INIT_XMM avx
>> +HAAR_HORIZONTAL
>> +HAAR_VERTICAL
>>
> 
> You're not using any avx functions in that version, not unless a macro'd
> instruction inserts one for you. I think you should remove the avx version
> then.
> Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per
> version you can just make a single macro to do both versions at the same
> time.

Now that I think about it there will be only one 3-operand instruction
in the SBUTTERFLY and the vertical function also only has 1.  I will
remove it.

I can merge the two macros but I will look back at what I've done
previously.  I think it is usually 1 macro per function.

> +
>> +INIT_YMM avx2
>> +HAAR_HORIZONTAL
>> +HAAR_VERTICAL
>> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c
>> b/libavcodec/x86/dirac_dwt_init_10bit.c
>> new file mode 100644
>> index 00..289862d728
>> --- /d

[FFmpeg-devel] [PATCH 1/3] diracdec: add 10-bit Haar SIMD functions

2018-07-26 Thread James Darnley
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform.
C:119fps
SSE2: 204fps
AVX:  206fps
AVX2: 221fps

timer measurements, haar horizontal compose:
sse2: 3.68x faster (45143 vs. 12279 decicycles) compared with C
avx:  3.68x faster (45143 vs. 12275 decicycles) compared with C
avx2: 5.16x faster (45143 vs.  8742 decicycles) compared with C
haar vertical compose:
sse2: 1.64x faster (31792 vs. 19377 decicycles) compared with C
avx:  1.58x faster (31792 vs. 20090 decicycles) compared with C
avx2: 1.66x faster (31792 vs. 19157 decicycles) compared with C
---
 libavcodec/dirac_dwt.c|   7 +-
 libavcodec/dirac_dwt.h|   1 +
 libavcodec/x86/Makefile   |   6 +-
 libavcodec/x86/dirac_dwt_10bit.asm| 160 ++
 libavcodec/x86/dirac_dwt_init_10bit.c |  76 
 5 files changed, 247 insertions(+), 3 deletions(-)
 create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm
 create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c

diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c
index cc08f8865a..86bee5bb9b 100644
--- a/libavcodec/dirac_dwt.c
+++ b/libavcodec/dirac_dwt.c
@@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum 
dwt_type type,
 return AVERROR_INVALIDDATA;
 }
 
-if (ARCH_X86 && bit_depth == 8)
+#if ARCH_X86
+if (bit_depth == 8)
 ff_spatial_idwt_init_x86(d, type);
+else if (bit_depth == 10)
+ff_spatial_idwt_init_10bit_x86(d, type);
+#endif
+
 return 0;
 }
 
diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h
index 994dc21d70..1ad7b9a821 100644
--- a/libavcodec/dirac_dwt.h
+++ b/libavcodec/dirac_dwt.h
@@ -88,6 +88,7 @@ enum dwt_type {
 int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type,
  int decomposition_count, int bit_depth);
 void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type);
+void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type);
 
 void ff_spatial_idwt_slice2(DWTContext *d, int y);
 
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 2350c8bbee..590d83c167 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP)+= x86/blockdsp_init.o
 OBJS-$(CONFIG_BSWAPDSP)+= x86/bswapdsp_init.o
 OBJS-$(CONFIG_DCT) += x86/dct_init.o
 OBJS-$(CONFIG_DIRAC_DECODER)   += x86/diracdsp_init.o   \
-  x86/dirac_dwt_init.o
+  x86/dirac_dwt_init.o \
+  x86/dirac_dwt_init_10bit.o
 OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
 OBJS-$(CONFIG_FFT) += x86/fft_init.o
 OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
@@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
 X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
 X86ASM-OBJS-$(CONFIG_DCA_DECODER)  += x86/dcadsp.o x86/synth_filter.o
 X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)+= x86/diracdsp.o\
-  x86/dirac_dwt.o
+  x86/dirac_dwt.o \
+  x86/dirac_dwt_10bit.o
 X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER)+= x86/dnxhdenc.o
 X86ASM-OBJS-$(CONFIG_EXR_DECODER)  += x86/exrdsp.o
 X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
diff --git a/libavcodec/x86/dirac_dwt_10bit.asm 
b/libavcodec/x86/dirac_dwt_10bit.asm
new file mode 100644
index 00..baea91329e
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -0,0 +1,160 @@
+;**
+;* x86 optimized discrete 10-bit wavelet transform
+;* Copyright (c) 2018 James Darnley
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;**
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pd_1
+
+SECTION .text
+
+%macro HAAR_VERTICAL 0
+
+cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w
+ 

[FFmpeg-devel] [PATCH 0/3 v2] x86 SIMD for dirac 10-bit wavelet transforms

2018-07-26 Thread James Darnley
I will ask the same question as last time.  Is the AVX worth it in Haar?  Also I
am surprised that the AVX2 doesn't have a bigger difference on some of the
vertical transforms.

James Darnley (3):
  diracdec: add 10-bit Haar SIMD functions
  diracdec: add 10-bit Legall 5,3 (5_3) SIMD functions
  diracdec: add 10-bit Deslauriers-Dubuc 9,7 (9_7) vertical high-pass
function

 libavcodec/dirac_dwt.c|   7 +-
 libavcodec/dirac_dwt.h|   1 +
 libavcodec/x86/Makefile   |   6 +-
 libavcodec/x86/dirac_dwt_10bit.asm| 302 ++
 libavcodec/x86/dirac_dwt_init_10bit.c | 118 ++
 5 files changed, 431 insertions(+), 3 deletions(-)
 create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm
 create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c

-- 
2.18.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 3/3] diracdec: add 10-bit Deslauriers-Dubuc 9, 7 (9_7) vertical high-pass function

2018-07-26 Thread James Darnley
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform.
C: 84fps
SSE2: 111fps
AVX2: 115fps

dd97 vertical hi
sse2: 2.77x faster (31773 vs. 11457 decicycles) compared with C
avx2: 3.83x faster (31773 vs.  8297 decicycles) compared with C
---
 libavcodec/x86/dirac_dwt_10bit.asm| 39 +++
 libavcodec/x86/dirac_dwt_init_10bit.c | 29 
 2 files changed, 68 insertions(+)

diff --git a/libavcodec/x86/dirac_dwt_10bit.asm 
b/libavcodec/x86/dirac_dwt_10bit.asm
index 0295e6f554..2ed77fe3b0 100644
--- a/libavcodec/x86/dirac_dwt_10bit.asm
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -25,6 +25,7 @@ SECTION_RODATA 32
 
 cextern pd_1
 pd_2: times 8 dd 2
+pd_8: times 8 dd 8
 
 SECTION .text
 
@@ -246,7 +247,44 @@ RET
 
 %endmacro
 
+%macro DD97_VERTICAL_HI 0
+
+cglobal dd97_vertical_hi, 6, 6, 8, b0, b1, b2, b3, b4, w
+mova m7, [pd_8]
+shl wd, 2
+add b0q, wq
+add b1q, wq
+add b2q, wq
+add b3q, wq
+add b4q, wq
+neg wq
+
+ALIGN 16
+.loop:
+mova m0, [b0q + wq]
+mova m1, [b1q + wq]
+mova m2, [b2q + wq]
+mova m3, [b3q + wq]
+mova m4, [b4q + wq]
+pslld m5, m1, 3
+pslld m6, m3, 3
+paddd m5, m1
+paddd m6, m3
+psubd m5, m0
+psubd m6, m4
+paddd m5, m7
+paddd m5, m6
+psrad m5, 4
+paddd m2, m5
+mova [b2q + wq], m2
+add wq, mmsize
+jl .loop
+RET
+
+%endmacro
+
 INIT_XMM sse2
+DD97_VERTICAL_HI
 HAAR_HORIZONTAL
 HAAR_VERTICAL
 LEGALL53_VERTICAL_HI
@@ -257,6 +295,7 @@ HAAR_HORIZONTAL
 HAAR_VERTICAL
 
 INIT_YMM avx2
+DD97_VERTICAL_HI
 HAAR_HORIZONTAL
 HAAR_VERTICAL
 LEGALL53_VERTICAL_HI
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c 
b/libavcodec/x86/dirac_dwt_init_10bit.c
index d1234efac5..a9ac603bc5 100644
--- a/libavcodec/x86/dirac_dwt_init_10bit.c
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -23,6 +23,9 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/dirac_dwt.h"
 
+void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t 
*b3, int32_t *b4, int width);
+void ff_dd97_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t 
*b3, int32_t *b4, int width);
+
 void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
 void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
 void ff_legall53_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
@@ -36,6 +39,24 @@ void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, 
int32_t *b1, int width_ali
 void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int 
width_align);
 void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int 
width_align);
 
+static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2,
+  int32_t *b3, int32_t *b4, int width)
+{
+int i = width & ~3;
+ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i);
+for(; ivertical_compose_h0 = (void*)dd97_vertical_hi_sse2;
+d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_sse2;
+break;
 case DWT_DIRAC_LEGALL5_3:
 d->vertical_compose_h0 = (void*)ff_legall53_vertical_hi_sse2;
 d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_sse2;
@@ -71,6 +96,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, 
enum dwt_type type)
 
 if (EXTERNAL_AVX2(cpu_flags)) {
 switch (type) {
+case DWT_DIRAC_DD9_7:
+d->vertical_compose_h0 = (void*)dd97_vertical_hi_avx2;
+d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_avx2;
+break;
 case DWT_DIRAC_LEGALL5_3:
 d->vertical_compose_h0 = (void*)ff_legall53_vertical_hi_avx2;
 d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_avx2;
-- 
2.18.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 2/3] diracdec: add 10-bit Legall 5, 3 (5_3) SIMD functions

2018-07-26 Thread James Darnley
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform.
C: 94fps
SSE2: 118fps
AVX2: 121fps

legall vertical hi
sse2: 3.86x faster (20201 vs. 5231 decicycles) compared with C
avx2: 6.70x faster (20201 vs. 3014 decicycles) compared with C
legall vertical lo
sse2: 1.50x faster (28345 vs. 18908 decicycles) compared with C
avx2: 1.63x faster (28345 vs. 17361 decicycles) compared with C
---
 libavcodec/x86/dirac_dwt_10bit.asm| 105 +-
 libavcodec/x86/dirac_dwt_init_10bit.c |  13 
 2 files changed, 117 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/dirac_dwt_10bit.asm 
b/libavcodec/x86/dirac_dwt_10bit.asm
index baea91329e..0295e6f554 100644
--- a/libavcodec/x86/dirac_dwt_10bit.asm
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -21,9 +21,10 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
 cextern pd_1
+pd_2: times 8 dd 2
 
 SECTION .text
 
@@ -147,9 +148,109 @@ REP_RET
 
 %endmacro
 
+%macro LEGALL53_VERTICAL_LO 0
+
+cglobal legall53_vertical_lo, 4, 6, 4, b0, b1, b2, w
+DECLARE_REG_TMP 3,4,5
+
+mova  m3, [pd_2]
+mov  t2d, wd
+and   wd, ~(mmsize/4 - 1)
+shl   wd, 2
+add  b0q, wq
+add  b1q, wq
+add  b2q, wq
+neg   wq
+
+ALIGN 16
+.loop:
+mova m0, [b0q + wq]
+mova m1, [b1q + wq]
+mova m2, [b2q + wq]
+paddd m0, m2
+paddd m0, m3
+psrad m0, 2
+psubd m1, m0
+mova [b1q + wq], m1
+add wq, mmsize
+jl .loop
+
+and  t2d, mmsize/4 - 1
+jz .end
+.loop_scalar:
+mov t0d, [b0q]
+mov t1d, [b1q]
+add t0d, [b2q]
+add t0d, 2
+sar t0d, 2
+sub t1d, t0d
+mov [b1q], t1d
+
+add b0q, 4
+add b1q, 4
+add b2q, 4
+sub t2d, 1
+jg .loop_scalar
+
+.end:
+RET
+
+%endmacro
+
+%macro LEGALL53_VERTICAL_HI 0
+
+cglobal legall53_vertical_hi, 4, 6, 4, b0, b1, b2, w
+DECLARE_REG_TMP 3,4,5
+
+mova  m3, [pd_1]
+mov  t2d, wd
+and   wd, ~(mmsize/4 - 1)
+shl   wd, 2
+add  b0q, wq
+add  b1q, wq
+add  b2q, wq
+neg   wq
+
+ALIGN 16
+.loop:
+mova m0, [b0q + wq]
+mova m1, [b1q + wq]
+mova m2, [b2q + wq]
+paddd m0, m2
+paddd m0, m3
+psrad m0, 1
+paddd m1, m0
+mova [b1q + wq], m1
+add wq, mmsize
+jl .loop
+
+and  t2d, mmsize/4 - 1
+jz .end
+.loop_scalar:
+mov t0d, [b0q]
+mov t1d, [b1q]
+add t0d, [b2q]
+add t0d, 1
+sar t0d, 1
+add t1d, t0d
+mov [b1q], t1d
+
+add b0q, 4
+add b1q, 4
+add b2q, 4
+sub t2d, 1
+jg .loop_scalar
+
+.end:
+RET
+
+%endmacro
+
 INIT_XMM sse2
 HAAR_HORIZONTAL
 HAAR_VERTICAL
+LEGALL53_VERTICAL_HI
+LEGALL53_VERTICAL_LO
 
 INIT_XMM avx
 HAAR_HORIZONTAL
@@ -158,3 +259,5 @@ HAAR_VERTICAL
 INIT_YMM avx2
 HAAR_HORIZONTAL
 HAAR_VERTICAL
+LEGALL53_VERTICAL_HI
+LEGALL53_VERTICAL_LO
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c 
b/libavcodec/x86/dirac_dwt_init_10bit.c
index 289862d728..d1234efac5 100644
--- a/libavcodec/x86/dirac_dwt_init_10bit.c
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -23,6 +23,11 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/dirac_dwt.h"
 
+void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
+void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
+void ff_legall53_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
+void ff_legall53_vertical_lo_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
+
 void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int 
width_align);
 void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int 
width_align);
 void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int 
width_align);
@@ -38,6 +43,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, 
enum dwt_type type)
 
 if (EXTERNAL_SSE2(cpu_flags)) {
 switch (type) {
+case DWT_DIRAC_LEGALL5_3:
+d->vertical_compose_h0 = (void*)ff_legall53_vertical_hi_sse2;
+d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_sse2;
+break;
 case DWT_DIRAC_HAAR0:
 d->vertical_compose = 
(void*)ff_vertical_compose_haar_10bit_sse2;
 break;
@@ -62,6 +71,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, 
enum dwt_type type)
 
 if (EXTERNAL_AVX2(cpu_flags)) {
 switch (type) {
+case DWT_DIRAC_LEGALL5_3:
+d->vertical_compose_h0 = (void*)ff_legall53_vertical_hi_avx2;
+d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_avx2;
+break;
 case DWT_DIRAC_HAAR0:
 d->vertical_compose = 
(void*)ff_vertical_compose_haar_10b

Re: [FFmpeg-devel] [PATCH 0/6] x86 SIMD for dirac 10-bit wavelet transforms

2018-07-25 Thread James Darnley
On 2018-07-19 17:23, Rostislav Pehlivanov wrote:
> Could you provide standard overall transform results using START/STOP_TIMER
> rather than overall decoding speed?

Ask and ye shall receive.

> haar horizontal compose
> sse2: 3.67x faster (45248±108.1 vs. 12328±21.1 decicycles) compared with 
> none
> avx:  3.74x faster (45248±108.1 vs. 12091±11.0 decicycles) compared with 
> none
> avx2: 5.14x faster (45248±108.1 vs. 8805±15.6 decicycles) compared with 
> none
> haar vertical compose
> sse2: 1.57x faster (31771±459.9 vs. 20179±786.2 decicycles) compared with 
> none
> avx:  1.62x faster (31771±459.9 vs. 19572±253.1 decicycles) compared with 
> none
> avx2: 1.73x faster (31771±459.9 vs. 18337±827.9 decicycles) compared with 
> none
> 
> legall vertical hi
> sse2: 3.68x faster (20506±46.2 vs. 5574±29.7 decicycles) compared with 
> none
> avx2: 5.96x faster (20506±46.2 vs. 3442±32.7 decicycles) compared with 
> none
> legall vertical lo
> sse2: 1.52x faster (28360±178.6 vs. 18603±114.8 decicycles) compared with 
> none
> avx2: 1.64x faster (28360±178.6 vs. 17255±372.3 decicycles) compared with 
> none
> 
> dd97 vertical hi
> sse2: 2.76x faster (31975±103.0 vs. 11570±247.5 decicycles) compared with 
> none
> avx:  2.82x faster (31975±103.0 vs. 11346±179.0 decicycles) compared with 
> none
> avx2: 3.83x faster (31975±103.0 vs. 8357±219.6 decicycles) compared with 
> none
> dd97 vertical lo
> sse2: 1.52x faster (29476±335.8 vs. 19429±518.7 decicycles) compared with 
> none
> avx2: 1.62x faster (29476±335.8 vs. 18246±559.8 decicycles) compared with 
> none

Here "none" refers to the C functions, from "-cpuflags none" option.

I also have the results of removing the C wrappers from these functions,
except dd97.  They aren't that much better.

> haar horizontal compose
> sse2: 3.68x faster (45143±36.4 vs. 12279±16.4 decicycles) compared with 
> none
> avx:  3.68x faster (45143±36.4 vs. 12275±9.2 decicycles) compared with 
> none
> avx2: 5.16x faster (45143±36.4 vs. 8742±12.3 decicycles) compared with 
> none
> haar vertical compose
> sse2: 1.64x faster (31792±367.5 vs. 19377±271.7 decicycles) compared with 
> none
> avx:  1.58x faster (31792±367.5 vs. 20090±593.9 decicycles) compared with 
> none
> avx2: 1.66x faster (31792±367.5 vs. 19157±1352.4 decicycles) compared 
> with none
> 
> legall vertical hi
> sse2: 3.86x faster (20201±26.5 vs. 5231±39.0 decicycles) compared with 
> none
> avx2: 6.70x faster (20201±26.5 vs. 3014±39.1 decicycles) compared with 
> none
> legall vertical lo
> sse2: 1.50x faster (28345±206.6 vs. 18908±440.3 decicycles) compared with 
> none
> avx2: 1.63x faster (28345±206.6 vs. 17361±637.9 decicycles) compared with 
> none

I will squash patches, update commit messages, and send a new patch thread.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 3/6] diracdec: add 10-bit Deslauriers-Dubuc 9, 7 (9_7) vertical high-pass function

2018-07-19 Thread James Darnley
On 2018-07-19 17:26, Rostislav Pehlivanov wrote:
> On 19 July 2018 at 15:52, James Darnley  wrote:
> 
>> int32_t *b1, int32_t *b2, int
>>  b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
>>  }
>>
>> +static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2,
>> +  int32_t *b3, int32_t *b4, int width)
>> +{
>> +int i = width & ~3;
>> +ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i);
>> +for(; i> +b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]);
>> +
>> +}
>>
> 
> 
> This, along with the rest of the patchset: what's up with the hybrid
> implementations? Couldn't you put the second part in the asm code as well?
> Now there are 2 function calls instead of 1.

The 8-bit code does this and I just followed its lead.  I believe this is
done because we cannot write junk data beyond what we think is the end
of the line because this might be one of the higher depths and the
coeffs for the next level sit beyond the end of the line.

But now it has just occurred to me that maybe you meant "why didn't you
do the scalar operations in SIMD?", is that what you meant?  Answer is
because it didn't occur to me at the time.  Aside from that I always
write do-while loops in assembly because I can usually guarantee 1 run
of the block.

I can certainly look at making that change.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 0/6] x86 SIMD for dirac 10-bit wavelet transforms

2018-07-19 Thread James Darnley
On 2018-07-19 17:23, Rostislav Pehlivanov wrote:
> 
> Could you provide standard overall transform results using START/STOP_TIMER
> rather than overall decoding speed?
> Coefficients sizes and therefore golomb unpacking speed changes with
> respect to the transform so potentially there could be somewhat of a
> bottleneck on decoding before the inverse transform.

Ah, you are right about that.  Should I limit the depth to 1 so that the
functions operate on the same width all the time?  Anyway, I will get
the timers in there.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 5/6] diracdec: avx2 dd97

2018-07-19 Thread James Darnley
---
 libavcodec/x86/dirac_dwt_10bit.asm|  3 ++-
 libavcodec/x86/dirac_dwt_init_10bit.c | 13 +
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/dirac_dwt_10bit.asm 
b/libavcodec/x86/dirac_dwt_10bit.asm
index ae110d2945..2e039e11ea 100644
--- a/libavcodec/x86/dirac_dwt_10bit.asm
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -25,7 +25,7 @@ SECTION_RODATA
 
 cextern pd_1
 pd_2: times 8 dd 2
-pd_8: times 4 dd 8
+pd_8: times 8 dd 8
 
 SECTION .text
 
@@ -202,6 +202,7 @@ HAAR_HORIZONTAL
 HAAR_VERTICAL
 
 INIT_YMM avx2
+DD97_VERTICAL_HI
 HAAR_HORIZONTAL
 HAAR_VERTICAL
 LEGALL53_VERTICAL_HI
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c 
b/libavcodec/x86/dirac_dwt_init_10bit.c
index 51d6eeae93..f103a56176 100644
--- a/libavcodec/x86/dirac_dwt_init_10bit.c
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -24,6 +24,7 @@
 #include "libavcodec/dirac_dwt.h"
 
 void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t 
*b3, int32_t *b4, int width);
+void ff_dd97_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t 
*b3, int32_t *b4, int width);
 
 void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
 void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
@@ -137,7 +138,15 @@ static void dd97_vertical_hi_sse2(int32_t *b0, int32_t 
*b1, int32_t *b2,
 ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i);
 for(; ivertical_compose_h0 = (void*)dd97_vertical_hi_avx2;
+d->vertical_compose_l0 = (void*)legall53_vertical_lo_avx2;
+break;
 case DWT_DIRAC_LEGALL5_3:
 d->vertical_compose_h0 = (void*)legall53_vertical_hi_avx2;
 d->vertical_compose_l0 = (void*)legall53_vertical_lo_avx2;
-- 
2.17.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 4/6] diracdec: avx2 legall

2018-07-19 Thread James Darnley
---
 libavcodec/x86/dirac_dwt_10bit.asm|  4 +++-
 libavcodec/x86/dirac_dwt_init_10bit.c | 22 ++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/dirac_dwt_10bit.asm 
b/libavcodec/x86/dirac_dwt_10bit.asm
index 681de5e1df..ae110d2945 100644
--- a/libavcodec/x86/dirac_dwt_10bit.asm
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -24,7 +24,7 @@
 SECTION_RODATA
 
 cextern pd_1
-pd_2: times 4 dd 2
+pd_2: times 8 dd 2
 pd_8: times 4 dd 8
 
 SECTION .text
@@ -204,3 +204,5 @@ HAAR_VERTICAL
 INIT_YMM avx2
 HAAR_HORIZONTAL
 HAAR_VERTICAL
+LEGALL53_VERTICAL_HI
+LEGALL53_VERTICAL_LO
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c 
b/libavcodec/x86/dirac_dwt_init_10bit.c
index e7e7534050..51d6eeae93 100644
--- a/libavcodec/x86/dirac_dwt_init_10bit.c
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -27,6 +27,8 @@ void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, 
int32_t *b2, int32_t *b3
 
 void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
 void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
+void ff_legall53_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
+void ff_legall53_vertical_lo_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
 
 void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int 
width_align);
 void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int 
width_align);
@@ -112,6 +114,22 @@ static void legall53_vertical_hi_sse2(int32_t *b0, int32_t 
*b1, int32_t *b2, int
 b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
 }
 
+static void legall53_vertical_lo_avx2(int32_t *b0, int32_t *b1, int32_t *b2, 
int width)
+{
+int i = width & ~7;
+ff_legall53_vertical_lo_avx2(b0, b1, b2, i);
+for(; ivertical_compose_h0 = (void*)legall53_vertical_hi_avx2;
+d->vertical_compose_l0 = (void*)legall53_vertical_lo_avx2;
+break;
 case DWT_DIRAC_HAAR0:
 d->vertical_compose = (void*)vertical_compose_haar_avx2;
 break;
-- 
2.17.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 3/6] diracdec: add 10-bit Deslauriers-Dubuc 9, 7 (9_7) vertical high-pass function

2018-07-19 Thread James Darnley
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform.
C: 84fps
SSE2: 111fps
AVX2: 115fps
---
 libavcodec/x86/dirac_dwt_10bit.asm| 38 +++
 libavcodec/x86/dirac_dwt_init_10bit.c | 16 +++
 2 files changed, 54 insertions(+)

diff --git a/libavcodec/x86/dirac_dwt_10bit.asm 
b/libavcodec/x86/dirac_dwt_10bit.asm
index c00de32bfe..681de5e1df 100644
--- a/libavcodec/x86/dirac_dwt_10bit.asm
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -25,6 +25,7 @@ SECTION_RODATA
 
 cextern pd_1
 pd_2: times 4 dd 2
+pd_8: times 4 dd 8
 
 SECTION .text
 
@@ -153,7 +154,44 @@ RET
 
 %endmacro
 
+%macro DD97_VERTICAL_HI 0
+
+cglobal dd97_vertical_hi, 6, 6, 8, b0, b1, b2, b3, b4, w ; b2[i] += (9*b1[i] + 9*b3[i] - b0[i] - b4[i] + 8) >> 4 over w dword elements; only b2 is written
+mova m7, [pd_8]        ; m7 = rounding constant 8 in each dword lane
+shl wd, 2              ; w *= 4: element count -> byte count (4-byte elements)
+add b0q, wq            ; move every row pointer to the end of its row ...
+add b1q, wq
+add b2q, wq
+add b3q, wq
+add b4q, wq
+neg wq                 ; ... and index with a negative byte offset that counts up to 0
+
+ALIGN 16
+.loop:                 ; NOTE(review): bottom-tested loop - one full vector is processed even if w == 0; presumably callers pass a non-zero multiple of the vector width - confirm
+mova m0, [b0q + wq]    ; m0 = b0 (mova is presumably the aligned move - rows must then be mmsize-aligned; confirm)
+mova m1, [b1q + wq]    ; m1 = b1
+mova m2, [b2q + wq]    ; m2 = b2 (the row being updated)
+mova m3, [b3q + wq]    ; m3 = b3
+mova m4, [b4q + wq]    ; m4 = b4
+pslld m5, m1, 3        ; m5 = 8*b1
+pslld m6, m3, 3        ; m6 = 8*b3
+paddd m5, m1           ; m5 = 9*b1
+paddd m6, m3           ; m6 = 9*b3
+psubd m5, m0           ; m5 = 9*b1 - b0
+psubd m6, m4           ; m6 = 9*b3 - b4
+paddd m5, m7           ; m5 += 8 (rounding)
+paddd m5, m6           ; m5 = 9*b1 + 9*b3 - b0 - b4 + 8
+psrad m5, 4            ; arithmetic shift: divide by 16, sign preserved
+paddd m2, m5           ; b2 += filtered neighbours
+mova [b2q + wq], m2    ; store the updated b2 vector
+add wq, mmsize         ; advance by one vector; flags feed the branch below
+jl .loop               ; continue while the offset is still negative
+RET
+
+%endmacro
+
 INIT_XMM sse2
+DD97_VERTICAL_HI
 HAAR_HORIZONTAL
 HAAR_VERTICAL
 LEGALL53_VERTICAL_HI
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c 
b/libavcodec/x86/dirac_dwt_init_10bit.c
index 88cf267d14..e7e7534050 100644
--- a/libavcodec/x86/dirac_dwt_init_10bit.c
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -23,6 +23,8 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/dirac_dwt.h"
 
+void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t 
*b3, int32_t *b4, int width);
+
 void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
 void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
 
@@ -110,6 +112,16 @@ static void legall53_vertical_hi_sse2(int32_t *b0, int32_t 
*b1, int32_t *b2, int
 b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
 }
 
+static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2,
+  int32_t *b3, int32_t *b4, int width)
+{
+int i = width & ~3;
+ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i);
+for(; ivertical_compose_h0 = (void*)dd97_vertical_hi_sse2;
+d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2;
+break;
 case DWT_DIRAC_LEGALL5_3:
 d->vertical_compose_h0 = (void*)legall53_vertical_hi_sse2;
 d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2;
-- 
2.17.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 1/6] diracdec: add 10-bit Haar SIMD functions

2018-07-19 Thread James Darnley
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform.
C:    119fps
SSE2: 204fps
AVX:  206fps
AVX2: 221fps
---
 libavcodec/dirac_dwt.c|   7 +-
 libavcodec/dirac_dwt.h|   1 +
 libavcodec/x86/Makefile   |   6 +-
 libavcodec/x86/dirac_dwt_10bit.asm| 113 +
 libavcodec/x86/dirac_dwt_init_10bit.c | 136 ++
 5 files changed, 260 insertions(+), 3 deletions(-)
 create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm
 create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c

diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c
index cc08f8865a..86bee5bb9b 100644
--- a/libavcodec/dirac_dwt.c
+++ b/libavcodec/dirac_dwt.c
@@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum 
dwt_type type,
 return AVERROR_INVALIDDATA;
 }
 
-if (ARCH_X86 && bit_depth == 8)
+#if ARCH_X86
+if (bit_depth == 8)
 ff_spatial_idwt_init_x86(d, type);
+else if (bit_depth == 10)
+ff_spatial_idwt_init_10bit_x86(d, type);
+#endif
+
 return 0;
 }
 
diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h
index 994dc21d70..1ad7b9a821 100644
--- a/libavcodec/dirac_dwt.h
+++ b/libavcodec/dirac_dwt.h
@@ -88,6 +88,7 @@ enum dwt_type {
 int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type,
  int decomposition_count, int bit_depth);
 void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type);
+void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type);
 
 void ff_spatial_idwt_slice2(DWTContext *d, int y);
 
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 2350c8bbee..590d83c167 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP)+= x86/blockdsp_init.o
 OBJS-$(CONFIG_BSWAPDSP)+= x86/bswapdsp_init.o
 OBJS-$(CONFIG_DCT) += x86/dct_init.o
 OBJS-$(CONFIG_DIRAC_DECODER)   += x86/diracdsp_init.o   \
-  x86/dirac_dwt_init.o
+  x86/dirac_dwt_init.o \
+  x86/dirac_dwt_init_10bit.o
 OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
 OBJS-$(CONFIG_FFT) += x86/fft_init.o
 OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
@@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
 X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
 X86ASM-OBJS-$(CONFIG_DCA_DECODER)  += x86/dcadsp.o x86/synth_filter.o
 X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)+= x86/diracdsp.o\
-  x86/dirac_dwt.o
+  x86/dirac_dwt.o \
+  x86/dirac_dwt_10bit.o
 X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER)+= x86/dnxhdenc.o
 X86ASM-OBJS-$(CONFIG_EXR_DECODER)  += x86/exrdsp.o
 X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
diff --git a/libavcodec/x86/dirac_dwt_10bit.asm 
b/libavcodec/x86/dirac_dwt_10bit.asm
new file mode 100644
index 00..dc3830615e
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -0,0 +1,113 @@
+;**
+;* x86 optimized discrete 10-bit wavelet transform
+;* Copyright (c) 2018 James Darnley
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;**
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pd_1
+
+SECTION .text
+
+%macro HAAR_VERTICAL 0
+
+cglobal vertical_compose_haar_10bit, 3, 3, 4, b0, b1, w ; b0[i] -= (b1[i] + pd_1) >> 1, then b1[i] += new b0[i], over w dword elements; both rows are written
+mova m2, [pd_1]        ; m2 = rounding term (pd_1 is external; presumably dwords of 1 - confirm)
+shl wd, 2              ; w *= 4: element count -> byte count (4-byte elements)
+add b0q, wq            ; point both rows at their ends ...
+add b1q, wq
+neg wq                 ; ... and walk a negative byte offset up to 0
+
+ALIGN 16
+.loop:                 ; NOTE(review): bottom-tested loop - one full vector is processed even if w == 0; presumably callers pass a non-zero multiple of the vector width - confirm
+mova m0, [b0q + wq]    ; m0 = b0
+mova m1, [b1q + wq]    ; m1 = b1
+paddd m3, m1, m2       ; m3 = b1 + rounding term
+psrad m3, 1            ; m3 = (b1 + rounding term) >> 1, arithmetic shift
+psubd m0, m3           ; b0 -= m3
+paddd m1, m0           ; b1 += updated b0 (order matters: uses the new b0)
+mova [b0q + wq], m0    ; store updated b0
+mova [b1q + wq], m1    ; store updated b1
+add wq, mmsize         ; advance by one vector; flags feed the branch below
+jl .loop               ; continue while the offset is still negative
+RET
+
+%endmacro
+
+%macro HAAR_HORIZONTAL 0
+
+cglobal horizontal_compose_haar_10bit, 3, 6, 4, b, temp_, w, 

[FFmpeg-devel] [PATCH 2/6] diracdec: add 10-bit Legall 5, 3 (5_3) SIMD functions

2018-07-19 Thread James Darnley
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform.
C: 94fps
SSE2: 118fps
AVX2: 121fps
---
 libavcodec/x86/dirac_dwt_10bit.asm| 55 +++
 libavcodec/x86/dirac_dwt_init_10bit.c | 23 +++
 2 files changed, 78 insertions(+)

diff --git a/libavcodec/x86/dirac_dwt_10bit.asm 
b/libavcodec/x86/dirac_dwt_10bit.asm
index dc3830615e..c00de32bfe 100644
--- a/libavcodec/x86/dirac_dwt_10bit.asm
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -24,6 +24,7 @@
 SECTION_RODATA
 
 cextern pd_1
+pd_2: times 4 dd 2
 
 SECTION .text
 
@@ -100,9 +101,63 @@ REP_RET
 
 %endmacro
 
+%macro LEGALL53_VERTICAL_LO 0
+
+cglobal legall53_vertical_lo, 4, 4, 4, b0, b1, b2, w ; b1[i] -= (b0[i] + b2[i] + 2) >> 2 over w dword elements; only b1 is written
+mova m3, [pd_2]        ; m3 = rounding constant 2 in each dword lane
+shl wd, 2              ; w *= 4: element count -> byte count (4-byte elements)
+add b0q, wq            ; point all three rows at their ends ...
+add b1q, wq
+add b2q, wq
+neg wq                 ; ... and walk a negative byte offset up to 0
+
+ALIGN 16
+.loop:                 ; NOTE(review): bottom-tested loop - one full vector is processed even if w == 0; presumably callers pass a non-zero multiple of the vector width - confirm
+mova m0, [b0q + wq]    ; m0 = b0
+mova m1, [b1q + wq]    ; m1 = b1 (the row being updated)
+mova m2, [b2q + wq]    ; m2 = b2
+paddd m0, m2           ; m0 = b0 + b2
+paddd m0, m3           ; m0 = b0 + b2 + 2 (rounding)
+psrad m0, 2            ; arithmetic shift: divide by 4, sign preserved
+psubd m1, m0           ; b1 -= (b0 + b2 + 2) >> 2
+mova [b1q + wq], m1    ; store the updated b1 vector
+add wq, mmsize         ; advance by one vector; flags feed the branch below
+jl .loop               ; continue while the offset is still negative
+RET
+
+%endmacro
+
+%macro LEGALL53_VERTICAL_HI 0
+
+cglobal legall53_vertical_hi, 4, 4, 4, b0, b1, b2, w ; b1[i] += (b0[i] + b2[i] + pd_1) >> 1 over w dword elements; only b1 is written
+mova m3, [pd_1]        ; m3 = rounding term (pd_1 is external; presumably dwords of 1 - confirm)
+shl wd, 2              ; w *= 4: element count -> byte count (4-byte elements)
+add b0q, wq            ; point all three rows at their ends ...
+add b1q, wq
+add b2q, wq
+neg wq                 ; ... and walk a negative byte offset up to 0
+
+ALIGN 16
+.loop:                 ; NOTE(review): bottom-tested loop - one full vector is processed even if w == 0; presumably callers pass a non-zero multiple of the vector width - confirm
+mova m0, [b0q + wq]    ; m0 = b0
+mova m1, [b1q + wq]    ; m1 = b1 (the row being updated)
+mova m2, [b2q + wq]    ; m2 = b2
+paddd m0, m2           ; m0 = b0 + b2
+paddd m0, m3           ; m0 = b0 + b2 + rounding term
+psrad m0, 1            ; arithmetic shift: divide by 2, sign preserved
+paddd m1, m0           ; b1 += (b0 + b2 + rounding term) >> 1
+mova [b1q + wq], m1    ; store the updated b1 vector
+add wq, mmsize         ; advance by one vector; flags feed the branch below
+jl .loop               ; continue while the offset is still negative
+RET
+
+%endmacro
+
 INIT_XMM sse2
 HAAR_HORIZONTAL
 HAAR_VERTICAL
+LEGALL53_VERTICAL_HI
+LEGALL53_VERTICAL_LO
 
 INIT_XMM avx
 HAAR_HORIZONTAL
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c 
b/libavcodec/x86/dirac_dwt_init_10bit.c
index 939950e3ff..88cf267d14 100644
--- a/libavcodec/x86/dirac_dwt_init_10bit.c
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -23,6 +23,9 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/dirac_dwt.h"
 
+void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
+void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int 
width);
+
 void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int 
width_align);
 void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int 
width_align);
 void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int 
width_align);
@@ -91,6 +94,22 @@ static void horizontal_compose_haar_avx2(int32_t *b, int32_t 
*tmp, int width)
 }
 }
 
+static void legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, 
int width)
+{
+int i = width & ~3;
+ff_legall53_vertical_lo_sse2(b0, b1, b2, i);
+for(; ivertical_compose_h0 = (void*)legall53_vertical_hi_sse2;
+d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2;
+break;
 case DWT_DIRAC_HAAR0:
 d->vertical_compose = (void*)vertical_compose_haar_sse2;
 break;
-- 
2.17.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


  1   2   3   4   5   6   >