Re: [FFmpeg-devel] [PATCH 3/3] avfilter: add avx2 filter_line function for bwdif
On 3/11/23 17:14, Thomas Mundt wrote: +%if mmsize == 32 +vpbroadcastd m12, DWORD clip_maxm I get a green pattern at bit depths > 8. Looks good with: vpbroadcastw m12, WORD clip_maxm +%else movd m12, DWORD clip_maxm SPLATW m12, m12, 0 +%endif Of course it should be a word broadcast! But why doesn't my checkasm test catch it? bwdif->filter_line = ff_bwdif_filter_line_sse2; if (EXTERNAL_SSSE3(cpu_flags)) bwdif->filter_line = ff_bwdif_filter_line_ssse3; +if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags)) +bwdif->filter_line = ff_bwdif_filter_line_avx2; } else if (bit_depth <= 12) { if (EXTERNAL_SSE2(cpu_flags)) bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2; if (EXTERNAL_SSSE3(cpu_flags)) bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3; +if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags)) +bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2; } } I was intending to only modify/write the 8-bit function so this is a mistake. Thanks. I'll be back with a version 2. [re-sending to list] ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 2/3] checkasm: add test for bwdif
On 3/11/23 17:18, Thomas Mundt wrote: I'm not familiar with checkasm tests, but isn't this one limited to a bit depth of 8? Yes, that was the idea because I was only intending to modify the 8-bit function, for now. The function pointer is the same for all depths so you need to initialize it with a different depth. Judging from your other email I might need to write them anyway. [re-sending to list] ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] tests: actually test yadif's 10 and 16-bit functions
On 2/20/23 14:06, James Darnley wrote: On 2/20/23 13:49, Nicolas George wrote: James Darnley (12023-02-20): snip Moving scale before yadif is right, but format= is redundant with -pix_fmt. Regards, So the patch should just be moving the scale filter first? Sure. Any other comments? I will wait a short while, then make that change and push. I forgot about this. Now that the repo seems to be working again after the HW failure, I will push on Monday. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] libavfilter/x86/vf_convolution.asm- fix missing delimiter for AVX512ICL sobel
On 2/24/23 04:00, Felix LeClair wrote: Fixes: Compilation of Sobel with AVX512ICL Caused: Comment left without delimiter in AVX512ICL version of SOBEL Testing: Confirmed working on AVX512 Alderlake (AKA SPR without AMX) diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm index 9ac9ef5d73..8b85897819 100644 --- a/libavfilter/x86/vf_convolution.asm +++ b/libavfilter/x86/vf_convolution.asm @@ -232,8 +232,8 @@ cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, psubd m4, m5 vpermb m3, m6, m3 mova m5, m4 -vpdpbusd m4, m2, [sobel_mulA] {1to16} -vpdpbusd m5, m3, [sobel_mulB] {1to16} +vpdpbusd m4, m2, [sobel_mulA]; {1to16} +vpdpbusd m5, m3, [sobel_mulB]; {1to16} cvtdq2ps m4, m4 mulps m4, m4 Fix compilation with what? I'm not familiar with the sobel algorithm/function so I can't say whether the code is correct. However those constants are only dword sized and that is how you do a memory broadcast with avx512(icl). Furthermore testing your change on an icl system results in a failure in checkasm. So what program and what version fails to assemble that? [re-sending to list] ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 3/3] avfilter: add avx2 filter_line function for bwdif
2.24x faster (1925±1.3 vs. 859±2.2 decicycles) compared with ssse3 --- libavfilter/x86/vf_bwdif.asm| 29 - libavfilter/x86/vf_bwdif_init.c | 12 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/libavfilter/x86/vf_bwdif.asm b/libavfilter/x86/vf_bwdif.asm index 0b453da53b..5cc61435fd 100644 --- a/libavfilter/x86/vf_bwdif.asm +++ b/libavfilter/x86/vf_bwdif.asm @@ -26,18 +26,22 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 -pw_coefhf: times 4 dw 1016, 5570 -pw_coefhf1: times 8 dw -3801 -pw_coefsp: times 4 dw 5077, -981 -pw_splfdif: times 4 dw -768, 768 +pw_coefhf: times 8 dw 1016, 5570 +pw_coefhf1: times 16 dw -3801 +pw_coefsp: times 8 dw 5077, -981 +pw_splfdif: times 8 dw -768, 768 SECTION .text %macro LOAD8 2 +%if mmsize == 32 +pmovzxbw %1, %2 +%else movh %1, %2 punpcklbw%1, m7 +%endif %endmacro %macro LOAD12 2 @@ -45,8 +49,14 @@ SECTION .text %endmacro %macro DISP8 0 +%if mmsize == 32 +vextracti128 xm1,m2, 1 +packuswb xm2, xm1 +movu [dstq], xm2 +%else packuswb m2, m2 movh [dstq], m2 +%endif %endmacro %macro DISP12 0 @@ -244,8 +254,12 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \ prefs, mrefs, prefs2, mrefs2, \ prefs3, mrefs3, prefs4, \ mrefs4, parity, clip_max +%if mmsize == 32 +vpbroadcastd m12, DWORD clip_maxm +%else movdm12, DWORD clip_maxm SPLATW m12, m12, 0 +%endif %else cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \ prefs, mrefs, prefs2, mrefs2, \ @@ -264,3 +278,8 @@ INIT_XMM ssse3 BWDIF INIT_XMM sse2 BWDIF + +%if HAVE_AVX2_EXTERNAL && ARCH_X86_64 +INIT_YMM avx2 +BWDIF +%endif diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c index ba7bc40c3d..f833318c10 100644 --- a/libavfilter/x86/vf_bwdif_init.c +++ b/libavfilter/x86/vf_bwdif_init.c @@ -32,6 +32,10 @@ void ff_bwdif_filter_line_ssse3(void *dst, void *prev, void *cur, void *next, int w, int prefs, int mrefs, int prefs2, int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs4, int 
parity, int clip_max); +void ff_bwdif_filter_line_avx2(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void *next, int w, int prefs, int mrefs, int prefs2, @@ -41,6 +45,10 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne int w, int prefs, int mrefs, int prefs2, int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs4, int parity, int clip_max); +void ff_bwdif_filter_line_12bit_avx2(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth) { @@ -51,10 +59,14 @@ av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth) bwdif->filter_line = ff_bwdif_filter_line_sse2; if (EXTERNAL_SSSE3(cpu_flags)) bwdif->filter_line = ff_bwdif_filter_line_ssse3; +if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags)) +bwdif->filter_line = ff_bwdif_filter_line_avx2; } else if (bit_depth <= 12) { if (EXTERNAL_SSE2(cpu_flags)) bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2; if (EXTERNAL_SSSE3(cpu_flags)) bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3; +if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags)) +bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2; } } -- 2.39.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/3] checkasm: add test for bwdif
--- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/vf_bwdif.c | 70 +++ tests/fate/checkasm.mak | 1 + 5 files changed, 76 insertions(+) create mode 100644 tests/checkasm/vf_bwdif.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index a6f06c7007..b6a43f181f 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -40,6 +40,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) # libavfilter tests AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o +AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index e96d84a7da..5e729cf0e0 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -179,6 +179,9 @@ static const struct { #if CONFIG_BLEND_FILTER { "vf_blend", checkasm_check_blend }, #endif +#if CONFIG_BWDIF_FILTER +{ "vf_bwdif", checkasm_check_vf_bwdif }, +#endif #if CONFIG_COLORSPACE_FILTER { "vf_colorspace", checkasm_check_colorspace }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 8744a81218..e9e73c6fa0 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -82,6 +82,7 @@ void checkasm_check_utvideodsp(void); void checkasm_check_v210dec(void); void checkasm_check_v210enc(void); void checkasm_check_vc1dsp(void); +void checkasm_check_vf_bwdif(void); void checkasm_check_vf_eq(void); void checkasm_check_vf_gblur(void); void checkasm_check_vf_hflip(void); diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c new file mode 100644 index 00..e27f9b7494 --- /dev/null +++ b/tests/checkasm/vf_bwdif.c @@ -0,0 +1,70 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include "checkasm.h" +#include "libavcodec/internal.h" +#include "libavfilter/bwdif.h" + +#define WIDTH 256 + +#define randomize_buffers(buf0, buf1, mask, count) \ +for (size_t i; i < count; i++) \ +buf0[i] = buf1[i] = rnd() & mask + +void checkasm_check_vf_bwdif(void) +{ +BWDIFContext ctx_8, ctx_10, ctx_16; + +ff_bwdif_init_filter_line(&ctx_8, 8); +ff_bwdif_init_filter_line(&ctx_10, 10); +ff_bwdif_init_filter_line(&ctx_16, 16); + +if (check_func(ctx_8.filter_line, "bwdif8")) { +uint8_t prev0[9*WIDTH], prev1[9*WIDTH]; +uint8_t next0[9*WIDTH], next1[9*WIDTH]; +uint8_t cur0[9*WIDTH], cur1[9*WIDTH]; +uint8_t dst0[WIDTH], dst1[WIDTH]; + +declare_func(void, void *dst, void *prev, void *cur, void *next, +int w, int prefs, int mrefs, int prefs2, int mrefs2, +int prefs3, int mrefs3, int prefs4, int mrefs4, +int parity, int clip_max); + +randomize_buffers(prev0, prev1, 0xff, 9*WIDTH); +randomize_buffers(next0, next1, 0xff, 9*WIDTH); +randomize_buffers(cur0, cur1, 0xff, 9*WIDTH); + +call_ref(dst0, prev0 + 4*WIDTH, cur0 + 4*WIDTH, next0 + 4*WIDTH, WIDTH, +WIDTH, -WIDTH, 2*WIDTH, -2*WIDTH, 3*WIDTH, -3*WIDTH, 4*WIDTH, -4*WIDTH, +0, 0xff); +call_new(dst1, prev1 + 4*WIDTH, cur1 + 4*WIDTH, next1 + 4*WIDTH, WIDTH, +WIDTH, -WIDTH, 2*WIDTH, -2*WIDTH, 3*WIDTH, 
-3*WIDTH, 4*WIDTH, -4*WIDTH, +0, 0xff); + +if (memcmp(dst0, dst1, WIDTH) +|| memcmp(prev0, prev1, sizeof prev0) +|| memcmp(next0, next1, sizeof next0) +|| memcmp(cur0, cur1, sizeof cur0)) +fail(); +bench_new(dst1, prev1 + 4*WIDTH, cur1 + 4*WIDTH, next1 + 4*WIDTH, WIDTH, +WIDTH, -WIDTH, 2*WIDTH, -2*WIDTH, 3*WIDTH, -3*WIDTH, 4*WIDTH, -4*WIDTH, +0, 0xff); +} +report("bwdif8"); +} diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index a4e95541f5..6a7d4a1226 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -37,6 +37,7 @@ FATE_CHECKASM
[FFmpeg-devel] [PATCH 1/3] avfilter: move bwdif's filter_line init into a dedicated function
--- libavfilter/bwdif.h | 3 ++- libavfilter/vf_bwdif.c | 13 + libavfilter/x86/vf_bwdif_init.c | 4 +--- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h index 889ff772ed..5749345f78 100644 --- a/libavfilter/bwdif.h +++ b/libavfilter/bwdif.h @@ -37,6 +37,7 @@ typedef struct BWDIFContext { int parity, int clip_max, int spat); } BWDIFContext; -void ff_bwdif_init_x86(BWDIFContext *bwdif); +void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth); +void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth); #endif /* AVFILTER_BWDIF_H */ diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c index 65c617ebb3..34e8c5e234 100644 --- a/libavfilter/vf_bwdif.c +++ b/libavfilter/vf_bwdif.c @@ -340,7 +340,14 @@ static int config_props(AVFilterLink *link) yadif->csp = av_pix_fmt_desc_get(link->format); yadif->filter = filter; -if (yadif->csp->comp[0].depth > 8) { +ff_bwdif_init_filter_line(s, yadif->csp->comp[0].depth); + +return 0; +} + +av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) +{ +if (bit_depth > 8) { s->filter_intra = filter_intra_16bit; s->filter_line = filter_line_c_16bit; s->filter_edge = filter_edge_16bit; @@ -351,10 +358,8 @@ static int config_props(AVFilterLink *link) } #if ARCH_X86 -ff_bwdif_init_x86(s); +ff_bwdif_init_x86(s, bit_depth); #endif - -return 0; } diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c index e24e5cd9b1..ba7bc40c3d 100644 --- a/libavfilter/x86/vf_bwdif_init.c +++ b/libavfilter/x86/vf_bwdif_init.c @@ -42,11 +42,9 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs4, int parity, int clip_max); -av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif) +av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth) { -YADIFContext *yadif = &bwdif->yadif; int cpu_flags = av_get_cpu_flags(); -int bit_depth = (!yadif->csp) ? 
8 : yadif->csp->comp[0].depth; if (bit_depth <= 8) { if (EXTERNAL_SSE2(cpu_flags)) -- 2.39.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] tests: actually test yadif's 10 and 16-bit functions
On 2/20/23 13:49, Nicolas George wrote: James Darnley (12023-02-20): -fate-filter-yadif10: CMD = framecrc -flags bitexact -idct simple -i $(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt yuv420p10le -frames:v 30 -vf yadif=0,scale -fate-filter-yadif16: CMD = framecrc -flags bitexact -idct simple -i $(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt yuv420p16le -frames:v 30 -vf yadif=0,scale +fate-filter-yadif10: CMD = framecrc -flags bitexact -idct simple -i $(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt yuv420p10le -frames:v 30 -vf scale,format=yuv420p10le,yadif=0 +fate-filter-yadif16: CMD = framecrc -flags bitexact -idct simple -i $(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt yuv420p16le -frames:v 30 -vf scale,format=yuv420p16le,yadif=0 Moving scale before yadif is right, but format= is redundant with -pix_fmt. Regards, So the patch should just be moving the scale filter first? Sure. Any other comments? I will wait a short while, then make that change and push. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function
On 2/10/23 14:06, James Darnley wrote: snip This patch set is broken. The checkasm test is incomplete. This avx2 function has some bug that only manifests when the strides (prefs, mrefs) have opposite signs (one positive and one negative). That situation is what happens with real usage. I fixed my checkasm test which also shows it. Consider this patch set retracted until I can fix it. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] tests: actually test yadif's 10 and 16-bit functions
--- tests/fate/filter-video.mak | 4 +-- tests/ref/fate/filter-yadif10 | 60 +-- tests/ref/fate/filter-yadif16 | 60 +-- 3 files changed, 62 insertions(+), 62 deletions(-) diff --git a/tests/fate/filter-video.mak b/tests/fate/filter-video.mak index 63873a7a07..65965d8518 100644 --- a/tests/fate/filter-video.mak +++ b/tests/fate/filter-video.mak @@ -16,8 +16,8 @@ fate-filter-yadif-mode0: CMD = framecrc -flags bitexact -idct simple -i $(TARGET fate-filter-yadif-mode1: CMD = framecrc -flags bitexact -idct simple -i $(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -frames:v 59 -vf yadif=1 FATE_YADIF-$(call FILTERDEMDEC, YADIF SCALE, MPEGTS, MPEG2VIDEO) += fate-filter-yadif10 fate-filter-yadif16 -fate-filter-yadif10: CMD = framecrc -flags bitexact -idct simple -i $(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt yuv420p10le -frames:v 30 -vf yadif=0,scale -fate-filter-yadif16: CMD = framecrc -flags bitexact -idct simple -i $(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt yuv420p16le -frames:v 30 -vf yadif=0,scale +fate-filter-yadif10: CMD = framecrc -flags bitexact -idct simple -i $(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt yuv420p10le -frames:v 30 -vf scale,format=yuv420p10le,yadif=0 +fate-filter-yadif16: CMD = framecrc -flags bitexact -idct simple -i $(TARGET_SAMPLES)/mpeg2/mpeg2_field_encoding.ts -flags bitexact -pix_fmt yuv420p16le -frames:v 30 -vf scale,format=yuv420p16le,yadif=0 FATE_FILTER_SAMPLES-yes += $(FATE_YADIF-yes) diff --git a/tests/ref/fate/filter-yadif10 b/tests/ref/fate/filter-yadif10 index 28e799fc1f..1a8063fee9 100644 --- a/tests/ref/fate/filter-yadif10 +++ b/tests/ref/fate/filter-yadif10 @@ -3,33 +3,33 @@ #codec_id 0: rawvideo #dimensions 0: 720x576 #sar 0: 16/15 -0, 9, 9,1, 1244160, 0xe0c2231b -0, 10, 10,1, 1244160, 0xdc7caa43 -0, 11, 11,1, 1244160, 0x52c4dfbf -0, 12, 12,1, 1244160, 0x7c577f07 -0, 13, 13,1, 1244160, 0x5b6ad7ce -0, 14, 14,1, 1244160, 0x6f15ce76 -0, 15, 15,1, 
1244160, 0xf120034a -0, 16, 16,1, 1244160, 0x9c65ba64 -0, 17, 17,1, 1244160, 0x883b237e -0, 18, 18,1, 1244160, 0xb8292e0d -0, 19, 19,1, 1244160, 0xbc392721 -0, 20, 20,1, 1244160, 0x7cd82ec9 -0, 21, 21,1, 1244160, 0x167325eb -0, 22, 22,1, 1244160, 0x49bafa73 -0, 23, 23,1, 1244160, 0xe1ff6dbf -0, 24, 24,1, 1244160, 0x85f710b6 -0, 25, 25,1, 1244160, 0xd1fd4cdb -0, 26, 26,1, 1244160, 0xafee03c5 -0, 27, 27,1, 1244160, 0x566be070 -0, 28, 28,1, 1244160, 0xb6abbd01 -0, 29, 29,1, 1244160, 0xa98f38fd -0, 30, 30,1, 1244160, 0x00f4736b -0, 31, 31,1, 1244160, 0x6b0f9dd2 -0, 32, 32,1, 1244160, 0x15810b92 -0, 33, 33,1, 1244160, 0x0b516465 -0, 34, 34,1, 1244160, 0x927d15e6 -0, 35, 35,1, 1244160, 0xd102f2bf -0, 36, 36,1, 1244160, 0xdd8b3b20 -0, 37, 37,1, 1244160, 0x229ac529 -0, 38, 38,1, 1244160, 0xf844e0a2 +0, 9, 9,1, 1244160, 0x67910b3d +0, 10, 10,1, 1244160, 0xdbb80927 +0, 11, 11,1, 1244160, 0xd5d4f27a +0, 12, 12,1, 1244160, 0xde270630 +0, 13, 13,1, 1244160, 0xe57833cc +0, 14, 14,1, 1244160, 0xc806eabd +0, 15, 15,1, 1244160, 0xe041958a +0, 16, 16,1, 1244160, 0x0007fdc7 +0, 17, 17,1, 1244160, 0xed25afda +0, 18, 18,1, 1244160, 0x43f8e068 +0, 19, 19,1, 1244160, 0xd95b763a +0, 20, 20,1, 1244160, 0xf99cacdb +0, 21, 21,1, 1244160, 0x3c33ec50 +0, 22, 22,1, 1244160, 0xf5260151 +0, 23, 23,1, 1244160, 0x88e9f2e9 +0, 24, 24,1, 1244160, 0x104cfe20 +0, 25, 25,1, 1244160, 0x804d6a33 +0, 26, 26,1, 1244160, 0x8c668008 +0, 27, 27,1, 1244160, 0x63cf270a +0, 28, 28,1, 1244160, 0xc526e89a +0, 29, 29,1, 1244160, 0xe318e4d4 +0, 30, 30,1, 1244160, 0x7c6b63a3 +0, 31, 31,1, 1244160, 0x40deffd
[FFmpeg-devel] [PATCH 1/3] avfilter: move yadif's filter_line init into a dedicated function
--- libavfilter/vf_yadif.c | 13 + libavfilter/x86/vf_yadif_init.c | 4 +--- libavfilter/yadif.h | 3 ++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c index afa4d1d53d..1f9434f961 100644 --- a/libavfilter/vf_yadif.c +++ b/libavfilter/vf_yadif.c @@ -303,7 +303,14 @@ static int config_output(AVFilterLink *outlink) s->csp = av_pix_fmt_desc_get(outlink->format); s->filter = filter; -if (s->csp->comp[0].depth > 8) { +ff_yadif_init_filter_line(s, s->csp->comp[0].depth); + +return 0; +} + +av_cold void ff_yadif_init_filter_line(YADIFContext *s, int bit_depth) +{ +if (bit_depth > 8) { s->filter_line = filter_line_c_16bit; s->filter_edges = filter_edges_16bit; } else { @@ -312,10 +319,8 @@ static int config_output(AVFilterLink *outlink) } #if ARCH_X86 -ff_yadif_init_x86(s); +ff_yadif_init_x86(s, bit_depth); #endif - -return 0; } diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c index 257c3f9199..d648f0f835 100644 --- a/libavfilter/x86/vf_yadif_init.c +++ b/libavfilter/x86/vf_yadif_init.c @@ -47,11 +47,9 @@ void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur, void *next, int w, int prefs, int mrefs, int parity, int mode); -av_cold void ff_yadif_init_x86(YADIFContext *yadif) +av_cold void ff_yadif_init_x86(YADIFContext *yadif, int bit_depth) { int cpu_flags = av_get_cpu_flags(); -int bit_depth = (!yadif->csp) ? 
8 - : yadif->csp->comp[0].depth; if (bit_depth >= 15) { if (EXTERNAL_SSE2(cpu_flags)) diff --git a/libavfilter/yadif.h b/libavfilter/yadif.h index c928911b35..5d8309b403 100644 --- a/libavfilter/yadif.h +++ b/libavfilter/yadif.h @@ -86,7 +86,8 @@ typedef struct YADIFContext { int current_field; ///< YADIFCurrentField } YADIFContext; -void ff_yadif_init_x86(YADIFContext *yadif); +void ff_yadif_init_filter_line(YADIFContext *s, int bit_depth); +void ff_yadif_init_x86(YADIFContext *yadif, int bit_depth); int ff_yadif_filter_frame(AVFilterLink *link, AVFrame *frame); -- 2.39.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function
Zen 2 (Ryzen 7 3700X): 1.73x faster (3603±586.3 vs. 2082±317.1 decicycles) compared with ssse3 Using an SD y4m file speed increases from ~ 3600 fps to ~4700. --- libavfilter/x86/vf_yadif.asm| 83 +++-- libavfilter/x86/vf_yadif_init.c | 4 ++ 2 files changed, 62 insertions(+), 25 deletions(-) diff --git a/libavfilter/x86/vf_yadif.asm b/libavfilter/x86/vf_yadif.asm index 809cebdd3f..571febfca3 100644 --- a/libavfilter/x86/vf_yadif.asm +++ b/libavfilter/x86/vf_yadif.asm @@ -25,11 +25,30 @@ SECTION_RODATA -pb_1: times 16 db 1 -pw_1: times 8 dw 1 +pb_1: times 32 db 1 +pw_1: times 16 dw 1 SECTION .text +%unmacro RSHIFT 2 + +%macro RSHIFT 2 +%if mmsize == 32 +vextracti128 xm7, %1, 1 +palignr xmm %+ %1, xm7, xmm %+ %1, 2 +%else +psrldq %1, %2 +%endif +%endmacro + +%macro UNPACK 1 +%if mmsize == 32 +pmovzxbw %1, xmm %+ %1 +%else +punpcklbw %1, m7 +%endif +%endmacro + %macro CHECK 2 movu m2, [curq+t1+%1] movu m3, [curq+t0+%2] @@ -40,7 +59,7 @@ SECTION .text pand m4, [pb_1] psubusb m5, m4 RSHIFTm5, 1 -punpcklbw m5, m7 +UNPACKm5 mova m4, m2 psubusb m2, m3 psubusb m3, m4 @@ -49,9 +68,9 @@ SECTION .text mova m4, m2 RSHIFTm3, 1 RSHIFTm4, 2 -punpcklbw m2, m7 -punpcklbw m3, m7 -punpcklbw m4, m7 +UNPACKm2 +UNPACKm3 +UNPACKm4 paddw m2, m3 paddw m2, m4 %endmacro @@ -81,13 +100,19 @@ SECTION .text %endmacro %macro LOAD 2 -movh %1, %2 -punpcklbw %1, m7 +%if mmsize == 32 +pmovzxbw %1, %2 +%else +movh %1, %2 +punpcklbw %1, m7 +%endif %endmacro %macro FILTER 3 .loop%1: -pxor m7, m7 +%if mmsize != 32 +pxor m7, m7 +%endif LOAD m0, [curq+t1] LOAD m1, [curq+t0] LOAD m2, [%2] @@ -95,9 +120,9 @@ SECTION .text mova m4, m3 paddwm3, m2 psrawm3, 1 -mova [rsp+ 0], m0 -mova [rsp+16], m3 -mova [rsp+32], m1 +mova [rsp+0*mmsize], m0 +mova [rsp+1*mmsize], m3 +mova [rsp+2*mmsize], m1 psubwm2, m4 ABS1 m2, m4 LOAD m3, [prevq+t1] @@ -119,7 +144,7 @@ SECTION .text paddwm3, m4 psrlwm3, 1 pmaxsw m2, m3 -mova [rsp+48], m2 +mova [rsp+3*mmsize], m2 paddwm1, m0 paddwm0, m0 @@ -134,9 +159,9 @@ SECTION .text psubusb m3, 
m4 pmaxub m2, m3 mova m3, m2 -psrldq m3, 2 -punpcklbwm2, m7 -punpcklbwm3, m7 +RSHIFT m3, 2 +UNPACK m2 +UNPACK m3 paddwm0, m2 paddwm0, m3 psubwm0, [pw_1] @@ -150,7 +175,7 @@ SECTION .text CHECK 1, -3 CHECK2 -mova m6, [rsp+48] +mova m6, [rsp+3*mmsize] cmp DWORD r8m, 2 jge .end%1 LOAD m2, [%2+t1*2] @@ -161,9 +186,9 @@ SECTION .text paddwm3, m5 psrlwm2, 1 psrlwm3, 1 -mova m4, [rsp+ 0] -mova m5, [rsp+16] -mova m7, [rsp+32] +mova m4, [rsp+0*mmsize] +mova m5, [rsp+1*mmsize] +mova m7, [rsp+2*mmsize] psubwm2, m4 psubwm3, m7 mova m0, m5 @@ -182,15 +207,21 @@ SECTION .text pmaxsw m6, m4 .end%1: -mova m2, [rsp+16] +mova m2, [rsp+1*mmsize] mova m3, m2 psubwm2, m6 paddwm3, m6 pmaxsw m1, m2 pminsw m1, m3 -packuswb m1, m1 -movh [dstq], m1 +%if mmsize == 32 +vextracti128 xm4, ym1, 1 +packuswb xm1, xm4 +movu [dstq], xm1 +%else +packuswb m1, m1 +movh [dstq], m1 +%endif adddstq, mmsize/2 add prevq, mmsize/2 addcurq, mmsize/2 @@ -201,10 +232,10 @@ SECTION .text %macro YADIF 0 %if ARCH_X86_32 -cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \ +cglobal yadif_filter_line, 4, 6, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \ mrefs, parity, mode %else -cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \ +cglobal yadif_filter_line, 4, 7, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \ mrefs, parity, mode %endif %if ARCH_X86_32 @@ -233,3 +264,5 @@ INIT_XMM ssse3 YADIF INIT_XMM sse2 YADIF +INIT_YMM avx2 +YADIF diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c index d648f0f835..48858dc295 100644 --- a/libavfilter/x86/vf_yadif_init.c +++ b/libavfilter/x86/vf_yadif_init.c @@ -29,6 +29,8 @@ void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur, void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cu
[FFmpeg-devel] [PATCH 2/3] checkasm: add test for yadif
--- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/vf_yadif.c | 62 +++ 4 files changed, 67 insertions(+) create mode 100644 tests/checkasm/vf_yadif.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index a6f06c7007..fc65bdc77d 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -47,6 +47,7 @@ AVFILTEROBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER) += vf_threshold.o AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER)+= vf_nlmeans.o AVFILTEROBJS-$(CONFIG_SOBEL_FILTER) += vf_convolution.o +AVFILTEROBJS-$(CONFIG_YADIF_FILTER) += vf_yadif.o CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index e96d84a7da..2bb72cf839 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -200,6 +200,9 @@ static const struct { #if CONFIG_SOBEL_FILTER { "vf_sobel", checkasm_check_vf_sobel }, #endif +#if CONFIG_YADIF_FILTER +{ "vf_yadif", checkasm_check_vf_yadif }, +#endif #endif #if CONFIG_SWSCALE { "sw_gbrp", checkasm_check_sw_gbrp }, diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 8744a81218..0b9a83b5b5 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -87,6 +87,7 @@ void checkasm_check_vf_gblur(void); void checkasm_check_vf_hflip(void); void checkasm_check_vf_threshold(void); void checkasm_check_vf_sobel(void); +void checkasm_check_vf_yadif(void); void checkasm_check_vp8dsp(void); void checkasm_check_vp9dsp(void); void checkasm_check_videodsp(void); diff --git a/tests/checkasm/vf_yadif.c b/tests/checkasm/vf_yadif.c new file mode 100644 index 00..cb58519c23 --- /dev/null +++ b/tests/checkasm/vf_yadif.c @@ -0,0 +1,62 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include "checkasm.h" +#include "libavcodec/internal.h" +#include "libavfilter/yadif.h" + +#define WIDTH 256 + +#define randomize_buffers(buf0, buf1, mask, count) \ +for (size_t i; i < count; i++) \ +buf0[i] = buf1[i] = rnd() & mask + +void checkasm_check_vf_yadif(void) +{ +YADIFContext ctx_8, ctx_10, ctx_16; + +ff_yadif_init_filter_line(&ctx_8, 8); +ff_yadif_init_filter_line(&ctx_10, 10); +ff_yadif_init_filter_line(&ctx_16, 16); + +if (check_func(ctx_8.filter_line, "yadif8")) { +uint8_t prev0[5*WIDTH + STRIDE_ALIGN], prev1[5*WIDTH + STRIDE_ALIGN]; +uint8_t next0[5*WIDTH + STRIDE_ALIGN], next1[5*WIDTH + STRIDE_ALIGN]; +uint8_t cur0[5*WIDTH + STRIDE_ALIGN], cur1[5*WIDTH + STRIDE_ALIGN]; +uint8_t dst0[WIDTH + STRIDE_ALIGN], dst1[WIDTH + STRIDE_ALIGN]; + +declare_func(void, void *dst, void *prev, void *cur, void *next, +int w, int prefs, int mrefs, int parity, int mode); + +randomize_buffers(prev0, prev1, 0xff, 5*WIDTH + STRIDE_ALIGN); +randomize_buffers(next0, next1, 0xff, 5*WIDTH + STRIDE_ALIGN); +randomize_buffers(cur0, cur1, 0xff, 5*WIDTH + STRIDE_ALIGN); + +call_ref(dst0, prev0, cur0, next0, WIDTH, WIDTH, WIDTH, 0, 1); +call_new(dst1, prev1, cur1, next1, WIDTH, WIDTH, WIDTH, 0, 1); + +if (memcmp(dst0, dst1, WIDTH) +|| memcmp(prev0, prev1, 
sizeof prev0) +|| memcmp(next0, next1, sizeof next0) +|| memcmp(cur0, cur1, sizeof cur0)) +fail(); +bench_new(dst1, prev1, cur1, next1, WIDTH, WIDTH, WIDTH, 0, 1); +} +report("yadif8"); +} -- 2.39.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [RFC PATCH 2/2] avcodec/x86: add avx512icl function for v210dec
Ice Lake (Xeon Silver 4316): 2.01x faster (1147±36.8 vs. 571±38.2 decicycles) compared with avx2 --- I think I can merge this with the existing macro without it being too ugly. That might allow a plain avx512 version too but I can't say if that would be any faster. libavcodec/x86/v210-init.c | 10 ++- libavcodec/x86/v210.asm| 60 +- tests/checkasm/v210dec.c | 12 3 files changed, 74 insertions(+), 8 deletions(-) diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c index 5db1fef98c..8b3677b8aa 100644 --- a/libavcodec/x86/v210-init.c +++ b/libavcodec/x86/v210-init.c @@ -17,7 +17,7 @@ */ #include "libavutil/attributes.h" -#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" #include "libavcodec/v210dec.h" extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); @@ -28,6 +28,8 @@ extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_avx512icl(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); + av_cold void ff_v210_x86_init(V210DecContext *s) { #if HAVE_X86ASM @@ -42,6 +44,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s) if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2) s->unpack_frame = ff_v210_planar_unpack_aligned_avx2; + +if (EXTERNAL_AVX512ICL(cpu_flags)) +s->unpack_frame = ff_v210_planar_unpack_avx512icl; } else { if (cpu_flags & AV_CPU_FLAG_SSSE3) @@ -52,6 +57,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s) if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2) s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2; + +if (EXTERNAL_AVX512ICL(cpu_flags)) +s->unpack_frame = ff_v210_planar_unpack_avx512icl; } #endif } diff --git 
a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm index 600a4ddc5f..f247737ed0 100644 --- a/libavcodec/x86/v210.asm +++ b/libavcodec/x86/v210.asm @@ -22,7 +22,21 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 + +perm_y: +db 0,1, 4,5, 6,7, 8,9, 12,13, 14,15, 16,17, 20,21 +db 22,23, 24,25, 28,29, 30,31, 32,33, 36,37, 38,39, 40,41 +db 44,45, 46,47, 48,49, 52,53, 54,55, 56,57, 60,61, 62,63 +times 16 db 0xff ; align to 64 + +perm_uv: +db 0,1, 4,5, 10,11, 16,17, 20,21, 26,27, 32,33, 36,37 +db 42,43, 48,49, 52,53, 58,59 +times 8 db 0xff ; align to 32 +db 2,3, 8,9, 12,13, 18,19, 24,25, 28,29, 34,35, 40,41 +db 44,45, 50,51, 56,57, 60,61 +times 8 db 0xff ; align to 32 ; for AVX2 version only v210_luma_permute: dd 0,1,2,4,5,6,7,7 ; 32-byte alignment required @@ -34,6 +48,9 @@ v210_mult: dw 64,4,64,4,64,4,64,4 v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1 v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1 +shift: times 4 dw 6, 2 +kmask: dw 0x, 0x + SECTION .text %macro v210_planar_unpack 1 @@ -127,3 +144,44 @@ v210_planar_unpack aligned INIT_YMM avx2 v210_planar_unpack aligned %endif + +%if HAVE_AVX512ICL_EXTERNAL + +INIT_ZMM avx512icl + +cglobal v210_planar_unpack, 5, 5, 6, src, y, u, v, w +movsxdifnidn wq, wd +leayq, [yq+2*wq] +adduq, wq +addvq, wq +negwq + +kmovw k1, [kmask] ; odd dword mask +kmovw k2, [kmask+2] ; even dword mask + +VBROADCASTI128 m0, [shift] +mova m1, [perm_y] +mova m2, [perm_uv] + +.loop: +movum3, [srcq] +vpsllvw m4, m3, m0 +pslld m5, m3, 12 +psrlw m4, 6 +psrld m5, 22 + +vpblendmd m3{k1}, m4, m5 +vpermbm3, m1, m3 ; could use vpcompressw +movu [yq+2*wq], m3 + +vpblendmd m5{k2}, m4, m5 +vpermbm5, m2, m5 +movu [uq+wq], ym5 +vextracti32x8 [vq+wq], zm5, 1 + +add srcq, mmsize +add wq, (mmsize*3)/8 +jl .loop +RET + +%endif diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c index 6aef519cc5..93993bae71 100644 --- a/tests/checkasm/v210dec.c +++ b/tests/checkasm/v210dec.c @@ -54,12 +54,12 @@ void 
checkasm_check_v210dec(void) if (check_func(h.unpack_frame, "v210_unpack")) { uint32_t src0[NUM_SAMPLES/3]; uint32_t src1[NUM_SAMPLES/3]; -uint16_t y0[NUM_SAMPLES/2]; -uint16_t y1[NUM_SAMPLES/2]; -uint16_t u0[NUM_SAMPLES/4]; -uint16_t u1[NUM_SAMPLES/4]; -uint16_t v0[NUM_SAMPLES/4]; -uint16_t v1[NUM_SAMPLES/4]; +uint16_t y0[NUM_SAMPLES/2 + 15]; +uint16_t y1[NUM_SAMPLES/2 + 15]; +uint16_t u0[NUM_SAMPLES/4 + 7]; +
[FFmpeg-devel] [PATCH 1/2] avcodec/x86/v210: add some comments to the improved avx2 function
--- libavcodec/x86/v210.asm | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm index 3b9e0761df..600a4ddc5f 100644 --- a/libavcodec/x86/v210.asm +++ b/libavcodec/x86/v210.asm @@ -65,18 +65,18 @@ cglobal v210_planar_unpack_%1, 5, 5, 6 + 2 * cpuflag(avx2), src, y, u, v, w mova m0, [srcq] %endif -pmullw m1, m0, m3 -pslld m0, 12 -psrlw m1, 6 ; yB yA u5 v4 y8 y7 v3 u3 y5 y4 u2 v1 y2 y1 v0 u0 -psrld m0, 22 ; 00 v5 00 y9 00 u4 00 y6 00 v2 00 y3 00 u1 00 y0 +pmullw m1, m0, m3 ; shifts the 1st and 3rd sample of each dword into the high 10 bits of each word +pslld m0, 12 ; shifts the 2nd sample of each dword into the high 10 bits of each dword +psrlw m1, 6 ; shifts the 1st and 3rd samples back into the low 10 bits +psrld m0, 22 ; shifts the 2nd sample back into the low 10 bits of each dword %if cpuflag(avx2) -vpblendd m2, m1, m0, 0x55 ; yB yA 00 y9 y8 y7 00 y6 y5 y4 00 y3 y2 y1 00 y0 +vpblendd m2, m1, m0, 0x55 ; merge the odd dwords from m0 and even from m1 ; yB yA 00 y9 y8 y7 00 y6 y5 y4 00 y3 y2 y1 00 y0 pshufb m2, m4 ; 00 00 yB yA y9 y8 y7 y6 00 00 y5 y4 y3 y2 y1 y0 vpermd m2, m6, m2 ; 00 00 00 00 yB yA y9 y8 y7 y6 y5 y4 y3 y2 y1 y0 movu [yq+2*wq], m2 -vpblendd m1, m1, m0, 0xaa ; 00 v5 u5 v4 00 u4 v3 u3 00 v2 u2 v1 00 u1 v0 u0 +vpblendd m1, m1, m0, 0xaa ; merge the even dwords from m0 and odd from m1 ; 00 v5 u5 v4 00 u4 v3 u3 00 v2 u2 v1 00 u1 v0 u0 pshufb m1, m5 ; 00 v5 v4 v3 00 u5 u4 u3 00 v2 v1 v0 00 u2 u1 u0 vpermq m1, m1, 0xd8; 00 v5 v4 v3 00 v2 v1 v0 00 u5 u4 u3 00 u2 u1 u0 pshufb m1, m7 ; 00 00 v5 v4 v3 v2 v1 v0 00 00 u5 u4 u3 u2 u1 u0 -- 2.38.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] configure: support lsan as toolchain
On 12/7/22 17:08, James Darnley wrote: --- configure | 5 + 1 file changed, 5 insertions(+) diff --git a/configure b/configure index f4eedfc207..eaa5ef6b20 100755 --- a/configure +++ b/configure @@ -4315,6 +4315,11 @@ case "$toolchain" in add_cflags -fsanitize=address add_ldflags -fsanitize=address ;; +*-lsan) +cc_default="${toolchain%-lsan}" +add_cflags -fsanitize=leak +add_ldflags -fsanitize=leak +;; *-msan) cc_default="${toolchain%-msan}" add_cflags -fsanitize=memory -fsanitize-memory-track-origins ping Any objections to this? ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] configure: support lsan as toolchain
--- configure | 5 + 1 file changed, 5 insertions(+) diff --git a/configure b/configure index f4eedfc207..eaa5ef6b20 100755 --- a/configure +++ b/configure @@ -4315,6 +4315,11 @@ case "$toolchain" in add_cflags -fsanitize=address add_ldflags -fsanitize=address ;; +*-lsan) +cc_default="${toolchain%-lsan}" +add_cflags -fsanitize=leak +add_ldflags -fsanitize=leak +;; *-msan) cc_default="${toolchain%-msan}" add_cflags -fsanitize=memory -fsanitize-memory-track-origins -- 2.38.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 5/5] avcodec/x86/v210enc: remove unneeded instruction
--- libavcodec/x86/v210enc.asm | 1 - 1 file changed, 1 deletion(-) diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm index d3639cd440..daf5f2ab81 100644 --- a/libavcodec/x86/v210enc.asm +++ b/libavcodec/x86/v210enc.asm @@ -331,7 +331,6 @@ cglobal v210_planar_pack_8, 5, 5, 7+notcpuflag(avx512icl), y, u, v, dst, width vpternlogd m0, m1, m6, 0xd8 ; C?B:A ; merge and mask out bad bits from B %else pand m1, m6, m1 -pandn m0, m6, m0 porm0, m0, m1 %endif -- 2.38.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 4/5] avcodec/x86/v210enc: expand and correct comments
--- libavcodec/x86/v210enc.asm | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm index 552164a8be..d3639cd440 100644 --- a/libavcodec/x86/v210enc.asm +++ b/libavcodec/x86/v210enc.asm @@ -314,7 +314,7 @@ cglobal v210_planar_pack_8, 5, 5, 7+notcpuflag(avx512icl), y, u, v, dst, width movu ym1, [yq + 2*widthq] vinserti32x4 m1, [uq + 1*widthq], 2 vinserti32x4 m1, [vq + 1*widthq], 3 -vpermbm1, m2, m1 ; uyv0 yuy0 vyu0 yvy0 +vpermbm1, m2, m1 ; uyvx yuyx vyux yvyx %else movq xm0, [uq + 1*widthq]; uuxx movq xm1, [vq + 1*widthq]; vvxx @@ -325,10 +325,10 @@ cglobal v210_planar_pack_8, 5, 5, 7+notcpuflag(avx512icl), y, u, v, dst, width %endif CLIPUB m1, m4, m5 -pmaddubsw m0, m1, m3 -pslld m1, 4 +pmaddubsw m0, m1, m3 ; shift high and low samples of each dword and mask out other bits +pslld m1, 4 ; shift center sample of each dword %if cpuflag(avx512) -vpternlogd m0, m1, m6, 0xd8 ; C?B:A +vpternlogd m0, m1, m6, 0xd8 ; C?B:A ; merge and mask out bad bits from B %else pand m1, m6, m1 pandn m0, m6, m0 -- 2.38.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 3/5] avcodec/v210enc: add new 10-bit function for avx512 avx512icl
avx512 on Skylake-X (Xeon D-2123IT): 1.19x faster (970±91.2 vs. 817±104.4 decicycles) compared with avx2 avx512icl on Ice Lake (Xeon Silver 4316): 2.52x faster (1350±5.3 vs. 535±9.5 decicycles) compared with avx2 --- libavcodec/x86/v210enc.asm| 99 +++ libavcodec/x86/v210enc_init.c | 12 + 2 files changed, 111 insertions(+) diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm index c2ad3d72c0..552164a8be 100644 --- a/libavcodec/x86/v210enc.asm +++ b/libavcodec/x86/v210enc.asm @@ -56,6 +56,36 @@ v210enc_8_permd: dd 0,1,4,5, 1,2,5,6 v210enc_8_mult: db 4, 0, 64, 0 v210enc_8_mask: dd 255<<12 +icl_perm_y: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb +%assign i 0 +%rep 8 +db -1,i+0,i+1,-1 , i+2,i+3,i+4,i+5 +%assign i i+6 +%endrep + +icl_perm_uv: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb +%assign i 0 +%rep 4 +db i+0,i+1,i+32,i+33 , -1,i+2,i+3,-1 , i+34,i+35,i+4,i+5 , -1,i+36,i+37,-1 +%assign i i+6 +%endrep + +icl_perm_y_kmask: times 8 db 0b_0110 +icl_perm_uv_kmask: times 8 db 0b0110_ + +icl_shift_y: times 10 dw 2,0,4 + times 4 db 0 ; padding to 64 bytes +icl_shift_uv: times 5 dw 0,2,4 + times 2 db 0 ; padding to 32 bytes + times 5 dw 4,0,2 + times 2 db 0 ; padding to 32 bytes + +v210enc_10_permd_y: dd 0,1,2,-1 , 3,4,5,-1 +v210enc_10_shufb_y: db -1,0,1,-1 , 2,3,4,5 , -1,6,7,-1 , 8,9,10,11 +v210enc_10_permd_uv: dd 0,1,4,5 , 1,2,5,6 +v210enc_10_shufb_uv: db 0,1, 8, 9 , -1,2,3,-1 , 10,11,4,5 , -1,12,13,-1 + db 2,3,10,11 , -1,4,5,-1 , 12,13,6,7 , -1,14,15,-1 + SECTION .text %macro v210_planar_pack_10 0 @@ -113,6 +143,75 @@ INIT_YMM avx2 v210_planar_pack_10 %endif +%macro v210_planar_pack_10_new 0 + +cglobal v210_planar_pack_10, 5, 5, 8+2*notcpuflag(avx512icl), y, u, v, dst, width +lea yq, [yq+2*widthq] +add uq, widthq +add vq, widthq +neg widthq + +%if cpuflag(avx512icl) +movu m6, [icl_perm_y] +movu m7, [icl_perm_uv] +kmovq k1, [icl_perm_y_kmask] +kmovq k2, [icl_perm_uv_kmask] +%else +movu m6, 
[v210enc_10_permd_y] +VBROADCASTI128 m7, [v210enc_10_shufb_y] +movu m8, [v210enc_10_permd_uv] +movu m9, [v210enc_10_shufb_uv] +%endif +movu m2, [icl_shift_y] +movu m3, [icl_shift_uv] +VBROADCASTI128 m4, [v210_enc_min_10] ; only ymm sized +VBROADCASTI128 m5, [v210_enc_max_10] ; only ymm sized + +.loop: +movu m0, [yq + widthq*2] +%if cpuflag(avx512icl) +movu ym1, [uq + widthq*1] +vinserti32x8 zm1, [vq + widthq*1], 1 +%else +movu xm1, [uq + widthq*1] +vinserti128 ym1, [vq + widthq*1], 1 +%endif +CLIPW m0, m4, m5 +CLIPW m1, m4, m5 + +vpsllvw m0, m2 +vpsllvw m1, m3 +%if cpuflag(avx512icl) +vpermb m0{k1}{z}, m6, m0 ; make space for uv where the k-mask sets to zero +vpermb m1{k2}{z}, m7, m1 ; interleave uv and make space for y where the k-mask sets to zero +%else +vpermd m0, m6, m0 +pshufb m0, m7 +vpermd m1, m8, m1 +pshufb m1, m9 +%endif +por m0, m1 + +movu [dstq], m0 +add dstq, mmsize +add widthq, (mmsize*3)/8 +jl .loop +RET + +%endmacro + +%if ARCH_X86_64 +%if HAVE_AVX512_EXTERNAL +INIT_YMM avx512 +v210_planar_pack_10_new +%endif +%endif + +%if HAVE_AVX512ICL_EXTERNAL +INIT_ZMM avx512icl +v210_planar_pack_10_new +%endif + %macro v210_planar_pack_8 0 ; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width) diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c index 6e9f8c6e61..44f22ca7fe 100644 --- a/libavcodec/x86/v210enc_init.c +++ b/libavcodec/x86/v210enc_init.c @@ -37,6 +37,12 @@ void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u, void ff_v210_planar_pack_10_avx2(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width); +void ff_v210_planar_pack_10_avx512(const uint16_t *y, const uint16_t *u, + const uint16_t *v, uint8_t *dst, + ptrdiff_t width); +void ff_v210_planar_pack_10_avx512icl(const uint16_t *y, const uint16_t *u, + const uint16_t *v, uint8_t *dst, + ptrdiff_t width); av_cold void ff_v210enc_init_x86(V210EncContext *s) { @@ -60,10 +66,16 @@ 
av_cold void ff_v210enc_init_x86(V210EncContext *s) if (EXTERNAL_AVX512(cpu_flags)) { s->sample_factor_8 = 2; s->pack_line_8 = ff_v210_planar_pack_8_avx512; +#
[FFmpeg-devel] [PATCH v2 2/5] avcodec/x86/v210enc: replace register use with named register
--- libavcodec/x86/v210enc.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm index afac238ede..c2ad3d72c0 100644 --- a/libavcodec/x86/v210enc.asm +++ b/libavcodec/x86/v210enc.asm @@ -62,7 +62,7 @@ SECTION .text ; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width) cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width -lea r0, [yq+2*widthq] +lea yq, [yq+2*widthq] add uq, widthq add vq, widthq neg widthq -- 2.38.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 1/5] checkasm/v210enc: test the entire width of 10-bit planar input arrays
--- tests/checkasm/v210enc.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/checkasm/v210enc.c b/tests/checkasm/v210enc.c index 9942e08137..9fb8321c25 100644 --- a/tests/checkasm/v210enc.c +++ b/tests/checkasm/v210enc.c @@ -72,8 +72,10 @@ randomize_buffers(mask); \ call_ref(y0 + y_offset, u0 + uv_offset, v0 + uv_offset, dst0, width); \ call_new(y1 + y_offset, u1 + uv_offset, v1 + uv_offset, dst1, width); \ -if (memcmp(y0, y1, BUF_SIZE) || memcmp(u0, u1, BUF_SIZE / 2) || \ -memcmp(v0, v1, BUF_SIZE / 2) || memcmp(dst0, dst1, width * 8 / 3)) \ +if (memcmp(y0, y1, BUF_SIZE * sizeof(type)) \ +|| memcmp(u0, u1, BUF_SIZE * sizeof(type) / 2) \ +|| memcmp(v0, v1, BUF_SIZE * sizeof(type) / 2) \ +|| memcmp(dst0, dst1, width * 8 / 3)) \ fail(); \ bench_new(y1 + y_offset, u1 + uv_offset, v1 + uv_offset, dst1, width); \ } \ -- 2.38.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl
ARCH_X86_64 is always defined. So checks of this type need to check with #if. Thanks. I forgot the ffmpeg convention there. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl
avx512 on Skylake-X (Xeon D-2123IT): 1.19x faster (970±91.2 vs. 817±104.4 decicycles) compared with avx2 avx512icl on Ice Lake (Xeon Silver 4316): 2.52x faster (1350±5.3 vs. 535±9.5 decicycles) compared with avx2 --- libavcodec/x86/v210enc.asm| 99 +++ libavcodec/x86/v210enc_init.c | 12 + 2 files changed, 111 insertions(+) diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm index c2ad3d72c0..9cee954619 100644 --- a/libavcodec/x86/v210enc.asm +++ b/libavcodec/x86/v210enc.asm @@ -56,6 +56,36 @@ v210enc_8_permd: dd 0,1,4,5, 1,2,5,6 v210enc_8_mult: db 4, 0, 64, 0 v210enc_8_mask: dd 255<<12 +icl_perm_y: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb +%assign i 0 +%rep 8 +db -1,i+0,i+1,-1 , i+2,i+3,i+4,i+5 +%assign i i+6 +%endrep + +icl_perm_uv: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb +%assign i 0 +%rep 4 +db i+0,i+1,i+32,i+33 , -1,i+2,i+3,-1 , i+34,i+35,i+4,i+5 , -1,i+36,i+37,-1 +%assign i i+6 +%endrep + +icl_perm_y_kmask: times 8 db 0b_0110 +icl_perm_uv_kmask: times 8 db 0b0110_ + +icl_shift_y: times 10 dw 2,0,4 + times 4 db 0 ; padding to 64 bytes +icl_shift_uv: times 5 dw 0,2,4 + times 2 db 0 ; padding to 32 bytes + times 5 dw 4,0,2 + times 2 db 0 ; padding to 32 bytes + +v210enc_10_permd_y: dd 0,1,2,-1 , 3,4,5,-1 +v210enc_10_shufb_y: db -1,0,1,-1 , 2,3,4,5 , -1,6,7,-1 , 8,9,10,11 +v210enc_10_permd_uv: dd 0,1,4,5 , 1,2,5,6 +v210enc_10_shufb_uv: db 0,1, 8, 9 , -1,2,3,-1 , 10,11,4,5 , -1,12,13,-1 + db 2,3,10,11 , -1,4,5,-1 , 12,13,6,7 , -1,14,15,-1 + SECTION .text %macro v210_planar_pack_10 0 @@ -113,6 +143,75 @@ INIT_YMM avx2 v210_planar_pack_10 %endif +%macro v210_planar_pack_10_new 0 + +cglobal v210_planar_pack_10, 5, 5, 8+2*notcpuflag(avx512icl), y, u, v, dst, width +lea yq, [yq+2*widthq] +add uq, widthq +add vq, widthq +neg widthq + +%if cpuflag(avx512icl) +movu m6, [icl_perm_y] +movu m7, [icl_perm_uv] +kmovq k1, [icl_perm_y_kmask] +kmovq k2, [icl_perm_uv_kmask] +%else +movu m6, 
[v210enc_10_permd_y] +VBROADCASTI128 m7, [v210enc_10_shufb_y] +movu m8, [v210enc_10_permd_uv] +movu m9, [v210enc_10_shufb_uv] +%endif +movu m2, [icl_shift_y] +movu m3, [icl_shift_uv] +VBROADCASTI128 m4, [v210_enc_min_10] ; only ymm sized +VBROADCASTI128 m5, [v210_enc_max_10] ; only ymm sized + +.loop: +movu m0, [yq + widthq*2] +%if cpuflag(avx512icl) +movu ym1, [uq + widthq*1] +vinserti32x8 zm1, [vq + widthq*1], 1 +%else +movu xm1, [uq + widthq*1] +vinserti128 ym1, [vq + widthq*1], 1 +%endif +CLIPW m0, m4, m5 +CLIPW m1, m4, m5 + +vpsllvw m0, m2 +vpsllvw m1, m3 +%if cpuflag(avx512icl) +vpermb m0{k1}{z}, m6, m0 +vpermb m1{k2}{z}, m7, m1 +%else +vpermd m0, m6, m0 +pshufb m0, m7 +vpermd m1, m8, m1 +pshufb m1, m9 +%endif +por m0, m1 + +movu [dstq], m0 +add dstq, mmsize +add widthq, (mmsize*3)/8 +jl .loop +RET + +%endmacro + +%if ARCH_X86_64 +%if HAVE_AVX512_EXTERNAL +INIT_YMM avx512 +v210_planar_pack_10_new +%endif +%endif + +%if HAVE_AVX512ICL_EXTERNAL +INIT_ZMM avx512icl +v210_planar_pack_10_new +%endif + %macro v210_planar_pack_8 0 ; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width) diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c index 6e9f8c6e61..5d1ebcb893 100644 --- a/libavcodec/x86/v210enc_init.c +++ b/libavcodec/x86/v210enc_init.c @@ -37,6 +37,12 @@ void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u, void ff_v210_planar_pack_10_avx2(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width); +void ff_v210_planar_pack_10_avx512(const uint16_t *y, const uint16_t *u, + const uint16_t *v, uint8_t *dst, + ptrdiff_t width); +void ff_v210_planar_pack_10_avx512icl(const uint16_t *y, const uint16_t *u, + const uint16_t *v, uint8_t *dst, + ptrdiff_t width); av_cold void ff_v210enc_init_x86(V210EncContext *s) { @@ -60,10 +66,16 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s) if (EXTERNAL_AVX512(cpu_flags)) { s->sample_factor_8 = 2; 
s->pack_line_8 = ff_v210_planar_pack_8_avx512; +#ifdef ARCH_X86_64 +s->sample_factor_10 = 2; +s->pack_line_10 = ff_v210_planar_pack_10_avx512; +#e
[FFmpeg-devel] [PATCH 2/3] avcodec/x86/v210: replace register use with named register
--- libavcodec/x86/v210enc.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm index afac238ede..c2ad3d72c0 100644 --- a/libavcodec/x86/v210enc.asm +++ b/libavcodec/x86/v210enc.asm @@ -62,7 +62,7 @@ SECTION .text ; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width) cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width -lea r0, [yq+2*widthq] +lea yq, [yq+2*widthq] add uq, widthq add vq, widthq neg widthq -- 2.38.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/3] checkasm/v210enc: test the entire width of 10-bit planar input arrays
--- tests/checkasm/v210enc.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/checkasm/v210enc.c b/tests/checkasm/v210enc.c index 9942e08137..9fb8321c25 100644 --- a/tests/checkasm/v210enc.c +++ b/tests/checkasm/v210enc.c @@ -72,8 +72,10 @@ randomize_buffers(mask); \ call_ref(y0 + y_offset, u0 + uv_offset, v0 + uv_offset, dst0, width); \ call_new(y1 + y_offset, u1 + uv_offset, v1 + uv_offset, dst1, width); \ -if (memcmp(y0, y1, BUF_SIZE) || memcmp(u0, u1, BUF_SIZE / 2) || \ -memcmp(v0, v1, BUF_SIZE / 2) || memcmp(dst0, dst1, width * 8 / 3)) \ +if (memcmp(y0, y1, BUF_SIZE * sizeof(type)) \ +|| memcmp(u0, u1, BUF_SIZE * sizeof(type) / 2) \ +|| memcmp(v0, v1, BUF_SIZE * sizeof(type) / 2) \ +|| memcmp(dst0, dst1, width * 8 / 3)) \ fail(); \ bench_new(y1 + y_offset, u1 + uv_offset, v1 + uv_offset, dst1, width); \ } \ -- 2.38.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] avcodec/v210enc: add new function for avx2 avx512 avx512icl
+%else +pand m1, m6, m1 +pandn m0, m6, m0 +por m0, m0, m1 +%endif Isn't that pattern a vpblendb or some such? I think Kieran already responded to this on IRC but I will too. Unfortunately not. This blend is at the bit level. This is v210 so the packing has the middle sample overlapping with the bottom sample in the second byte. I also want to amend my performance numbers on Broadwell. I can confirm Kieran's disagreement and can reproduce the 10% speed up on it: 1676±14.6 vs 1426±20.9 I will re-check Zen and amend the commit message as necessary. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] avcodec/v210enc: add new function for avx2 avx512 avx512icl
Negligible speed difference for avx2 on Zen 2 (Ryzen 5700X) and Broadwell (Xeon E5-2620 v4): 1690±4.3 decicycles vs. 1693±78.4 1439±31.1 decicycles vs 1429±16.7 Moderate speedup with avx512 on Skylake-X (Xeon D-2123IT): 1.22x faster (793±0.8 vs. 649±5.5 decicycles) compared with avx2 Better speedup with avx512icl on Ice Lake (Xeon Silver 4316): 1.77x faster (784±1.8 vs. 442±11.6 decicycles) compared with avx2 Co-authors: Henrik Gramner Kieran Kunhya --- libavcodec/x86/v210enc.asm| 80 ++- libavcodec/x86/v210enc_init.c | 14 ++ 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm index 965f2bea3c..afac238ede 100644 --- a/libavcodec/x86/v210enc.asm +++ b/libavcodec/x86/v210enc.asm @@ -21,7 +21,7 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 cextern pw_4 %define v210_enc_min_10 pw_4 @@ -46,6 +46,16 @@ v210_enc_chroma_shuf2_8: times 2 db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1 v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0 +v210enc_8_permb: db 32, 0,48,-1 , 1,33, 2,-1 , 49, 3,34,-1 , 4,50, 5,-1 + db 35, 6,51,-1 , 7,36, 8,-1 , 52, 9,37,-1 , 10,53,11,-1 + db 38,12,54,-1 , 13,39,14,-1 , 55,15,40,-1 , 16,56,17,-1 + db 41,18,57,-1 , 19,42,20,-1 , 58,21,43,-1 , 22,59,23,-1 +v210enc_8_shufb: db 0, 8, 1,-1 , 9, 2,10,-1 , 3,11, 4,-1 , 12, 5,13,-1 + db 2,10, 3,-1 , 11, 4,12,-1 , 5,13, 6,-1 , 14, 7,15,-1 +v210enc_8_permd: dd 0,1,4,5, 1,2,5,6 +v210enc_8_mult: db 4, 0, 64, 0 +v210enc_8_mask: dd 255<<12 + SECTION .text %macro v210_planar_pack_10 0 @@ -178,7 +188,73 @@ INIT_XMM avx v210_planar_pack_8 %endif +%macro v210_planar_pack_8_new 0 + +cglobal v210_planar_pack_8, 5, 5, 7+notcpuflag(avx512icl), y, u, v, dst, width +add yq, widthq +shr widthq, 1 +add uq, widthq +add vq, widthq +neg widthq + +%if cpuflag(avx512icl) +mova m2, [v210enc_8_permb] +%else +mova m2, [v210enc_8_permd] +%endif +vpbroadcastd m3, [v210enc_8_mult] +VBROADCASTI128 m4, [v210_enc_min_8] ; only ymm sized 
+VBROADCASTI128 m5, [v210_enc_max_8] ; only ymm sized +vpbroadcastd m6, [v210enc_8_mask] +%if notcpuflag(avx512icl) +movu m7, [v210enc_8_shufb] +%endif + +.loop: +%if cpuflag(avx512icl) +movu ym1, [yq + 2*widthq] +vinserti32x4 m1, [uq + 1*widthq], 2 +vinserti32x4 m1, [vq + 1*widthq], 3 +vpermbm1, m2, m1 ; uyv0 yuy0 vyu0 yvy0 +%else +movq xm0, [uq + 1*widthq]; uuxx +movq xm1, [vq + 1*widthq]; vvxx +punpcklbwxm1, xm0, xm1 ; uvuv uvuv uvuv +vinserti128 m1, m1, [yq + 2*widthq], 1 ; uvuv uvuv uvuv +vpermdm1, m2, m1 ; uvuv uvxx yyxx xxuv uvuv xxyy +pshufbm1, m7 ; uyv0 yuy0 vyu0 yvy0 +%endif +CLIPUB m1, m4, m5 + +pmaddubsw m0, m1, m3 +pslld m1, 4 +%if cpuflag(avx512) +vpternlogd m0, m1, m6, 0xd8 ; C?B:A +%else +pand m1, m6, m1 +pandn m0, m6, m0 +porm0, m0, m1 +%endif + +movu [dstq], m0 +add dstq, mmsize +add widthq, (mmsize*3)/16 +jl .loop +RET + +%endmacro + %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 -v210_planar_pack_8 +v210_planar_pack_8_new +%endif + +%if HAVE_AVX512_EXTERNAL +INIT_YMM avx512 +v210_planar_pack_8_new +%endif + +%if HAVE_AVX512ICL_EXTERNAL +INIT_ZMM avx512icl +v210_planar_pack_8_new %endif diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c index 13a351dd1d..6e9f8c6e61 100644 --- a/libavcodec/x86/v210enc_init.c +++ b/libavcodec/x86/v210enc_init.c @@ -27,6 +27,10 @@ void ff_v210_planar_pack_8_avx(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width); void ff_v210_planar_pack_8_avx2(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width); +void ff_v210_planar_pack_8_avx512(const uint8_t *y, const uint8_t *u, +const uint8_t *v, uint8_t *dst, ptrdiff_t width); +void ff_v210_planar_pack_8_avx512icl(const uint8_t *y, const uint8_t *u, +const uint8_t *v, uint8_t *dst, ptrdiff_t width); void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width); @@ -52,4 +56,14 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s) 
s->sample_factor_10 = 2;
[FFmpeg-devel] [PATCH] checkasm: add a verbose check function for uint32_t data
--- tests/checkasm/checkasm.c | 1 + tests/checkasm/checkasm.h | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 421bd096c5..c3d77cb6af 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -918,5 +918,6 @@ int checkasm_check_##type(const char *const file, const int line, \ DEF_CHECKASM_CHECK_FUNC(uint8_t, "%02x") DEF_CHECKASM_CHECK_FUNC(uint16_t, "%04x") +DEF_CHECKASM_CHECK_FUNC(uint32_t, "%08x") DEF_CHECKASM_CHECK_FUNC(int16_t, "%6d") DEF_CHECKASM_CHECK_FUNC(int32_t, "%9d") diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index ee9151410e..5f68115035 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -296,6 +296,7 @@ int checkasm_check_##type(const char *const file, const int line, \ DECL_CHECKASM_CHECK_FUNC(uint8_t); DECL_CHECKASM_CHECK_FUNC(uint16_t); +DECL_CHECKASM_CHECK_FUNC(uint32_t); DECL_CHECKASM_CHECK_FUNC(int16_t); DECL_CHECKASM_CHECK_FUNC(int32_t); -- 2.38.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] avutil/tests/cpu: print the avx512icl flag
--- libavutil/tests/cpu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libavutil/tests/cpu.c b/libavutil/tests/cpu.c index 5bec742b2b..dadadb31dc 100644 --- a/libavutil/tests/cpu.c +++ b/libavutil/tests/cpu.c @@ -77,6 +77,7 @@ static const struct { { AV_CPU_FLAG_BMI2, "bmi2" }, { AV_CPU_FLAG_AESNI, "aesni" }, { AV_CPU_FLAG_AVX512,"avx512" }, +{ AV_CPU_FLAG_AVX512ICL, "avx512icl" }, { AV_CPU_FLAG_SLOW_GATHER, "slowgather" }, #elif ARCH_LOONGARCH { AV_CPU_FLAG_LSX, "lsx"}, -- 2.38.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] mailmap: stop git lying about who I commit things as
--- .mailmap | 1 - 1 file changed, 1 deletion(-) diff --git a/.mailmap b/.mailmap index ba072f38c8..af60290f77 100644 --- a/.mailmap +++ b/.mailmap @@ -1,4 +1,3 @@ - -- 2.38.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] RFC: v210enc optimisations and initial AVX-512
I guess it could also be scaled to ymm if you're a big Skylake fan :P (in which case you'd probably want to reorder the shuffle indices so that chroma comes first, i.e. movq [u] + movhps [v] + vinserti32x4[y]) What shuffle or permute did you have in mind when you suggested this for Skylake? Without the permute I'm not sure how the change in ordering helps. Aren't we stuck with data in separate lanes? I'm probably missing something though. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] Discrepancy between comments for AVX512 flags
While cherry-picking some stuff for avx512 I have noticed that ffmpeg has a discrepancy in the comments for the two avx512 flags. Let's start with the public header libavutil/cpu.h 56│ #define AV_CPU_FLAG_AVX512 0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used 57│ #define AV_CPU_FLAG_AVX512ICL 0x200000 ///< F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ This seems to imply the first only detects ZMM support and the second groups all instruction sets together. This appears to be different to what we imply in internal code libavutil/x86/cpu.c 151│ #if HAVE_AVX512 /* F, CD, BW, DQ, VL */ libavutil/x86/x86inc.asm 840│ %assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL The detection code itself has libavutil/x86/cpu.c 151│ #if HAVE_AVX512 /* F, CD, BW, DQ, VL */ 152│ if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */ 153│ if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000) { 154│ rval |= AV_CPU_FLAG_AVX512; 155│ #if HAVE_AVX512ICL 156│ if ((ebx & 0xd0200000) == 0xd0200000 && (ecx & 0x5f42) == 0x5f42) 157│ rval |= AV_CPU_FLAG_AVX512ICL; If you decode the bits being checked you'll see that the base avx512 checks ebx for F DQ CD BW VL and avx512icl checks ebx for IFMA CD BW VL and ecx for VBMI VBMI2 GFNI VAES VPCLMULQDQ VNNI BITALG VPOPCNTDQ. The first matches what the internal comments imply. Part of the difference is my fault and dates from when the flag was first added. Has there been a discussion about which features should go with which flag? ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] avfilter/vf_subtitles: add an option to choose sub stream by language
--- doc/filters.texi | 5 + libavfilter/vf_subtitles.c | 23 --- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/doc/filters.texi b/doc/filters.texi index a161754233..cfbc807f16 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -21160,6 +21160,11 @@ Override default style or script info parameters of the subtitles. It accepts a string containing ASS style format @code{KEY=VALUE} couples separated by ",". @end table +@item language +Use first stream with the given language, ISO language code. @code{subtitles} +filter only. Requires the language metadata to be read from the file. +@end table + If the first key is not specified, it is assumed that the first value specifies the @option{filename}. diff --git a/libavfilter/vf_subtitles.c b/libavfilter/vf_subtitles.c index 82e140e986..95f0a940d9 100644 --- a/libavfilter/vf_subtitles.c +++ b/libavfilter/vf_subtitles.c @@ -54,6 +54,7 @@ typedef struct AssContext { char *fontsdir; char *charenc; char *force_style; +char *language; int stream_index; int alpha; uint8_t rgba_map[4]; @@ -271,6 +272,7 @@ static const AVOption subtitles_options[] = { {"stream_index", "set stream index", OFFSET(stream_index), AV_OPT_TYPE_INT,{ .i64 = -1 }, -1, INT_MAX, FLAGS}, {"si", "set stream index", OFFSET(stream_index), AV_OPT_TYPE_INT,{ .i64 = -1 }, -1, INT_MAX, FLAGS}, {"force_style", "force subtitle style", OFFSET(force_style), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS}, +{"language", "use first stream of this language", OFFSET(language), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS}, {NULL}, }; @@ -340,9 +342,8 @@ static av_cold int init_subtitles(AVFilterContext *ctx) goto end; /* Locate subtitles stream */ -if (ass->stream_index < 0) -ret = av_find_best_stream(fmt, AVMEDIA_TYPE_SUBTITLE, -1, -1, NULL, 0); -else { +/* If the user has specified a particular stream use that. 
*/ +if (ass->stream_index >= 0) { ret = -1; if (ass->stream_index < fmt->nb_streams) { for (j = 0; j < fmt->nb_streams; j++) { @@ -357,6 +358,22 @@ static av_cold int init_subtitles(AVFilterContext *ctx) } } +/* Otherwise find the first stream with the given language code. */ +else if (ass->language) { +ret = -1; +for (j = 0; j < fmt->nb_streams; j++) { +const AVDictionaryEntry *lang = av_dict_get(fmt->streams[j]->metadata, "language", NULL, 0); +if (lang && !strcmp(lang->value, ass->language)) { +ret = j; +break; +} +} +} + +/* Finally fall back to the "best" stream. */ +else +ret = av_find_best_stream(fmt, AVMEDIA_TYPE_SUBTITLE, -1, -1, NULL, 0); + if (ret < 0) { av_log(ctx, AV_LOG_ERROR, "Unable to locate subtitle stream in %s\n", ass->filename); -- 2.35.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 1/3] avcodec/bitpacked: ,
On 2020-06-04 01:19, Michael Niedermayer wrote: > Fixes: array end overread > Fixes: > 22395/clusterfuzz-testcase-minimized-ffmpeg_AV_CODEC_ID_BITPACKED_fuzzer-5760940300828672 > > Found-by: continuous fuzzing process > https://github.com/google/oss-fuzz/tree/master/projects/ffmpeg > Signed-off-by: Michael Niedermayer > --- > libavcodec/bitpacked.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/libavcodec/bitpacked.c b/libavcodec/bitpacked.c > index be7d1e3629..952ba73a32 100644 > --- a/libavcodec/bitpacked.c > +++ b/libavcodec/bitpacked.c > @@ -147,7 +147,7 @@ AVCodec ff_bitpacked_decoder = { > .decode = bitpacked_decode, > .capabilities = AV_CODEC_CAP_EXPERIMENTAL, > .codec_tags = (const uint32_t []){ > -MKTAG('U', 'Y', 'V', 'Y') > +MKTAG('U', 'Y', 'V', 'Y'), > FF_CODEC_TAGS_END, > }, > }; > I think you should add to the commit title. Something like "add missing comma to codec tags". Other than that this looks fine. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [FFmpeg-cvslog] pthread_frame: merge the functionality for normal decoder init and init_thread_copy
On 2020-04-10 16:53, Anton Khirnov wrote: > ffmpeg | branch: master | Anton Khirnov | Mon Jan 9 > 18:04:42 2017 +0100| [1f4cf92cfbd3accbae582ac63126ed5570ddfd37] | committer: > Anton Khirnov > > pthread_frame: merge the functionality for normal decoder init and > init_thread_copy > > The current design, where > - proper init is called for the first per-thread context > - first thread's private data is copied into private data for all the > other threads > - a "fixup" function is called for all the other threads to e.g. > allocate dynamically allocated data > is very fragile and hard to follow, so it is abandoned. Instead, the > same init function is used to init each per-thread context. Where > necessary, AVCodecInternal.is_copy can be used to differentiate between > the first thread and the other ones (e.g. for decoding the extradata > just once). > >> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=1f4cf92cfbd3accbae582ac63126ed5570ddfd37 This commit has caused unexpected behavior in one use of the API that I encountered. The AVCodecContexts that are used for get_buffer2 calls have different delay values in them. Setting 2 threads I see the value alternating between 0 and 1 for every call. That constant changing value, from the point of view of the thing reading it, is what is causing the unexpected behavior. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] swscale/x86/yuv2rgb: Fix build without SSSE3
On 2020-02-23 18:58, Michael Niedermayer wrote: > On Sun, Feb 23, 2020 at 05:03:36PM +0100, Carl Eugen Hoyos wrote: >> Am So., 23. Feb. 2020 um 13:30 Uhr schrieb Michael Niedermayer >> : >>> >>> From: Parker Ernest <@> >>> >>> commit fc6a5883d6af8cae0e96af84dda0ad74b360a084 breaks build on >>> x86_64 CPUs which do not have SSSE3, e.g. AMD Phenom-II >> >> Does the commit break build on specific CPUs or specific toolchains? > > I dont know what the testcase was the author encountered, i just posted > this here as the author wanted me to post it for him. > but a simple > make distclean ; ./configure --disable-ssse3 && make -j32 > replicates the build failure here (see below for the errors) Okay, it breaks the build when you do --disable-ssse3. I see that too. It is okay to fix that any way you want. This patch is fine by me but please don't imply that it fixes a run time error in the commit message, which is what I first thought. I see a discussion has sprung up on the best way to fix it so I guess that has to be resolved first. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] Add .mailmap
On 2020-02-23 15:12, Jean-Baptiste Kempf wrote: > Yo, > > On Sat, Feb 22, 2020, at 22:18, Josh de Kock wrote: >> This allows for easy shortlog/log parsing, useful in determining >> eligible members of the general assembly for the new FFmpeg voting >> system. > > I think this is a good idea. > But are you sure all of those are in the right order? (aka preferred email is > shown) > What is "preferred email" when you have 2 roles? My commits on the job get obe.tv (or are supposed to) and ones made in my own time get gmail.com (or are supposed to). Is it: when you screw up what email should you be shouted at on? I guess since I probably send more discussion email from gmail.com, maybe it is that one. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] swscale/x86/yuv2rgb: Fix build without SSSE3
On 2020-02-23 13:22, Michael Niedermayer wrote: > From: Parker Ernest <@> > > commit fc6a5883d6af8cae0e96af84dda0ad74b360a084 breaks build on > x86_64 CPUs which do not have SSSE3, e.g. AMD Phenom-II > > Signed-off-by: Michael Niedermayer > --- > libswscale/x86/yuv2rgb.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c > index c12e88cbb5..4791e5b93a 100644 > --- a/libswscale/x86/yuv2rgb.c > +++ b/libswscale/x86/yuv2rgb.c > @@ -83,6 +83,7 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c) > #if HAVE_X86ASM > int cpu_flags = av_get_cpu_flags(); > > +#if HAVE_SSSE3 > if (EXTERNAL_SSSE3(cpu_flags)) { > switch (c->dstFormat) { > case AV_PIX_FMT_RGB32: > @@ -111,6 +112,7 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c) > return yuv420_rgb15_ssse3; > } > } > +#endif > > if (EXTERNAL_MMXEXT(cpu_flags)) { > switch (c->dstFormat) { > What? Why doesn't the EXTERNAL_SSSE3 macro stop the code from entering that branch? The #if would only stop the section from being compiled with --disable-ssse3. A normal build would still enter that branch on that CPU. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] Followup: FOSDEM meeting
On 2020-02-22 13:25, Paul B Mahol wrote: > On 2/22/20, James Darnley wrote: >> On 2020-02-22 11:11, Thilo Borgmann wrote: >>> Please someone put an IRC log from the meeting there, too. James Darnley? >>> Also the audio was streamed, somebody might remember where too exactly. >>> Michael? >> >> I can post my log from the day, probably email attachment. Should I >> remove any of the lines from it, particularly after the meting >> concluded? There was a little chat afterwards and into the early evening. >> > > Consor my entries. > [Sat 22 18:00] <@durandal_1707> J_Darnley: no censoring allowed That is more clear Attached is the log for the entire day. I don't think anything needs removing so it is complete. [14:10:59] hello [14:11:01] https://hangouts.google.com/call/jYaO0pADYZELBBfsntHgAEEI [14:11:13] hullo [14:11:40] I can't invite, need op [14:13:07] ugh google wants me phone# [14:13:12] my [14:14:23] just use talky.io [14:14:47] I hope I'm showing up as muted since this UI isn't making me sure if I am or not (I should be) [14:15:03] Do you people hear us? [14:15:20] no audio so far [14:15:34] no [14:15:35] No [14:15:51] I'm just following irc, not the hangout unfortunately [14:16:10] ok, james's video feed picked up [14:16:24] JEEB: with sound ? [14:16:42] neat [14:16:56] no sound still but I can just attempt to re-join [14:17:19] nope [14:17:27] ok, audio [14:17:28] yes [14:17:31] yeah [14:17:32] have audio [14:20:13] I'm in. idling with mic off [14:26:55] usually what you do is have a nomination committee that asks people in advance and then present the nominees [14:27:53] Can everybody hear? [14:28:14] I can hear [14:28:20] voting 1: 3d, vote 2: a week, so seems like the conn is working here :) [14:28:21] I can too [14:28:23] Atm we don't copy into irc what is said [14:29:08] (v1 was IIRC people nominated who might not otherwise show up on voting list, v2 was committees, right?) 
[14:29:15] git log --since="last 36 months" --author="name" --oneline | wc -l [14:29:16] yes [14:29:18] Jeeb: Please write short summaries about what you hear [14:29:24] the hangout in the topic is empty btw [14:29:31] (mobile phone here) [14:29:36] BBB: https://hangouts.google.com/call/jYaO0pADYZELBBfsntHgAEEI [14:30:16] cehoyos: will attempt. [14:30:22] git log --no-merges --since=2020-01-25T00:00:00Z --until 2020-02-01T00:00:00Z --pretty=fuller | grep '^Author:' | sed 's/<.*//' |sort | uniq -c | sort -nr [14:31:03] Ty [14:31:06] j-b noting - CoC more like a values list as opposed to specific rules. there will be a suggestion which would then be voted on [14:33:08] Lynne noting - various audio decoders do checks already done avcodec common utils [14:33:17] (if I acught that right) [14:33:41] i have some difficulty understanding lynne with my headphones [14:35:24] michaelni: the sample rate and other checks in audio decoders that are now checked internally by the API so they should be removed [14:35:39] you added them, I pinged you on IRC and you didn't remove them [14:36:07] Lynne, i dont remember abouzt the ping but yes if there are redundant checks i should remove them [14:36:15] ping me again until i react! [14:36:42] for new joiners: since the topic is out of date if you want to join muted the URL is https://hangouts.google.com/call/jYaO0pADYZELBBfsntHgAEEI [14:36:59] patches would not be "lost" if we move to gitlab, for example [14:37:32] gitlab move: I guess main part being discussed atm being merge requests [14:37:44] if patches are handled by say gitlab, is it possible to subscribe via rss/atom? 
[14:38:01] I think yes, you can cehck with videolan's gitlab instance [14:38:45] couldn't find RSS/atom right away, but they have JSON https://code.videolan.org/videolan/x264/merge_requests.json [14:38:50] ugh [14:38:56] (just giving x264 as an example) [14:39:02] I keep track of mxf issues over rss [14:39:11] which is really handy [14:39:21] thardin: there are atom feeds for project activity, not sure if there's one *specific* to MRs [14:39:27] ah [14:39:33] haasn: that might be enough [14:39:46] rss readers typically haev filters [14:39:57] i dont see the problem with the existing infrastructure, so i dont see why we should move to gitlab [14:40:05] e.g. https://code.videolan.org/videolan/dav1d.atom [14:40:47] I run a gitlab instance at uni, and one thing I've found with gitlab is that it's.. a big thing. like it sometimes breaks for seemingly random reasons [14:42:16] yes, it's a very large ruby on rails thing, which is why I would hopefully share the system with another project, like videolan [14:42:31] that sounds like a good idea [14:43:01] I upgraded our instance when the last ubuntu lts came out, which was a bit of a chore but now I don
Re: [FFmpeg-devel] Followup: FOSDEM meeting
On 2020-02-22 11:11, Thilo Borgmann wrote: > Please someone put an IRC log from the meeting there, too. James Darnley? > Also the audio was streamed, somebody might remember where too exactly. > Michael? I can post my log from the day, probably email attachment. Should I remove any of the lines from it, particularly after the meeting concluded? There was a little chat afterwards and into the early evening. I didn't record the audio but it was broadcast on Google hangouts. I don't know whether it records. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] What new instructions would you like?
On 30/12/2019, Lauri Kasanen wrote: > Hi, > > For the Libre RISC-V project, I'm going to research the popular codecs > and design new instructions to help speed them up. With ffmpeg being > home to lots of asm folks for many platforms, I also want to ask your > opinion. > > What new instructions would you like? Anything particular you find > missing in existing ISAs, slow, or cumbersome? Do you mean SIMD instructions? I have no idea what exists in RISC-V already or what capabilities or limitations it has, and I am going to use x86 language and terms such as byte, word, dword, qword. Things I have found missing in old(er) x86 instruction sets are missing word size and signed/unsigned variants for existing operations. Some operations may have byte and word variants but dword and qword might be missing, or there might be a signed version but not an unsigned version (and vice versa). A couple of things I had to emulate: * packed absolute value of dwords * packed maximum unsigned words * packed max and min signed dwords (I might have really wanted unsigned for this) * arithmetic right shift of qwords * pack dwords to words with unsigned saturation Shuffle instructions. pshufb is very useful and I think I read on IRC that arm/aarch64/neon does not have an equivalent. (Or was that other shuffles?) It allows for arbitrary reordering of bytes and setting bytes to 0. On x86 it takes the shuffle pattern from another SIMD register but I usually use it with a constant pattern that gets loaded from memory. An interesting improvement would be if you can encode 17 * 16 (or however long your vectors might be) values in an immediate value so it doesn't require another register. Good documentation. The intel instruction manual has pretty good explanation of what the instructions do. The old instructions from around the time of MMX and SSE had excellent diagrams, these might have been mostly for shuffle operations. I need to look and jog my memory. 
I think punpcklbw is an example of what I mean. The entry in the manual for it has a good diagram IMO. (At least the version I am currently looking at) No stupid lane stuff. AVX2 brought us a SIMD vector length extension from 16 to 32 bytes. Good except for the stupid lanes they were split into making it hard to "mix" data from the low 0-15 bytes and the high 16-31 bytes. I forgot about this email for a month. Sorry about that. Seeing RISC-V in the schedule at FOSDEM reminded me about this. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [IMPORTANT] FOSDEM meeting
On 28/01/2020, Liu Steven wrote: > > >> 在 2020年1月27日,下午3:29,Jean-Baptiste Kempf 写道: >> It will be joinable through some VideoConf tool. > Can we join by IRC or other things on internet? > Because these days are Spring Festival (Chinese New Year, Important > festivals that have lasted for thousands of years), > The more important reason is New infectious virus epidemic areas here. :( Since I don't think it was said yet: yes, there will be participation on IRC. At the very least I plan to be there and will relay things to<->from #ffmpeg-meeting on freenode. Other people are responsible for other solutions. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH, v3, 1/7] lavu/pixfmt: add new pixel format 0yuv/y210/y410
On 2019-12-04 15:43, Linjie Fu wrote: > Previously, media driver provided planar format(like 420 8 bit), > but for HEVC Range Extension (422/444 8/10 bit), the decoded image > is produced in packed format because Windows expects it. > > Add some packed pixel formats for hardware decode support in VAAPI > and QSV: > > 4:2:2 10 bit: Y210 > 4:4:4 8 bit: 0YUV > 4:4:4 10 bit: Y410 > > +[AV_PIX_FMT_Y410LE] = { > +.name = "y410le", > +.nb_components = 4, > +.log2_chroma_w = 0, > +.log2_chroma_h = 0, > +.comp = { > +{ 0, 32, 10, 0, 10, 31, 9, 11 },/* Y */ > +{ 0, 32, 0, 0, 10, 31, 9, 1 },/* U */ > +{ 0, 32, 20, 0, 10, 31, 9, 21 },/* V */ > +{ 0, 32, 30, 0, 2, 31, 1, 31 },/* A */ > +}, > +.flags = AV_PIX_FMT_FLAG_ALPHA | AV_PIX_FMT_FLAG_BITSTREAM, > +}, > diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h > index d78e863..a163350 100644 > --- a/libavutil/pixfmt.h > +++ b/libavutil/pixfmt.h > @@ -348,6 +348,12 @@ enum AVPixelFormat { > AV_PIX_FMT_NV24, ///< planar YUV 4:4:4, 24bpp, 1 plane for Y and 1 > plane for the UV components, which are interleaved (first byte U and the > following byte V) > AV_PIX_FMT_NV42, ///< as above, but U and V bytes are swapped > > +AV_PIX_FMT_Y210BE,///< packed YUV 4:2:2, 32bpp, Y0 Cb Y1 Cr, > big-endian > +AV_PIX_FMT_Y210LE,///< packed YUV 4:2:2, 32bpp, Y0 Cb Y1 Cr, > little-endian > +AV_PIX_FMT_0YUV, ///< packed YUV 4:4:4, 32bpp, X Y Cb Cr, > X=unused/undefined > +AV_PIX_FMT_Y410LE,///< packed YUV 4:4:4, 32bpp, Cr Y Cb A, > little-endian > +AV_PIX_FMT_Y410BE,///< packed YUV 4:4:4, 32bpp, Cr Y Cb A, > big-endian > + > AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if > you want to link with shared libav* because the number of formats might > differ between versions > }; > I will ask again. From > http://ffmpeg.org/pipermail/ffmpeg-devel/2019-June/245929.html > Why am I suspicious that at least one of those is a re-ordered v210? I > seem to recall that we rejected adding v210 to this list. 
Either they > don't belong in this list or they don't belong because libavcodec has a > proper decoder (at least for v210). > > This might be the thread I was remembering but March seems too recent >> https://ffmpeg.org/pipermail/ffmpeg-devel/2019-March/241549.html > > No real conclusion was reached there. > > Do bit-packed formats belong in an AVPixelFormat? Despite what was said last time I do believe this is packed. I have taken a little time to actually understand these magic number structs. y410 is clearly packed like v210. Look at those offsets: 0, 10, 20, 30. Packed into a 32-bit word. Flagged with AV_PIX_FMT_FLAG_BITSTREAM. How is that any different to v210? Can you address a single sample in that 1 plane format without using shifts and bit-wise ands? Isn't that the definition of packed? I do not mean interleaved. Okay, y410 is a little better in that it is 444 so the sample order does not change through a 6 word cycle. Is that the key difference? Do bit-packed formats belong in an AVPixelFormat? If yes then I do not object to this patch or any others like this. If no then why is this not rejected? Does the AV_PIX_FMT_FLAG_BITSTREAM flag mean they do belong? I admit I haven't seen this before so maybe I should shut up and not send this email. signature.asc Description: OpenPGP digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [Contract Request] for FFmpeg libmp3lame multi-threaded feature implementation
On 2019-11-25 13:52, Chandra Nakka wrote: > Dear FFmpeg developers, > > I'm very happy to have found your details on FFmpeg website for requesting > FFmpeg feature implementation. > > Currently I'm using FFmpeg command line tool on my linux servers to process > media files into instant mp3 audio files by using FFmpeg piping feature. > But, currently libmp3lame encoder support single thread only for encoding > audio stream to mp3 file. This is the great drawback for my project. > > I have more than 100+ linux servers for processing audio streams to mp3 > files. Each server has 8 physical CPU cores. But, due to libmp3lame single > thread limitation my project mp3 conversion speed becomes too lazy > and remaining cores on servers are becomes useless. > > Actually I'm a web developer. I have no idea on FFmpeg tools tech > languages. So, I'm looking for FFmpeg developer who can implement > libmp3lame multi-threaded feature on FFmpeg. I'm ready to pay for this > feature. > > Looking forward to hearing from you. > > Thank you, > Chandra N. https://www.gnu.org/software/parallel/ That'll be $1, thank you. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] avutil/eval: add sgn()
On 2019-10-11 21:45, Paul B Mahol wrote: > diff --git a/doc/utils.texi b/doc/utils.texi > index d55dd315c3..4e2e713505 100644 > --- a/doc/utils.texi > +++ b/doc/utils.texi > @@ -920,6 +920,9 @@ corresponding input value will be returned. > @item round(expr) > Round the value of expression @var{expr} to the nearest integer. For > example, "round(1.5)" is "2.0". > > +@item sgn(x) > +Compute sign of @var{x}. > + > @item sin(x) > Compute sine of @var{x}. > Too late now but, since we have round() just above it which is 5 chars, couldn't you have made this sign()? ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/2] avcodec/h264: fix draw_horiz_band with slice threads
From: Kieran Kunhya --- libavcodec/h264_slice.c | 29 +++-- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c index 5ceee107a0..fe2aa01ceb 100644 --- a/libavcodec/h264_slice.c +++ b/libavcodec/h264_slice.c @@ -2527,18 +2527,33 @@ static void predict_field_decoding_flag(const H264Context *h, H264SliceContext * /** * Draw edges and report progress for the last MB row. */ -static void decode_finish_row(const H264Context *h, H264SliceContext *sl) +static void decode_finish_row(const H264Context *h, H264SliceContext *sl, int slice_end) { int top= 16 * (sl->mb_y >> FIELD_PICTURE(h)); int pic_height = 16 * h->mb_height >> FIELD_PICTURE(h); int height = 16 << FRAME_MBAFF(h); int deblock_border = (16 + 4) << FRAME_MBAFF(h); -if (sl->deblocking_filter) { +/* Slice-threaded draw_horiz_band not useful in this situation */ +if (sl->deblocking_filter == 1) { if ((top + height) >= pic_height) height += deblock_border; top -= deblock_border; } +else if (sl->deblocking_filter == 2) { +int first_mb_y = sl->first_mb_addr / h->mb_width; + +/* Draw the whole slice if it's possible: + * - If the beginning of the slice is at the start of a row + * - If we are at the end of the slice + * Previous slice is guaranteed not be included. 
*/ +if (!(sl->first_mb_addr % h->mb_width)) { +if (slice_end) { +top = 16 * (first_mb_y >> FIELD_PICTURE(h)); +height = (16 << FRAME_MBAFF(h)) * ((sl->mb_y+1) - first_mb_y); +} +} +} if (top >= pic_height || (top + height) < 0) return; @@ -2549,7 +2564,8 @@ static void decode_finish_row(const H264Context *h, H264SliceContext *sl) top= 0; } -ff_h264_draw_horiz_band(h, sl, top, height); +if (slice_end) +ff_h264_draw_horiz_band(h, sl, top, height); if (h->droppable || sl->h264->slice_ctx[0].er.error_occurred) return; @@ -2622,7 +2638,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg) for (;;) { // START_TIMER -int ret, eos; +int ret, eos, slice_end; if (sl->mb_x + sl->mb_y * h->mb_width >= sl->next_slice_idx) { av_log(h->avctx, AV_LOG_ERROR, "Slice overlaps with next at %d\n", sl->next_slice_idx); @@ -2669,10 +2685,11 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg) return AVERROR_INVALIDDATA; } +slice_end = eos || sl->mb_y >= h->mb_height; if (++sl->mb_x >= h->mb_width) { loop_filter(h, sl, lf_x_start, sl->mb_x); sl->mb_x = lf_x_start = 0; -decode_finish_row(h, sl); +decode_finish_row(h, sl, slice_end); ++sl->mb_y; if (FIELD_OR_MBAFF_PICTURE(h)) { ++sl->mb_y; @@ -2729,7 +2746,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg) if (++sl->mb_x >= h->mb_width) { loop_filter(h, sl, lf_x_start, sl->mb_x); sl->mb_x = lf_x_start = 0; -decode_finish_row(h, sl); +decode_finish_row(h, sl, 0); ++sl->mb_y; if (FIELD_OR_MBAFF_PICTURE(h)) { ++sl->mb_y; -- 2.22.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/2] avcodec/h264: enable draw_horiz_band
--- libavcodec/h264dec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c index 8d1bd16a8e..b9f304936c 100644 --- a/libavcodec/h264dec.c +++ b/libavcodec/h264dec.c @@ -1056,7 +1056,7 @@ AVCodec ff_h264_decoder = { .init = h264_decode_init, .close = h264_decode_end, .decode= h264_decode_frame, -.capabilities = /*AV_CODEC_CAP_DRAW_HORIZ_BAND |*/ AV_CODEC_CAP_DR1 | +.capabilities = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS, .hw_configs= (const AVCodecHWConfigInternal*[]) { -- 2.22.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 0/2] WIP: h264, slice threads, draw_horiz_band
Trying a combination of sliced threads, chunk decoding, and draw_horiz_band we found that it didn't work with the current master code. Modifying the api-h264-slice fate test showed obvious errors with grey and green blocks and more subtle ones that looked like misplaced macroblocks. Kieran identified the cause and coded this quick fix. He said that essentially the code would give a region to draw_horiz_band which could include the previous slice even if it hadn't been finished yet. This corrects that problem and lets us decode exactly. However it does cause errors decoding B-frames in chunked mode. Needs more work. James Darnley (1): avcodec/h264: enable draw_horiz_band Kieran Kunhya (1): avcodec/h264: fix draw_horiz_band with slice threads libavcodec/h264_slice.c | 29 +++-- libavcodec/h264dec.c| 2 +- 2 files changed, 24 insertions(+), 7 deletions(-) -- 2.22.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 3/7] x86inc: Improve SAVE/LOAD_MM_PERMUTATION macros
From: Henrik Gramner Use register numbers instead of copying the full register names. This makes it possible to change register widths in the middle of a function and keep the mmreg permutations intact which can be useful for code that only needs larger vectors for parts of the function in combination with macros etc. Also change the LOAD_MM_PERMUTATION macro to use the same default name as the SAVE macro. This simplifies swapping from ymm to xmm registers or vice versa: SAVE_MM_PERMUTATION INIT_XMM LOAD_MM_PERMUTATION --- libavutil/x86/x86inc.asm | 23 ++- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index 39cba5db09..10b7711637 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -1081,19 +1081,32 @@ INIT_XMM %endif %assign %%i 0 %rep num_mmregs -CAT_XDEFINE %%f, %%i, m %+ %%i +%xdefine %%tmp m %+ %%i +CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp %assign %%i %%i+1 %endrep %endmacro -%macro LOAD_MM_PERMUTATION 1 ; name to load from -%ifdef %1_m0 +%macro LOAD_MM_PERMUTATION 0-1 ; name to load from +%if %0 +%xdefine %%f %1_m +%else +%xdefine %%f current_function %+ _m +%endif +%xdefine %%tmp %%f %+ 0 +%ifnum %%tmp +RESET_MM_PERMUTATION %assign %%i 0 %rep num_mmregs -CAT_XDEFINE m, %%i, %1_m %+ %%i -CAT_XDEFINE nn, m %+ %%i, %%i +%xdefine %%tmp %%f %+ %%i +CAT_XDEFINE %%m, %%i, m %+ %%tmp %assign %%i %%i+1 %endrep +%rep num_mmregs +%assign %%i %%i-1 +CAT_XDEFINE m, %%i, %%m %+ %%i +CAT_XDEFINE nn, m %+ %%i, %%i +%endrep %endif %endmacro -- 2.22.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 7/7] x86inc: Add support for GFNI instructions
From: Henrik Gramner --- libavutil/x86/x86inc.asm | 30 +- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index d1b4c982fc..8c8cc97e0c 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -820,19 +820,20 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %assign cpuflags_sse4 (1<<10)| cpuflags_ssse3 %assign cpuflags_sse42(1<<11)| cpuflags_sse4 %assign cpuflags_aesni(1<<12)| cpuflags_sse42 -%assign cpuflags_avx (1<<13)| cpuflags_sse42 -%assign cpuflags_xop (1<<14)| cpuflags_avx -%assign cpuflags_fma4 (1<<15)| cpuflags_avx -%assign cpuflags_fma3 (1<<16)| cpuflags_avx -%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1 -%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2 -%assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL - -%assign cpuflags_cache32 (1<<21) -%assign cpuflags_cache64 (1<<22) -%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<24) +%assign cpuflags_gfni (1<<13)| cpuflags_sse42 +%assign cpuflags_avx (1<<14)| cpuflags_sse42 +%assign cpuflags_xop (1<<15)| cpuflags_avx +%assign cpuflags_fma4 (1<<16)| cpuflags_avx +%assign cpuflags_fma3 (1<<17)| cpuflags_avx +%assign cpuflags_bmi1 (1<<18)| cpuflags_avx|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<19)| cpuflags_bmi1 +%assign cpuflags_avx2 (1<<20)| cpuflags_fma3|cpuflags_bmi2 +%assign cpuflags_avx512 (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL + +%assign cpuflags_cache32 (1<<22) +%assign cpuflags_cache64 (1<<23) +%assign cpuflags_aligned (1<<24) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<25) ; Returns a boolean value expressing whether or not the specified cpuflag is enabled. 
%definecpuflag(x) (cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) @@ -1418,6 +1419,9 @@ AVX_INSTR divss, sse, 1, 0, 0 AVX_INSTR dppd, sse4, 1, 1, 0 AVX_INSTR dpps, sse4, 1, 1, 0 AVX_INSTR extractps, sse4, 1 +AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0 +AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0 +AVX_INSTR gf2p8mulb, gfni, 0, 0, 0 AVX_INSTR haddpd, sse3, 1, 0, 0 AVX_INSTR haddps, sse3, 1, 0, 0 AVX_INSTR hsubpd, sse3, 1, 0, 0 -- 2.22.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/7] x86inc: Optimize VEX instruction encoding
From: Henrik Gramner Most VEX-encoded instructions require an additional byte to encode when src2 is a high register (e.g. x|ymm8..15). If the instruction is commutative we can swap src1 and src2 when doing so reduces the instruction length, e.g. vpaddw xmm0, xmm0, xmm8 -> vpaddw xmm0, xmm8, xmm0 --- libavutil/x86/x86inc.asm | 35 +-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index bc370a6186..39cba5db09 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -1244,9 +1244,40 @@ INIT_XMM %elif %0 >= 9 __instr %6, %7, %8, %9 %elif %0 == 8 -__instr %6, %7, %8 +%if avx_enabled && %5 +%xdefine __src1 %7 +%xdefine __src2 %8 +%ifnum regnumof%7 +%ifnum regnumof%8 +%if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 +; Most VEX-encoded instructions require an additional byte to encode when +; src2 is a high register (e.g. m8..15). If the instruction is commutative +; we can swap src1 and src2 when doing so reduces the instruction length. +%xdefine __src1 %8 +%xdefine __src2 %7 +%endif +%endif +%endif +__instr %6, __src1, __src2 +%else +__instr %6, %7, %8 +%endif %elif %0 == 7 -__instr %6, %7 +%if avx_enabled && %5 +%xdefine __src1 %6 +%xdefine __src2 %7 +%ifnum regnumof%6 +%ifnum regnumof%7 +%if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32 +%xdefine __src1 %7 +%xdefine __src2 %6 +%endif +%endif +%endif +__instr %6, __src1, __src2 +%else +__instr %6, %7 +%endif %else __instr %6 %endif -- 2.22.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 0/7] Import some x264asm patches from x264
Here are a few easy-to-import patches from x264. These are all after x264 commit 4a158b00 "x86inc: Correctly set mmreg variables" which FFmpeg already has (commit eb5f063e7c). It does not include the following commits: * 82721eae "x86inc: Add x86-32 PIC support macros" * 101bd27d "x86inc: Support N_PEXT bit on Mach-O" They would not apply cleanly because of existing differences between x264 and FFmpeg. The PIC one has a change to configure which would need remaking. Henrik Gramner (7): x86inc: Fix VEX -> EVEX instruction conversion x86inc: Optimize VEX instruction encoding x86inc: Improve SAVE/LOAD_MM_PERMUTATION macros x86inc: Turn 'movsxd' into 'movifnidn' on x86-32 x86inc: Make 'non-adjacent' default in the TAIL_CALL macro x86inc: Improve warnings for use of unsupported instructions x86inc: Add support for GFNI instructions libavutil/x86/x86inc.asm | 219 --- 1 file changed, 161 insertions(+), 58 deletions(-) -- 2.22.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 6/7] x86inc: Improve warnings for use of unsupported instructions
From: Henrik Gramner Warn when the following are used without the appropriate cpuflag: * YMM and ZMM registers * 'pextrw' with a memory operand * GPR instruction set extensions --- libavutil/x86/x86inc.asm | 120 +++ 1 file changed, 83 insertions(+), 37 deletions(-) diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index af35fe1e4d..d1b4c982fc 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -1216,8 +1216,22 @@ INIT_XMM %ifdef cpuname %if notcpuflag(%2) %error use of ``%1'' %2 instruction in cpuname function: current_function -%elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 +%elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2) %error use of ``%1'' sse2 instruction in cpuname function: current_function +%elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2) +%error use of ``%1'' avx2 instruction in cpuname function: current_function +%elif __sizeofreg == 16 && notcpuflag(sse) +%error use of ``%1'' sse instruction in cpuname function: current_function +%elif __sizeofreg == 32 && notcpuflag(avx) +%error use of ``%1'' avx instruction in cpuname function: current_function +%elif __sizeofreg == 64 && notcpuflag(avx512) +%error use of ``%1'' avx512 instruction in cpuname function: current_function +%elifidn %1, pextrw ; special case because the base instruction is mmx2, +%ifnid %6 ; but sse4 is required for memory operands +%if notcpuflag(sse4) +%error use of ``%1'' sse4 instruction in cpuname function: current_function +%endif +%endif %endif %endif %endif @@ -1379,38 +1393,38 @@ AVX_INSTR cmpunordpd, sse2, 1, 0, 1 AVX_INSTR cmpunordps, sse, 1, 0, 1 AVX_INSTR cmpunordsd, sse2, 1, 0, 0 AVX_INSTR cmpunordss, sse, 1, 0, 0 -AVX_INSTR comisd, sse2 -AVX_INSTR comiss, sse -AVX_INSTR cvtdq2pd, sse2 -AVX_INSTR cvtdq2ps, sse2 -AVX_INSTR cvtpd2dq, sse2 -AVX_INSTR cvtpd2ps, sse2 -AVX_INSTR cvtps2dq, sse2 -AVX_INSTR cvtps2pd, sse2 -AVX_INSTR cvtsd2si, sse2 +AVX_INSTR comisd, sse2, 1 +AVX_INSTR comiss, sse, 1 +AVX_INSTR 
cvtdq2pd, sse2, 1 +AVX_INSTR cvtdq2ps, sse2, 1 +AVX_INSTR cvtpd2dq, sse2, 1 +AVX_INSTR cvtpd2ps, sse2, 1 +AVX_INSTR cvtps2dq, sse2, 1 +AVX_INSTR cvtps2pd, sse2, 1 +AVX_INSTR cvtsd2si, sse2, 1 AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 AVX_INSTR cvtsi2ss, sse, 1, 0, 0 AVX_INSTR cvtss2sd, sse2, 1, 0, 0 -AVX_INSTR cvtss2si, sse -AVX_INSTR cvttpd2dq, sse2 -AVX_INSTR cvttps2dq, sse2 -AVX_INSTR cvttsd2si, sse2 -AVX_INSTR cvttss2si, sse +AVX_INSTR cvtss2si, sse, 1 +AVX_INSTR cvttpd2dq, sse2, 1 +AVX_INSTR cvttps2dq, sse2, 1 +AVX_INSTR cvttsd2si, sse2, 1 +AVX_INSTR cvttss2si, sse, 1 AVX_INSTR divpd, sse2, 1, 0, 0 AVX_INSTR divps, sse, 1, 0, 0 AVX_INSTR divsd, sse2, 1, 0, 0 AVX_INSTR divss, sse, 1, 0, 0 AVX_INSTR dppd, sse4, 1, 1, 0 AVX_INSTR dpps, sse4, 1, 1, 0 -AVX_INSTR extractps, sse4 +AVX_INSTR extractps, sse4, 1 AVX_INSTR haddpd, sse3, 1, 0, 0 AVX_INSTR haddps, sse3, 1, 0, 0 AVX_INSTR hsubpd, sse3, 1, 0, 0 AVX_INSTR hsubps, sse3, 1, 0, 0 AVX_INSTR insertps, sse4, 1, 1, 0 AVX_INSTR lddqu, sse3 -AVX_INSTR ldmxcsr, sse +AVX_INSTR ldmxcsr, sse, 1 AVX_INSTR maskmovdqu, sse2 AVX_INSTR maxpd, sse2, 1, 0, 1 AVX_INSTR maxps, sse, 1, 0, 1 @@ -1420,10 +1434,10 @@ AVX_INSTR minpd, sse2, 1, 0, 1 AVX_INSTR minps, sse, 1, 0, 1 AVX_INSTR minsd, sse2, 1, 0, 0 AVX_INSTR minss, sse, 1, 0, 0 -AVX_INSTR movapd, sse2 -AVX_INSTR movaps, sse +AVX_INSTR movapd, sse2, 1 +AVX_INSTR movaps, sse, 1 AVX_INSTR movd, mmx -AVX_INSTR movddup, sse3 +AVX_INSTR movddup, sse3, 1 AVX_INSTR movdqa, sse2 AVX_INSTR movdqu, sse2 AVX_INSTR movhlps, sse, 1, 0, 0 @@ -1432,19 +1446,19 @@ AVX_INSTR movhps, sse, 1, 0, 0 AVX_INSTR movlhps, sse, 1, 0, 0 AVX_INSTR movlpd, sse2, 1, 0, 0 AVX_INSTR movlps, sse, 1, 0, 0 -AVX_INSTR movmskpd, sse2 -AVX_INSTR movmskps, sse +AVX_INSTR movmskpd, sse2, 1 +AVX_INSTR movmskps, sse, 1 AVX_INSTR movntdq, sse2 AVX_INSTR movntdqa, sse4 -AVX_INSTR movntpd, sse2 -AVX_INSTR movntps, sse +AVX_INSTR movntpd, sse2, 1 +AVX_INSTR movntps, sse, 1 AVX_INSTR movq, mmx 
AVX_INSTR movsd, sse2, 1, 0, 0 -AVX_INSTR movshdup, sse3 -AVX_INSTR movsldup, sse3 +AVX_INSTR movshdup, sse3, 1 +AVX_INSTR movsldup, sse3, 1 AVX_INSTR movss, sse, 1, 0, 0 -AVX_INSTR movupd, sse2 -AVX_INSTR movups, sse +AVX_INSTR movupd, sse2, 1 +AVX_INSTR movups, sse, 1 AVX_INSTR mpsadbw, sse4, 0, 1, 0 AVX_INSTR mulpd, sse2, 1, 0, 1 AVX_INSTR mulps, sse, 1, 0, 1 @@ -1577,27 +1591,27 @@ AVX_INSTR punpcklwd, mmx, 0, 0, 0 AVX_INSTR punpckldq, mmx, 0, 0, 0 AVX_INSTR punpcklqdq, sse2, 0, 0, 0 AVX_INSTR pxor, mmx, 0, 0, 1 -AVX_INSTR rcpps, sse +AVX_INSTR rcpps, sse, 1 AVX_INST
[FFmpeg-devel] [PATCH 5/7] x86inc: Make 'non-adjacent' default in the TAIL_CALL macro
From: Henrik Gramner --- libavutil/x86/x86inc.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index 04dbb6b785..af35fe1e4d 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -685,7 +685,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp -%macro TAIL_CALL 2 ; callee, is_nonadjacent +%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent %if has_epilogue call %1 RET -- 2.22.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 4/7] x86inc: Turn 'movsxd' into 'movifnidn' on x86-32
From: Henrik Gramner --- libavutil/x86/x86inc.asm | 4 1 file changed, 4 insertions(+) diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index 10b7711637..04dbb6b785 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -293,6 +293,10 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %endif %endmacro +%if ARCH_X86_64 == 0 +%define movsxd movifnidn +%endif + %macro movsxdifnidn 2 %ifnidn %1, %2 movsxd %1, %2 -- 2.22.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/7] x86inc: Fix VEX -> EVEX instruction conversion
From: Henrik Gramner There's an edge case that wasn't properly handled. --- libavutil/x86/x86inc.asm | 5 + 1 file changed, 5 insertions(+) diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index 5044ee86f0..bc370a6186 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -1662,6 +1662,11 @@ FMA4_INSTR fnmsub, pd, ps, sd, ss %assign %%evex_required 1 %endif %endif +%ifnum regnumof%3 +%if regnumof%3 >= 16 || sizeof%3 > 32 +%assign %%evex_required 1 +%endif +%endif %if %%evex_required %6 %%args %else -- 2.22.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] Issues while encoding a ts file to m3u8
On 2019-08-02 15:55, Ramana Jajula wrote: > Hi, > > I am trying to encode my ts file m3u8 using my customised ffmpeg of version > 4.1. I used below command to do encoding. > > ffmpeg -re -threads 8 -i /videos/input.ts -vcodec libx264 -s 320x240 -b:v > 512000 -maxrate 512000 -acodec libfdk_aac -b:a 32000 -ac 2 -ar 48000 > -force_key_frames 'expr:gte(t,n_forced*3)' -hls_flags single_file > -hls_list_size 0 -hls_time 3 -fsize 400x222 -frames /frames/my_frames/ > -index /mpegindex/my_index.idx -y /encoded/test/output.m3u8 > > My encoding was bad. The output printed to console is > libavutil 56. 22.100 / 56. 22.100 > libavcodec 58. 35.100 / 58. 35.100 > libavformat58. 20.100 / 58. 20.100 > libavdevice58. 5.100 / 58. 5.100 > libavfilter 7. 40.101 / 7. 40.101 > libavresample 4. 0. 0 / 4. 0. 0 > libswscale 5. 3.100 / 5. 3.100 > libswresample 3. 3.100 / 3. 3.100 > libpostproc55. 3.100 / 55. 3.100 > /videos/input.ts FPS 25.00 0 > Input #0, mpegts, from '/videos/.input.ts': > Duration: 00:04:05.97, start: 85837.091689, bitrate: 1769 kb/s > Program 1 > Stream #0:0[0x105]: Video: h264 (Main) ([27][0][0][0] / 0x001B), > yuv420p(top first), 1920x1080 [SAR 1:1 DAR 16:9], 25 fps, 25 tbr, 90k tbn, > 50 tbc > Stream #0:1[0x106]: Audio: ac3 ([129][0][0][0] / 0x0081), 48000 Hz, > stereo, fltp, 128 kb/s > [libx264 @ 0x564a2f7cc480] VBV maxrate specified, but no bufsize, ignored > [libx264 @ 0x564a2f7cc480] using SAR=4/3 > [libx264 @ 0x564a2f7cc480] using cpu capabilities: MMX2 SSE2Fast SSSE3 > SSE4.2 > [libx264 @ 0x564a2f7cc480] profile High, level 2.0 > [libx264 @ 0x564a2f7cc480] 264 - core 148 r2748 97eaef2 - H.264/MPEG-4 AVC > codec - Copyleft 2003-2016 - http://www.videolan.org/x264.html - options: > cabac=1 ref=3 debloc > k=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 > me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 > fast_pskip=1 chroma_qp_offset > =-2 threads=3 lookahead_threads=1 sliced_threads=0 nr=0 decimate=1 > interlaced=0 
bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 > b_adapt=1 b_bias=0 direct=1 wei > ghtb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 > intra_refresh=0 rc_lookahead=40 rc=abr mbtree=1 bitrate=512 ratetol=1.0 > qcomp=0.60 qpmin=0 qpmax=69 qpst > ep=4 ip_ratio=1.40 aq=1:1.00 > [hls @ 0x564a2f7ccc40] Using AVStream.codec to pass codec parameters to > muxers is deprecated, use AVStream.codecpar instead. > Last message repeated 1 times > [hls @ 0x564a2f7ccc40] Opening '/encodedt/input.ts' for writing > Output #0, hls, to '/encoded/output.m3u8': > Metadata: > encoder : Lavf58.20.100 > Stream #0:0: Video: h264 (libx264), yuv420p, 320x240 [SAR 4:3 DAR > 16:9], q=-1--1, 512 kb/s, 25 fps, 90k tbn, 25 tbc > Metadata: > encoder : Lavc58.35.100 libx264 > Side data: > cpb: bitrate max/min/avg: 512000/0/512000 buffer size: 0 vbv_delay: -1 > Stream #0:1: Audio: aac (libfdk_aac), 48000 Hz, stereo, s16, 32 kb/s > Metadata: > encoder : Lavc58.35.100 libfdk_aac > Stream mapping: > Stream #0:0 -> #0:0 (h264 (native) -> h264 (libx264)) > Stream #0:1 -> #0:1 (ac3 (native) -> aac (libfdk_aac)) > Press [q] to stop, [?] for help > frame= 34 fps=0.1 q=0.0 size=N/A time=00:05:02.11 bitrate=N/A dup=29 > drop=0 speed=0.567x > [hls @ 0x564a2f7ccc40] Packets poorly interleaved, failed to avoid negative > timestamp -3360 in stream 0.0.567x > Try -max_interleave_delta 0 as a possible workaround. > > Since the encoding speed is too slow I had to cancel the encoding process. > I killed it, > > What is the reason for this slow encoding process? > > PS: My input file is of 1 hour duration. > 1 - Wrong mailing list. This should probably be on ffmpeg-user. 2 - What configure options did you use for ffmpeg? Why did you remove them? 3 - What "modifications" have you made"? 4 - What CPU do you have? One without AVX is either old, or limited (like Celerons and Pentiums) 5 - Why are you using an x264 from 2016? Have you "modified" it too? 
Next time just press 'q' to end encoding so we can see some stats. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 1/5] lavu/pixfmt: add Y210/AYUV/Y410 pixel formats
On 2019-06-28 03:03, Hendrik Leppkes wrote: > On Fri, Jun 28, 2019 at 1:26 AM James Darnley wrote: >> >> On 2019-06-28 04:26, Linjie Fu wrote: >>> Previously, media driver provided planar format(like 420 8 bit), but >>> for HEVC Range Extension (422/444 8/10 bit), the decoded image is >>> produced in packed format. >>> >>> Y210/AYUV/Y410 are packed formats which are needed in HEVC Rext decoding >>> for both VAAPI and QSV: >>> - Y210: 422 10 BIT >>> - AYUV: 444 8 BIT >>> - Y410: 444 10 BIT >>> >> >> >> Why am I suspicious that at least one of those is a re-ordered v210? I >> seem to recall that we rejected adding v210 to this list. Either they >> don't belong in this list or they don't belong because libavcodec has a >> proper decoder (at least for v210). >> > > They are not quite as bad as v210 (and not related). > > Microsoft documents them here as the recommended formats to be used on > Windows: > https://docs.microsoft.com/en-us/windows/desktop/medfound/recommended-8-bit-yuv-formats-for-video-rendering#444-formats-32-bits-per-pixel > https://docs.microsoft.com/en-us/windows/desktop/medfound/10-bit-and-16-bit-yuv-video-formats > > - Hendrik Okay y410 and y210 use the highest 10 bits in each 16-bit word. I apologise for jumping to that conclusion. signature.asc Description: OpenPGP digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 1/5] lavu/pixfmt: add Y210/AYUV/Y410 pixel formats
On 2019-06-28 04:26, Linjie Fu wrote: > Previously, media driver provided planar format(like 420 8 bit), but > for HEVC Range Extension (422/444 8/10 bit), the decoded image is > produced in packed format. > > Y210/AYUV/Y410 are packed formats which are needed in HEVC Rext decoding > for both VAAPI and QSV: > - Y210: 422 10 BIT > - AYUV: 444 8 BIT > - Y410: 444 10 BIT > Why am I suspicious that at least one of those is a re-ordered v210? I seem to recall that we rejected adding v210 to this list. Either they don't belong in this list or they don't belong because libavcodec has a proper decoder (at least for v210). This might be the thread I was remembering but March seems too recent > https://ffmpeg.org/pipermail/ffmpeg-devel/2019-March/241549.html No real conclusion was reached there. Do bit-packed formats belong in an AVPixelFormat? signature.asc Description: OpenPGP digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] avcodec: Add librav1e encoder
On 2019-05-28 22:00, Derek Buitenhuis wrote: > On 28/05/2019 20:58, James Almer wrote: >> I think x26* and vpx/aom call it crf? It's not in option_tables.h in any >> case. > > They do not. This is a constant quantizer mode, not constant rate factor. IIRC either qp or cqp signature.asc Description: OpenPGP digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 1/7] libavfilter/vf_overlay.c: change the commands style for the macro defined function
On 2019-05-24 12:06, James Darnley wrote: > On 2019-05-24 11:36, lance.lmw...@gmail.com wrote: >> From: Limin Wang >> >> ... > > Why? I see why: so you don't screw up the macros you create later. signature.asc Description: OpenPGP digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 1/7] libavfilter/vf_overlay.c: change the commands style for the macro defined function
On 2019-05-24 11:36, lance.lmw...@gmail.com wrote: > From: Limin Wang > > ... Why? And these are "comments" not "commands". signature.asc Description: OpenPGP digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] avcodec/v210dec: Fix alignment check for AVX2
On 2019-05-18 12:15, Michael Niedermayer wrote: > On Sat, May 18, 2019 at 12:02:55PM +0200, James Darnley wrote: >> I object to the commit message though because it isn't a "null pointer >> dereference" but if that is the error as reported by the tool then keep >> it as is. > > yes, the tool(s) say things like "Null-dereference READ", "SEGV on unknown > address 0x" > Hm. It is almost certainly an aligned move on an unaligned address. I don't care that much about the rest of the commit message; the subject is correct which is good enough. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] avcodec/v210dec: Fix alignment check for AVX2
On 2019-05-18 09:39, Michael Niedermayer wrote: > Fixes: "null pointer dereference" > Fixes: > 14551/clusterfuzz-testcase-minimized-ffmpeg_AV_CODEC_ID_V210_fuzzer-5088609952071680 > > Found-by: continuous fuzzing process > https://github.com/google/oss-fuzz/tree/master/projects/ffmpeg > Signed-off-by: Michael Niedermayer > --- > libavcodec/v210dec.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c > index bc1e1d34ff..5a33d8c089 100644 > --- a/libavcodec/v210dec.c > +++ b/libavcodec/v210dec.c > @@ -104,7 +104,7 @@ static int decode_frame(AVCodecContext *avctx, void > *data, int *got_frame, > && avpkt->size - 64 >= stride * avctx->height) > psrc += 64; > > -aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf); > +aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f); > if (aligned_input != s->aligned_input) { > s->aligned_input = aligned_input; > ff_v210dec_init(s); > Ah yes, that'll be needed after the recent addition of avx2. LGTM and sorry. I object to the commit message though because it isn't a "null pointer dereference" but if that is the error as reported by the tool then keep it as is. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 0/3] v210dec checkasm test and avx2 function
On 2019-04-10 14:47, James Darnley wrote: > I am resending this my patches because I am not sure if I sent this version in > the past. I split my changes into two patches because they do separate > things. > > I also changed some tabs to spaces in Mike's AVX2 patch. > > James Darnley (2): > avcodec/v210dec: move DSP function setting into dedicated function > checkasm: add test for v210dec > > Michael Stoner (1): > libavcodec Adding ff_v210_planar_unpack AVX2 > > libavcodec/v210dec.c | 26 + > libavcodec/v210dec.h | 1 + > libavcodec/x86/v210-init.c | 8 > libavcodec/x86/v210.asm| 72 +++ > tests/checkasm/Makefile| 1 + > tests/checkasm/checkasm.c | 3 ++ > tests/checkasm/checkasm.h | 1 + > tests/checkasm/v210dec.c | 77 ++ > 8 files changed, 166 insertions(+), 23 deletions(-) > create mode 100644 tests/checkasm/v210dec.c > Any objections to this patchset? I have corrected the address of Michael's patch to the address I Cced. I hope that's the right one. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 3/3] libavcodec Adding ff_v210_planar_unpack AVX2
On 2019-04-10 14:47, James Darnley wrote: > From: Michael Stoner Screw you mailing list or git, whichever one of you managed to screw up the author's address. I will correct that, if I can. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/3] avcodec/v210dec: move DSP function setting into dedicated function
Prepare for checkasm test. --- libavcodec/v210dec.c | 16 ++-- libavcodec/v210dec.h | 1 + 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c index ddc5dbe8be..fd8a6b0d78 100644 --- a/libavcodec/v210dec.c +++ b/libavcodec/v210dec.c @@ -50,6 +50,13 @@ static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, } } +av_cold void ff_v210dec_init(V210DecContext *s) +{ +s->unpack_frame = v210_planar_unpack_c; +if (ARCH_X86) +ff_v210_x86_init(s); +} + static av_cold int decode_init(AVCodecContext *avctx) { V210DecContext *s = avctx->priv_data; @@ -57,10 +64,8 @@ static av_cold int decode_init(AVCodecContext *avctx) avctx->pix_fmt = AV_PIX_FMT_YUV422P10; avctx->bits_per_raw_sample = 10; -s->unpack_frame= v210_planar_unpack_c; - -if (HAVE_MMX) -ff_v210_x86_init(s); +s->aligned_input = 0; +ff_v210dec_init(s); return 0; } @@ -102,8 +107,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf); if (aligned_input != s->aligned_input) { s->aligned_input = aligned_input; -if (HAVE_MMX) -ff_v210_x86_init(s); +ff_v210dec_init(s); } if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h index 533afc435c..cfdb29da09 100644 --- a/libavcodec/v210dec.h +++ b/libavcodec/v210dec.h @@ -31,6 +31,7 @@ typedef struct { void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); } V210DecContext; +void ff_v210dec_init(V210DecContext *s); void ff_v210_x86_init(V210DecContext *s); #endif /* AVCODEC_V210DEC_H */ -- 2.21.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 3/3] libavcodec Adding ff_v210_planar_unpack AVX2
From: Michael Stoner Replaced VSHUFPS with VPBLENDD to relieve port 5 bottleneck AVX2 is 1.4x faster than AVX --- Mike, is this still the patch you want applied. I had to make a small amendment to it because you had some tabs as indentation. libavcodec/v210dec.c | 10 +- libavcodec/x86/v210-init.c | 8 + libavcodec/x86/v210.asm| 72 +- 3 files changed, 73 insertions(+), 17 deletions(-) diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c index fd8a6b0d78..bc1e1d34ff 100644 --- a/libavcodec/v210dec.c +++ b/libavcodec/v210dec.c @@ -123,7 +123,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, const uint32_t *src = (const uint32_t*)psrc; uint32_t val; -w = (avctx->width / 6) * 6; +w = (avctx->width / 12) * 12; s->unpack_frame(src, y, u, v, w); y += w; @@ -131,6 +131,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, v += w >> 1; src += (w << 1) / 3; +if (w < avctx->width - 5) { +READ_PIXELS(u, y, v); +READ_PIXELS(y, u, y); +READ_PIXELS(v, y, u); +READ_PIXELS(y, v, y); +w += 6; +} + if (w < avctx->width - 1) { READ_PIXELS(u, y, v); diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c index d64dbca1a8..cb9a6cbd6a 100644 --- a/libavcodec/x86/v210-init.c +++ b/libavcodec/x86/v210-init.c @@ -21,9 +21,11 @@ extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_unaligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t 
*v, int width); av_cold void ff_v210_x86_init(V210DecContext *s) { @@ -36,6 +38,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s) if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) s->unpack_frame = ff_v210_planar_unpack_aligned_avx; + +if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2) +s->unpack_frame = ff_v210_planar_unpack_aligned_avx2; } else { if (cpu_flags & AV_CPU_FLAG_SSSE3) @@ -43,6 +48,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s) if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) s->unpack_frame = ff_v210_planar_unpack_unaligned_avx; + +if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2) +s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2; } #endif } diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm index c24c765e5b..706712313d 100644 --- a/libavcodec/x86/v210.asm +++ b/libavcodec/x86/v210.asm @@ -22,9 +22,14 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 + +; for AVX2 version only +v210_luma_permute: dd 0,1,2,4,5,6,7,7 ; 32-byte alignment required +v210_chroma_shuf2: db 0,1,2,3,4,5,8,9,10,11,12,13,-1,-1,-1,-1 +v210_luma_shuf_avx2: db 0,1,4,5,6,7,8,9,12,13,14,15,-1,-1,-1,-1 +v210_chroma_shuf_avx2: db 0,1,4,5,10,11,-1,-1,2,3,8,9,12,13,-1,-1 -v210_mask: times 4 dd 0x3ff v210_mult: dw 64,4,64,4,64,4,64,4 v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1 v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1 @@ -34,40 +39,65 @@ SECTION .text %macro v210_planar_unpack 1 ; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width) -cglobal v210_planar_unpack_%1, 5, 5, 7 +cglobal v210_planar_unpack_%1, 5, 5, 8 movsxdifnidn r4, r4d lear1, [r1+2*r4] addr2, r4 addr3, r4 negr4 -mova m3, [v210_mult] -mova m4, [v210_mask] -mova m5, [v210_luma_shuf] -mova m6, [v210_chroma_shuf] +VBROADCASTI128 m3, [v210_mult] +VBROADCASTI128 m5, [v210_chroma_shuf] + +%if cpuflag(avx2) +VBROADCASTI128 m4, [v210_luma_shuf_avx2] +VBROADCASTI128 m5, [v210_chroma_shuf_avx2] 
+mova m6, [v210_luma_permute] +VBROADCASTI128 m7, [v210_chroma_shuf2] +%else +VBROADCASTI128 m4, [v210_luma_shuf] +VBROADCASTI128 m5, [v210_chroma_shuf] +%endif + .loop: %ifidn %1, unaligned -movu m0, [r0] +movu m0, [r0]; yB v5 yA u5 y9 v4 y8 u4 y7 v3 y6 u3 y5 v2 y4 u2 y3 v1 y2 u1 y1 v0 y0 u0 %else mova m0, [r0] %endif pmullw m1, m0, m3 -psrld m0, 10 -psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5 -pand m0, m4 ; y0 __ u1 __ y3 __ v2
[FFmpeg-devel] [PATCH 0/3] v210dec checkasm test and avx2 function
I am resending my patches because I am not sure if I sent this version in the past. I split my changes into two patches because they do separate things. I also changed some tabs to spaces in Mike's AVX2 patch. James Darnley (2): avcodec/v210dec: move DSP function setting into dedicated function checkasm: add test for v210dec Michael Stoner (1): libavcodec Adding ff_v210_planar_unpack AVX2 libavcodec/v210dec.c | 26 + libavcodec/v210dec.h | 1 + libavcodec/x86/v210-init.c | 8 libavcodec/x86/v210.asm| 72 +++ tests/checkasm/Makefile| 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/v210dec.c | 77 ++ 8 files changed, 166 insertions(+), 23 deletions(-) create mode 100644 tests/checkasm/v210dec.c -- 2.21.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/3] checkasm: add test for v210dec
--- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/v210dec.c | 77 +++ 4 files changed, 82 insertions(+) create mode 100644 tests/checkasm/v210dec.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 8cc0bff2d1..886ae33167 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -25,6 +25,7 @@ AVCODECOBJS-$(CONFIG_JPEG2000_DECODER) += jpeg2000dsp.o AVCODECOBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o AVCODECOBJS-$(CONFIG_HEVC_DECODER) += hevc_add_res.o hevc_idct.o hevc_sao.o AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER) += utvideodsp.o +AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 9eec41e3c4..bf51e00eab 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -136,6 +136,9 @@ static const struct { #if CONFIG_UTVIDEO_DECODER { "utvideodsp", checkasm_check_utvideodsp }, #endif +#if CONFIG_V210_DECODER +{ "v210dec", checkasm_check_v210dec }, +#endif #if CONFIG_V210_ENCODER { "v210enc", checkasm_check_v210enc }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 9e8e879fd3..9b8d2f5419 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -69,6 +69,7 @@ void checkasm_check_sbrdsp(void); void checkasm_check_synth_filter(void); void checkasm_check_sw_rgb(void); void checkasm_check_utvideodsp(void); +void checkasm_check_v210dec(void); void checkasm_check_v210enc(void); void checkasm_check_vf_hflip(void); void checkasm_check_vf_threshold(void); diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c new file mode 100644 index 00..7dd50a8271 --- /dev/null +++ b/tests/checkasm/v210dec.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2019 James Darnley + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <string.h> +#include "checkasm.h" +#include "libavcodec/v210dec.h" + +static uint32_t get_v210(void) +{ +uint32_t t0 = rnd() & 0x3ff, + t1 = rnd() & 0x3ff, + t2 = rnd() & 0x3ff; +uint32_t value = t0 + | (t1 << 10) + | (t2 << 20); +return value; +} + +#define NUM_SAMPLES 2048 + +static void randomize_buffers(uint32_t *src0, uint32_t *src1, int len) +{ +for (int i = 0; i < len; i++) { +uint32_t value = get_v210(); +src0[i] = value; +src1[i] = value; +} +} + +void checkasm_check_v210dec(void) +{ +V210DecContext h; + +h.aligned_input = 0; +ff_v210dec_init(&h); + +if (check_func(h.unpack_frame, "v210_unpack")) { +uint32_t src0[NUM_SAMPLES/3]; +uint32_t src1[NUM_SAMPLES/3]; +uint16_t y0[NUM_SAMPLES/2]; +uint16_t y1[NUM_SAMPLES/2]; +uint16_t u0[NUM_SAMPLES/4]; +uint16_t u1[NUM_SAMPLES/4]; +uint16_t v0[NUM_SAMPLES/4]; +uint16_t v1[NUM_SAMPLES/4]; +declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +const int pixels = NUM_SAMPLES / 2 / 6 * 6; + +randomize_buffers(src0, src1, NUM_SAMPLES/3); +call_ref(src0, y0, u0, v0, pixels); +call_new(src1, y1, u1, v1, pixels); +if (memcmp(src0, src1, NUM_SAMPLES/3 * sizeof src0[0]) +|| memcmp(y0, y1, pixels * sizeof y0[0]) +|| memcmp(u0, u1, pixels/2 * sizeof u0[0]) +|| memcmp(v0, v1, 
pixels/2 * sizeof v0[0])) +fail(); +bench_new(src1, y1, u1, v1, pixels); +} +report("v210_unpack"); +} -- 2.21.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] libavcodec Adding ff_v210_planar_unpack AVX2
On 2019-03-26 21:22, Mike Stoner via ffmpeg-devel wrote: > Hello, > I’ve accounted for all feedback on this so far, I’m wondering if it is ready > to be pushed upstream? > > Here are my results from ‘checkasm’ (lower is better): > > v210_unpack_c: 1636 > v210_unpack_ssse3: 611 > v210_unpack_avx: 601 > v210_unpack_avx2: 423 > > I ran it 5 times and averaged the middle 3 results for each CPU target > (ignoring the highest and lowest time). > > https://patchwork.ffmpeg.org/patch/12325/ > > > Thanks… -Mike Sorry that I keep forgetting about this. I will try to make some time tomorrow to give this another look over. I'm not sure what order this and my checkasm patch should be applied in, which I also forgot about. Did anyone else make comments on either patch? signature.asc Description: OpenPGP digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/2] avcodec/v210dec: move DSP function setting into dedicated function
Prepare for checkasm test. --- libavcodec/v210dec.c | 16 ++-- libavcodec/v210dec.h | 1 + 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c index ddc5dbe8be..fd8a6b0d78 100644 --- a/libavcodec/v210dec.c +++ b/libavcodec/v210dec.c @@ -50,6 +50,13 @@ static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, } } +av_cold void ff_v210dec_init(V210DecContext *s) +{ +s->unpack_frame = v210_planar_unpack_c; +if (ARCH_X86) +ff_v210_x86_init(s); +} + static av_cold int decode_init(AVCodecContext *avctx) { V210DecContext *s = avctx->priv_data; @@ -57,10 +64,8 @@ static av_cold int decode_init(AVCodecContext *avctx) avctx->pix_fmt = AV_PIX_FMT_YUV422P10; avctx->bits_per_raw_sample = 10; -s->unpack_frame= v210_planar_unpack_c; - -if (HAVE_MMX) -ff_v210_x86_init(s); +s->aligned_input = 0; +ff_v210dec_init(s); return 0; } @@ -102,8 +107,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf); if (aligned_input != s->aligned_input) { s->aligned_input = aligned_input; -if (HAVE_MMX) -ff_v210_x86_init(s); +ff_v210dec_init(s); } if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h index 533afc435c..cfdb29da09 100644 --- a/libavcodec/v210dec.h +++ b/libavcodec/v210dec.h @@ -31,6 +31,7 @@ typedef struct { void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); } V210DecContext; +void ff_v210dec_init(V210DecContext *s); void ff_v210_x86_init(V210DecContext *s); #endif /* AVCODEC_V210DEC_H */ -- 2.20.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 2/2] checkasm: add test for v210dec
--- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/v210dec.c | 77 +++ 4 files changed, 82 insertions(+) create mode 100644 tests/checkasm/v210dec.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 47b7b06d28..70abc1a407 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -25,6 +25,7 @@ AVCODECOBJS-$(CONFIG_JPEG2000_DECODER) += jpeg2000dsp.o AVCODECOBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o AVCODECOBJS-$(CONFIG_HEVC_DECODER) += hevc_add_res.o hevc_idct.o hevc_sao.o AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER) += utvideodsp.o +AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 9eec41e3c4..bf51e00eab 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -136,6 +136,9 @@ static const struct { #if CONFIG_UTVIDEO_DECODER { "utvideodsp", checkasm_check_utvideodsp }, #endif +#if CONFIG_V210_DECODER +{ "v210dec", checkasm_check_v210dec }, +#endif #if CONFIG_V210_ENCODER { "v210enc", checkasm_check_v210enc }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 9e8e879fd3..9b8d2f5419 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -69,6 +69,7 @@ void checkasm_check_sbrdsp(void); void checkasm_check_synth_filter(void); void checkasm_check_sw_rgb(void); void checkasm_check_utvideodsp(void); +void checkasm_check_v210dec(void); void checkasm_check_v210enc(void); void checkasm_check_vf_hflip(void); void checkasm_check_vf_threshold(void); diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c new file mode 100644 index 00..7dd50a8271 --- /dev/null +++ b/tests/checkasm/v210dec.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2019 James Darnley + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <string.h> +#include "checkasm.h" +#include "libavcodec/v210dec.h" + +static uint32_t get_v210(void) +{ +uint32_t t0 = rnd() & 0x3ff, + t1 = rnd() & 0x3ff, + t2 = rnd() & 0x3ff; +uint32_t value = t0 + | (t1 << 10) + | (t2 << 20); +return value; +} + +#define NUM_SAMPLES 2048 + +static void randomize_buffers(uint32_t *src0, uint32_t *src1, int len) +{ +for (int i = 0; i < len; i++) { +uint32_t value = get_v210(); +src0[i] = value; +src1[i] = value; +} +} + +void checkasm_check_v210dec(void) +{ +V210DecContext h; + +h.aligned_input = 0; +ff_v210dec_init(&h); + +if (check_func(h.unpack_frame, "v210_unpack")) { +uint32_t src0[NUM_SAMPLES/3]; +uint32_t src1[NUM_SAMPLES/3]; +uint16_t y0[NUM_SAMPLES/2]; +uint16_t y1[NUM_SAMPLES/2]; +uint16_t u0[NUM_SAMPLES/4]; +uint16_t u1[NUM_SAMPLES/4]; +uint16_t v0[NUM_SAMPLES/4]; +uint16_t v1[NUM_SAMPLES/4]; +declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +const int pixels = NUM_SAMPLES / 2 / 6 * 6; + +randomize_buffers(src0, src1, NUM_SAMPLES/3); +call_ref(src0, y0, u0, v0, pixels); +call_new(src1, y1, u1, v1, pixels); +if (memcmp(src0, src1, NUM_SAMPLES/3 * sizeof src0[0]) +|| memcmp(y0, y1, pixels * sizeof y0[0]) +|| memcmp(u0, u1, pixels/2 * sizeof u0[0]) +|| memcmp(v0, v1, 
pixels/2 * sizeof v0[0])) +fail(); +bench_new(src1, y1, u1, v1, pixels); +} +report("v210_unpack"); +} -- 2.20.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avcodec/v210dec: move DSP function setting into dedicated function
Prepare for checkasm test. --- libavcodec/v210dec.c | 16 ++-- libavcodec/v210dec.h | 1 + 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c index ddc5dbe8be..6db662538e 100644 --- a/libavcodec/v210dec.c +++ b/libavcodec/v210dec.c @@ -50,6 +50,14 @@ static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, } } +av_cold void ff_v210dec_init(V210DecContext *s) +{ +s->unpack_frame = v210_planar_unpack_c; +s->aligned_input = 0; +if (ARCH_X86) +ff_v210_x86_init(s); +} + static av_cold int decode_init(AVCodecContext *avctx) { V210DecContext *s = avctx->priv_data; @@ -57,10 +65,7 @@ static av_cold int decode_init(AVCodecContext *avctx) avctx->pix_fmt = AV_PIX_FMT_YUV422P10; avctx->bits_per_raw_sample = 10; -s->unpack_frame= v210_planar_unpack_c; - -if (HAVE_MMX) -ff_v210_x86_init(s); +ff_v210dec_init(s); return 0; } @@ -102,8 +107,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf); if (aligned_input != s->aligned_input) { s->aligned_input = aligned_input; -if (HAVE_MMX) -ff_v210_x86_init(s); +ff_v210dec_init(s); } if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h index 533afc435c..cfdb29da09 100644 --- a/libavcodec/v210dec.h +++ b/libavcodec/v210dec.h @@ -31,6 +31,7 @@ typedef struct { void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); } V210DecContext; +void ff_v210dec_init(V210DecContext *s); void ff_v210_x86_init(V210DecContext *s); #endif /* AVCODEC_V210DEC_H */ -- 2.20.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] checkasm: add test for v210dec
On 2019-03-06 20:31, James Darnley wrote: > ... Wrong patch and wrong reference. Please ignore this. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] checkasm: add test for v210dec
--- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/v210dec.c | 76 +++ 4 files changed, 81 insertions(+) create mode 100644 tests/checkasm/v210dec.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 47b7b06d28..70abc1a407 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -25,6 +25,7 @@ AVCODECOBJS-$(CONFIG_JPEG2000_DECODER) += jpeg2000dsp.o AVCODECOBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o AVCODECOBJS-$(CONFIG_HEVC_DECODER) += hevc_add_res.o hevc_idct.o hevc_sao.o AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER) += utvideodsp.o +AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 9eec41e3c4..bf51e00eab 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -136,6 +136,9 @@ static const struct { #if CONFIG_UTVIDEO_DECODER { "utvideodsp", checkasm_check_utvideodsp }, #endif +#if CONFIG_V210_DECODER +{ "v210dec", checkasm_check_v210dec }, +#endif #if CONFIG_V210_ENCODER { "v210enc", checkasm_check_v210enc }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 9e8e879fd3..9b8d2f5419 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -69,6 +69,7 @@ void checkasm_check_sbrdsp(void); void checkasm_check_synth_filter(void); void checkasm_check_sw_rgb(void); void checkasm_check_utvideodsp(void); +void checkasm_check_v210dec(void); void checkasm_check_v210enc(void); void checkasm_check_vf_hflip(void); void checkasm_check_vf_threshold(void); diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c new file mode 100644 index 00..7320ed5e37 --- /dev/null +++ b/tests/checkasm/v210dec.c @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2019 James Darnley + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <string.h> +#include "checkasm.h" +#include "libavcodec/v210dec.h" + +static uint32_t get_v210(void) +{ +uint32_t t0 = rnd() & 0x3ff, + t1 = rnd() & 0x3ff, + t2 = rnd() & 0x3ff; +uint32_t value = t0 + | (t1 << 10) + | (t2 << 20); +return value; +} + +#define NUM_SAMPLES 2048 + +static void randomize_buffers(uint32_t *src0, uint32_t *src1, int len) +{ +for (int i = 0; i < len; i++) { +uint32_t value = get_v210(); +src0[i] = value; +src1[i] = value; +} +} + +void checkasm_check_v210dec(void) +{ +V210DecContext h; + +ff_v210dec_init(&h); + +if (check_func(h.unpack_frame, "v210_unpack")) { +uint32_t src0[NUM_SAMPLES/3]; +uint32_t src1[NUM_SAMPLES/3]; +uint16_t y0[NUM_SAMPLES/2]; +uint16_t y1[NUM_SAMPLES/2]; +uint16_t u0[NUM_SAMPLES/4]; +uint16_t u1[NUM_SAMPLES/4]; +uint16_t v0[NUM_SAMPLES/4]; +uint16_t v1[NUM_SAMPLES/4]; +declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +const int pixels = NUM_SAMPLES / 2 / 6 * 6; + +randomize_buffers(src0, src1, NUM_SAMPLES/3); +call_ref(src0, y0, u0, v0, pixels); +call_new(src1, y1, u1, v1, pixels); +if (memcmp(src0, src1, NUM_SAMPLES/3 * sizeof src0[0]) +|| memcmp(y0, y1, pixels * sizeof y0[0]) +|| memcmp(u0, u1, pixels/2 * sizeof u0[0]) +|| memcmp(v0, v1, pixels/2 * sizeof v0[0])) 
+fail(); +bench_new(src1, y1, u1, v1, pixels); +} +report("v210_unpack"); +} -- 2.20.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 1/2] avcodec/v210dec: move DSP function setting into dedicated function
On 2019-03-06 10:11, Paul B Mahol wrote: > On 3/6/19, Carl Eugen Hoyos wrote: >> 2019-03-04 23:58 GMT+01:00, James Darnley : >>> Prepare for checkasm test. >>> --- >>> libavcodec/v210dec.c | 13 + >>> libavcodec/v210dec.h | 1 + >>> 2 files changed, 10 insertions(+), 4 deletions(-) >>> >>> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c >>> index ddc5dbe8be..28cf00d320 100644 >>> --- a/libavcodec/v210dec.c >>> +++ b/libavcodec/v210dec.c >>> @@ -50,6 +50,14 @@ static void v210_planar_unpack_c(const uint32_t *src, >>> uint16_t *y, uint16_t *u, >>> } >>> } >>> >>> +av_cold void ff_v210dec_init(V210DecContext *s) >>> +{ >>> +s->unpack_frame = v210_planar_unpack_c; >> >>> +s->aligned_input = 0; >> >> Isn't this an unrelated change or do I misunderstand? > > You misunderstand. Maybe. I need to initialize that member before it is used in the x86 function. I expect valgrind or similar would catch the use. It doesn't matter for normal use because it will be set correctly based on the input data alignment for each frame. Now that you mention it I realize I forgot to change that to call the new function so I will send a v2 later. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 2/2] checkasm: add test for v210dec
--- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/v210dec.c | 76 +++ 4 files changed, 81 insertions(+) create mode 100644 tests/checkasm/v210dec.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 47b7b06d28..70abc1a407 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -25,6 +25,7 @@ AVCODECOBJS-$(CONFIG_JPEG2000_DECODER) += jpeg2000dsp.o AVCODECOBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o AVCODECOBJS-$(CONFIG_HEVC_DECODER) += hevc_add_res.o hevc_idct.o hevc_sao.o AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER) += utvideodsp.o +AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 9eec41e3c4..bf51e00eab 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -136,6 +136,9 @@ static const struct { #if CONFIG_UTVIDEO_DECODER { "utvideodsp", checkasm_check_utvideodsp }, #endif +#if CONFIG_V210_DECODER +{ "v210dec", checkasm_check_v210dec }, +#endif #if CONFIG_V210_ENCODER { "v210enc", checkasm_check_v210enc }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 9e8e879fd3..9b8d2f5419 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -69,6 +69,7 @@ void checkasm_check_sbrdsp(void); void checkasm_check_synth_filter(void); void checkasm_check_sw_rgb(void); void checkasm_check_utvideodsp(void); +void checkasm_check_v210dec(void); void checkasm_check_v210enc(void); void checkasm_check_vf_hflip(void); void checkasm_check_vf_threshold(void); diff --git a/tests/checkasm/v210dec.c b/tests/checkasm/v210dec.c new file mode 100644 index 00..7320ed5e37 --- /dev/null +++ b/tests/checkasm/v210dec.c @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2019 James Darnley + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <string.h> +#include "checkasm.h" +#include "libavcodec/v210dec.h" + +static uint32_t get_v210(void) +{ +uint32_t t0 = rnd() & 0x3ff, + t1 = rnd() & 0x3ff, + t2 = rnd() & 0x3ff; +uint32_t value = t0 + | (t1 << 10) + | (t2 << 20); +return value; +} + +#define NUM_SAMPLES 2048 + +static void randomize_buffers(uint32_t *src0, uint32_t *src1, int len) +{ +for (int i = 0; i < len; i++) { +uint32_t value = get_v210(); +src0[i] = value; +src1[i] = value; +} +} + +void checkasm_check_v210dec(void) +{ +V210DecContext h; + +ff_v210dec_init(&h); + +if (check_func(h.unpack_frame, "v210_unpack")) { +uint32_t src0[NUM_SAMPLES/3]; +uint32_t src1[NUM_SAMPLES/3]; +uint16_t y0[NUM_SAMPLES/2]; +uint16_t y1[NUM_SAMPLES/2]; +uint16_t u0[NUM_SAMPLES/4]; +uint16_t u1[NUM_SAMPLES/4]; +uint16_t v0[NUM_SAMPLES/4]; +uint16_t v1[NUM_SAMPLES/4]; +declare_func(void, const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +const int pixels = NUM_SAMPLES / 2 / 6 * 6; + +randomize_buffers(src0, src1, NUM_SAMPLES/3); +call_ref(src0, y0, u0, v0, pixels); +call_new(src1, y1, u1, v1, pixels); +if (memcmp(src0, src1, NUM_SAMPLES/3 * sizeof src0[0]) +|| memcmp(y0, y1, pixels * sizeof y0[0]) +|| memcmp(u0, u1, pixels/2 * sizeof u0[0]) +|| memcmp(v0, v1, pixels/2 * sizeof v0[0])) 
+fail(); +bench_new(src1, y1, u1, v1, pixels); +} +report("v210_unpack"); +} -- 2.20.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 1/2] avcodec/v210dec: move DSP function setting into dedicated function
Prepare for checkasm test. --- libavcodec/v210dec.c | 13 + libavcodec/v210dec.h | 1 + 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c index ddc5dbe8be..28cf00d320 100644 --- a/libavcodec/v210dec.c +++ b/libavcodec/v210dec.c @@ -50,6 +50,14 @@ static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, } } +av_cold void ff_v210dec_init(V210DecContext *s) +{ +s->unpack_frame = v210_planar_unpack_c; +s->aligned_input = 0; +if (ARCH_X86) +ff_v210_x86_init(s); +} + static av_cold int decode_init(AVCodecContext *avctx) { V210DecContext *s = avctx->priv_data; @@ -57,10 +65,7 @@ static av_cold int decode_init(AVCodecContext *avctx) avctx->pix_fmt = AV_PIX_FMT_YUV422P10; avctx->bits_per_raw_sample = 10; -s->unpack_frame= v210_planar_unpack_c; - -if (HAVE_MMX) -ff_v210_x86_init(s); +ff_v210dec_init(s); return 0; } diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h index 533afc435c..cfdb29da09 100644 --- a/libavcodec/v210dec.h +++ b/libavcodec/v210dec.h @@ -31,6 +31,7 @@ typedef struct { void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); } V210DecContext; +void ff_v210dec_init(V210DecContext *s); void ff_v210_x86_init(V210DecContext *s); #endif /* AVCODEC_V210DEC_H */ -- 2.20.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] Added ff_v210_planar_unpack_aligned_avx2
On 2019-03-01 18:41, Michael Stoner wrote: > The AVX2 code leverages VPERMD to process 12 pixels/iteration. This is my > first patch submission so any comments are greatly appreciated. > > -Mike > > Tested on Skylake (Win32 & Win64) > 1920x1080 input frame > = > C code - 440 fps > SSSE3 - 920 fps > AVX- 930 fps > AVX2 - 1040 fps > > Regression tested at 1920x1080, 1280x720, and 352x288 > .loop: > %ifidn %1, unaligned > -movu m0, [r0] > +movu m0, [r0]; yB v5 yA u5 y9 v4 y8 u4 y7 v3 y6 > u3 y5 v2 y4 u2 y3 v1 y2 u1 y1 v0 y0 u0 > %else > mova m0, [r0] > %endif At first I didn't understand why you do so much seemingly unnecessary work. You don't change how the data is loaded into the register. After more in-depth reading I see now that you shuffle data around just so you can store the data with a single move for each plane. The chroma is below. > +%if cpuflag(avx2) > +vpermd m1, m6, m1 ; 00 v5 v4 v3 00 v2 v1 v0 00 u5 u4 u3 > 00 u2 u1 u0 > +pshufb m1, m7 ; 00 00 v5 v4 v3 v2 v1 v0 00 00 u5 u4 > u3 u2 u1 u0 > +movu [r2+r4], xm1 > +vextracti128 [r3+r4], m1, 1 > +%else > movq [r2+r4], m1 > movhps [r3+r4], m1 > +%endif Sounds commendable but I doubt the use of this many more shuffles gets you much over a naive AVX2 version (where you treat the high half of ymm like an unroll). > +; for AVX2 version only > +v210_luma_permute: dd 0,1,2,4,5,6,7,7 > +v210_chroma_permute: dd 0,1,4,5,2,3,6,7 Are you sure these can't be replaced with vpermq and its immediate operand? It really looks like the second could be. It'll save you a register. 
> -mova m3, [v210_mult] > -mova m4, [v210_mask] > -mova m5, [v210_luma_shuf] > -mova m6, [v210_chroma_shuf] > +mova m3, [v210_luma_shuf] > +mova m4, [v210_chroma_shuf1] > + > +%if cpuflag(avx2) > +mova m5, [v210_luma_permute] ; VPERMD constant must be in a > register > +mova m6, [v210_chroma_permute]; VPERMD constant must be in a > register > +mova m7, [v210_chroma_shuf2] > +%endif > + > +%if ARCH_X86_64 > +mova m8, [v210_mult] > +mova m9, [v210_mask] > +%endif > + It would let you clean this up a bit. My suggestion is to make the diff minimal by keeping the existing uses and if you still need more than 8 registers for avx2 then make it available for x86-64 only. Compare yours with the one I committed here https://github.com/Upipe/upipe/blob/master/lib/upipe-v210/v210dec.asm#L45 which is just FFmpeg's cleaned up a little plus avx2. I'm surprised it's not already in FFmpeg. You should do whatever is faster. signature.asc Description: OpenPGP digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] Added ff_v210_planar_unpack_aligned_avx2
On 2019-03-03 15:44, Martin Vignali wrote: > Hello, > > ... > > Not directly related to this patch, but it can be interesting for testing > purpose to write a checkasm test for the v210 func decoding. > So it's more easy to check the perf for "each" cpu flags, and be sure, the > various width cases works as expected. I can probably do that. I have one for v210 unpacking in a knock-off checkasm for another project. I will look over/review the submitted patch first. signature.asc Description: OpenPGP digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] Lossy GIF encoding
On 2019-02-15 10:01, Kornel wrote: > libavcodec/gif.c in ff_gif_encoder.pix_fmts seems to passively declare types > of pixel formats it accepts. If you want to experiment you can change that so it accepts rgb (also or only). Then you can implement and test what you want, then you can ask about submitting it. You can make your fancy encoding only available with rgb, or with some option and return an error when given pal8. signature.asc Description: OpenPGP digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avformat/matroskaenc: add reserve free space option
On 2018-09-06 19:39, Sigríður Regína Sigurþórsdóttir wrote: > +if (s->metadata_header_padding) { > +if (s->metadata_header_padding == 1) > +s->metadata_header_padding++; > +put_ebml_void(pb, s->metadata_header_padding); > +} Unfortunately I was forced to make the default -1 so you want to check that the value is greater than 0 rather than just true. Furthermore I think you will still want to add a Changelog entry noting that the matroska muxer will now honor metadata_header_padding. That may also want a micro version bump so that library users can check. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avformat/matroskaenc: add reserve free space option
On 2018-09-05 22:52, Sigríður Regína Sigurþórsdóttir wrote: > +{"reserve_free_space", "Reserve a given amount of space at the > beginning og the file for unspecified purpose." I added the "metadata_header_padding" global option many years ago. Can you not reuse it for this purpose? Is it not likely to be "metadata" that another software might fill this with? Also there is a typo in the bit I quoted. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] frame: Simplify the video allocation
On 2018-09-03 15:29, James Almer wrote: > pass 32 - 1 to both av_image_fill_pointers() calls directly? Please do not add a magic number where nobody will find it. Use one of the 3 already existing methods for knowing the alignment necessary for assembly. If this is unrelated, my apologies. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 1/3] diracdec: add 10-bit Haar SIMD functions
On 2018-07-27 15:05, Henrik Gramner wrote: > On Fri, Jul 27, 2018 at 1:47 PM, James Darnley wrote: >> On 2018-07-26 17:29, Rostislav Pehlivanov wrote: >>>> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w, >>>> x, b2 >>>> +DECLARE_REG_TMP 2,5 >>>> +%if ARCH_X86_64 >>>> +%define tail r6d >>>> +%else >>>> +%define tail dword wm >>>> +%endif >>>> + >>>> >>> >>> You can remove this whole bit, the init function only gets called if >>> ARCH_X86_64 is true. >> >> Where did you get that from? I don't require 64-bit for this. > > Can't you just use 7 GPR:s on x86-32 as well? I'm sure I've done that in the past and at least 1 platform has always complained due to PIE or stack alignment or whatever, I think. I went looking for an old email but couldn't find it. If you want me to try it I can. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 1/3] diracdec: add 10-bit Haar SIMD functions
On 2018-07-26 17:29, Rostislav Pehlivanov wrote: > On 26 July 2018 at 12:28, James Darnley wrote: > +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w >> +DECLARE_REG_TMP 4,5 >> + >> +mova m2, [pd_1] >> +mov r3d, wd >> +and wd, ~(mmsize/4 - 1) >> +shl wd, 2 >> +add b0q, wq >> +add b1q, wq >> +neg wq >> + >> +ALIGN 16 >> +.loop_simd: >> +mova m0, [b0q + wq] >> +mova m1, [b1q + wq] >> +paddd m3, m1, m2 >> +psrad m3, 1 >> +psubd m0, m3 >> +paddd m1, m0 >> +mova [b0q + wq], m0 >> +mova [b1q + wq], m1 >> +add wq, mmsize >> +jl .loop_simd >> + >> +and r3d, mmsize/4 - 1 >> +jz .end >> +.loop_scalar: >> +mov t0d, [b0q] >> +mov t1d, [b1q] >> +mov r2d, t1d >> +add r2d, 1 >> +sar r2d, 1 >> +sub t0d, r2d >> +add t1d, t0d >> +mov [b0q], t0d >> +mov [b1q], t1d >> + >> +add b0q, 4 >> +add b1q, 4 >> +sub r3d, 1 >> +jg .loop_scalar >> + >> +.end: >> +RET >> + >> +%endmacro >> + >> +%macro HAAR_HORIZONTAL 0 >> > + >> > > Could you remove this newline from every patch? All asm I've written and > seen keep them without a newline. It made me think there's something in the > asm which checked the value of the macro, not that the entire function is > macro'd. What? I don't understand what you mean. Do you think I have too many blank lines between things? 
> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w, >> x, b2 >> +DECLARE_REG_TMP 2,5 >> +%if ARCH_X86_64 >> +%define tail r6d >> +%else >> +%define tail dword wm >> +%endif >> + >> +mova m2, [pd_1] >> +xor xd, xd >> +shr wd, 1 >> +mov tail, wd >> +lea b2q, [bq + 4*wq] >> + >> +ALIGN 16 >> +.loop_lo: >> +mova m0, [bq + 4*xq] >> +movu m1, [b2q + 4*xq] >> +paddd m1, m2 >> +psrad m1, 1 >> +psubd m0, m1 >> +mova [temp_q + 4*xq], m0 >> +add xd, mmsize/4 >> +cmp xd, wd >> +jl .loop_lo >> + >> +xor xd, xd >> +and wd, ~(mmsize/4 - 1) >> + >> +ALIGN 16 >> +.loop_hi: >> +mova m0, [temp_q + 4*xq] >> +movu m1, [b2q+ 4*xq] >> +paddd m1, m0 >> +paddd m0, m2 >> +paddd m1, m2 >> +psrad m0, 1 >> +psrad m1, 1 >> +SBUTTERFLY dq, 0,1,3 >> +%if cpuflag(avx2) >> +SBUTTERFLY dqqq, 0,1,3 >> +%endif >> +mova [bq + 8*xq], m0 >> +mova [bq + 8*xq + mmsize], m1 >> +add xd, mmsize/4 >> +cmp xd, wd >> +jl .loop_hi >> + >> +and tail, mmsize/4 - 1 >> +jz .end >> +.loop_scalar: >> +mov t0d, [temp_q + 4*xq] >> +mov t1d, [b2q+ 4*xq] >> +add t1d, t0d >> +add t0d, 1 >> +add t1d, 1 >> +sar t0d, 1 >> +sar t1d, 1 >> +mov [bq + 8*xq], t0d >> +mov [bq + 8*xq + 4], t1d >> +add xq, 1 >> +sub tail, 1 >> +jg .loop_scalar >> + >> +.end: >> +REP_RET >> + >> +%endmacro >> + >> +INIT_XMM sse2 >> +HAAR_HORIZONTAL >> +HAAR_VERTICAL >> + >> +INIT_XMM avx >> +HAAR_HORIZONTAL >> +HAAR_VERTICAL >> > > You're not using any avx functions in that version, not unless a macro'd > instruction inserts one for you. I think you should remove the avx version > then. > Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per > version you can just make a single macro to do both versions at the same > time. Now that I think about it there will be only one 3-operand instruction in the SBUTTERFLY and the vertical function also only has 1. I will remove it. I can merge the two macros but I will look back at what I've done previously. I think it is usually 1 macro per function. 
> + >> +INIT_YMM avx2 >> +HAAR_HORIZONTAL >> +HAAR_VERTICAL >> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c >> b/libavcodec/x86/dirac_dwt_init_10bit.c >> new file mode 100644 >> index 00..289862d728 >> --- /d
[FFmpeg-devel] [PATCH 1/3] diracdec: add 10-bit Haar SIMD functions
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the relevant transform. C:119fps SSE2: 204fps AVX: 206fps AVX2: 221fps timer measurements, haar horizontal compose: sse2: 3.68x faster (45143 vs. 12279 decicycles) compared with C avx: 3.68x faster (45143 vs. 12275 decicycles) compared with C avx2: 5.16x faster (45143 vs. 8742 decicycles) compared with C haar vertical compose: sse2: 1.64x faster (31792 vs. 19377 decicycles) compared with C avx: 1.58x faster (31792 vs. 20090 decicycles) compared with C avx2: 1.66x faster (31792 vs. 19157 decicycles) compared with C --- libavcodec/dirac_dwt.c| 7 +- libavcodec/dirac_dwt.h| 1 + libavcodec/x86/Makefile | 6 +- libavcodec/x86/dirac_dwt_10bit.asm| 160 ++ libavcodec/x86/dirac_dwt_init_10bit.c | 76 5 files changed, 247 insertions(+), 3 deletions(-) create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c index cc08f8865a..86bee5bb9b 100644 --- a/libavcodec/dirac_dwt.c +++ b/libavcodec/dirac_dwt.c @@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type, return AVERROR_INVALIDDATA; } -if (ARCH_X86 && bit_depth == 8) +#if ARCH_X86 +if (bit_depth == 8) ff_spatial_idwt_init_x86(d, type); +else if (bit_depth == 10) +ff_spatial_idwt_init_10bit_x86(d, type); +#endif + return 0; } diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h index 994dc21d70..1ad7b9a821 100644 --- a/libavcodec/dirac_dwt.h +++ b/libavcodec/dirac_dwt.h @@ -88,6 +88,7 @@ enum dwt_type { int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type, int decomposition_count, int bit_depth); void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type); +void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type); void ff_spatial_idwt_slice2(DWTContext *d, int y); diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 2350c8bbee..590d83c167 100644 --- 
a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP)+= x86/blockdsp_init.o OBJS-$(CONFIG_BSWAPDSP)+= x86/bswapdsp_init.o OBJS-$(CONFIG_DCT) += x86/dct_init.o OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \ - x86/dirac_dwt_init.o + x86/dirac_dwt_init.o \ + x86/dirac_dwt_init_10bit.o OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o OBJS-$(CONFIG_FFT) += x86/fft_init.o OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o @@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)+= x86/diracdsp.o\ - x86/dirac_dwt.o + x86/dirac_dwt.o \ + x86/dirac_dwt_10bit.o X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER)+= x86/dnxhdenc.o X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm new file mode 100644 index 00..baea91329e --- /dev/null +++ b/libavcodec/x86/dirac_dwt_10bit.asm @@ -0,0 +1,160 @@ +;** +;* x86 optimized discrete 10-bit wavelet trasnform +;* Copyright (c) 2018 James Darnley +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pd_1 + +SECTION .text + +%macro HAAR_VERTICAL 0 + +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w +
[FFmpeg-devel] [PATCH 0/3 v2] x86 SIMD for dirac 10-bit wavelet transforms
I will ask the same question as last time. Is the AVX worth it in Haar? Also I am surprised that the AVX2 doesn't have a bigger difference on some of the vertical transforms. James Darnley (3): diracdec: add 10-bit Haar SIMD functions diracdec: add 10-bit Legall 5,3 (5_3) SIMD functions diracdec: add 10-bit Deslauriers-Dubuc 9,7 (9_7) vertical high-pass function libavcodec/dirac_dwt.c| 7 +- libavcodec/dirac_dwt.h| 1 + libavcodec/x86/Makefile | 6 +- libavcodec/x86/dirac_dwt_10bit.asm| 302 ++ libavcodec/x86/dirac_dwt_init_10bit.c | 118 ++ 5 files changed, 431 insertions(+), 3 deletions(-) create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c -- 2.18.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 3/3] diracdec: add 10-bit Deslauriers-Dubuc 9, 7 (9_7) vertical high-pass function
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the relevant transform. C: 84fps SSE2: 111fps AVX2: 115fps dd97 vertical hi sse2: 2.77x faster (31773 vs. 11457 decicycles) compared with C avx2: 3.83x faster (31773 vs. 8297 decicycles) compared with C --- libavcodec/x86/dirac_dwt_10bit.asm| 39 +++ libavcodec/x86/dirac_dwt_init_10bit.c | 29 2 files changed, 68 insertions(+) diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm index 0295e6f554..2ed77fe3b0 100644 --- a/libavcodec/x86/dirac_dwt_10bit.asm +++ b/libavcodec/x86/dirac_dwt_10bit.asm @@ -25,6 +25,7 @@ SECTION_RODATA 32 cextern pd_1 pd_2: times 8 dd 2 +pd_8: times 8 dd 8 SECTION .text @@ -246,7 +247,44 @@ RET %endmacro +%macro DD97_VERTICAL_HI 0 + +cglobal dd97_vertical_hi, 6, 6, 8, b0, b1, b2, b3, b4, w +mova m7, [pd_8] +shl wd, 2 +add b0q, wq +add b1q, wq +add b2q, wq +add b3q, wq +add b4q, wq +neg wq + +ALIGN 16 +.loop: +mova m0, [b0q + wq] +mova m1, [b1q + wq] +mova m2, [b2q + wq] +mova m3, [b3q + wq] +mova m4, [b4q + wq] +pslld m5, m1, 3 +pslld m6, m3, 3 +paddd m5, m1 +paddd m6, m3 +psubd m5, m0 +psubd m6, m4 +paddd m5, m7 +paddd m5, m6 +psrad m5, 4 +paddd m2, m5 +mova [b2q + wq], m2 +add wq, mmsize +jl .loop +RET + +%endmacro + INIT_XMM sse2 +DD97_VERTICAL_HI HAAR_HORIZONTAL HAAR_VERTICAL LEGALL53_VERTICAL_HI @@ -257,6 +295,7 @@ HAAR_HORIZONTAL HAAR_VERTICAL INIT_YMM avx2 +DD97_VERTICAL_HI HAAR_HORIZONTAL HAAR_VERTICAL LEGALL53_VERTICAL_HI diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c index d1234efac5..a9ac603bc5 100644 --- a/libavcodec/x86/dirac_dwt_init_10bit.c +++ b/libavcodec/x86/dirac_dwt_init_10bit.c @@ -23,6 +23,9 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/dirac_dwt.h" +void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t *b3, int32_t *b4, int width); +void ff_dd97_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t *b3, int32_t *b4, int width); + void 
ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); void ff_legall53_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int width); @@ -36,6 +39,24 @@ void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_ali void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align); void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align); +static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, + int32_t *b3, int32_t *b4, int width) +{ +int i = width & ~3; +ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i); +for(; ivertical_compose_h0 = (void*)dd97_vertical_hi_sse2; +d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_sse2; +break; case DWT_DIRAC_LEGALL5_3: d->vertical_compose_h0 = (void*)ff_legall53_vertical_hi_sse2; d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_sse2; @@ -71,6 +96,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type) if (EXTERNAL_AVX2(cpu_flags)) { switch (type) { +case DWT_DIRAC_DD9_7: +d->vertical_compose_h0 = (void*)dd97_vertical_hi_avx2; +d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_avx2; +break; case DWT_DIRAC_LEGALL5_3: d->vertical_compose_h0 = (void*)ff_legall53_vertical_hi_avx2; d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_avx2; -- 2.18.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 2/3] diracdec: add 10-bit Legall 5, 3 (5_3) SIMD functions
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the relevant transform. C: 94fps SSE2: 118fps AVX2: 121fps legall vertical hi sse2: 3.86x faster (20201 vs. 5231 decicycles) compared with C avx2: 6.70x faster (20201 vs. 3014 decicycles) compared with C legall vertical lo sse2: 1.50x faster (28345 vs. 18908 decicycles) compared with C avx2: 1.63x faster (28345 vs. 17361 decicycles) compared with C --- libavcodec/x86/dirac_dwt_10bit.asm| 105 +- libavcodec/x86/dirac_dwt_init_10bit.c | 13 2 files changed, 117 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm index baea91329e..0295e6f554 100644 --- a/libavcodec/x86/dirac_dwt_10bit.asm +++ b/libavcodec/x86/dirac_dwt_10bit.asm @@ -21,9 +21,10 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 cextern pd_1 +pd_2: times 8 dd 2 SECTION .text @@ -147,9 +148,109 @@ REP_RET %endmacro +%macro LEGALL53_VERTICAL_LO 0 + +cglobal legall53_vertical_lo, 4, 6, 4, b0, b1, b2, w +DECLARE_REG_TMP 3,4,5 + +mova m3, [pd_2] +mov t2d, wd +and wd, ~(mmsize/4 - 1) +shl wd, 2 +add b0q, wq +add b1q, wq +add b2q, wq +neg wq + +ALIGN 16 +.loop: +mova m0, [b0q + wq] +mova m1, [b1q + wq] +mova m2, [b2q + wq] +paddd m0, m2 +paddd m0, m3 +psrad m0, 2 +psubd m1, m0 +mova [b1q + wq], m1 +add wq, mmsize +jl .loop + +and t2d, mmsize/4 - 1 +jz .end +.loop_scalar: +mov t0d, [b0q] +mov t1d, [b1q] +add t0d, [b2q] +add t0d, 2 +sar t0d, 2 +sub t1d, t0d +mov [b1q], t1d + +add b0q, 4 +add b1q, 4 +add b2q, 4 +sub t2d, 1 +jg .loop_scalar + +.end: +RET + +%endmacro + +%macro LEGALL53_VERTICAL_HI 0 + +cglobal legall53_vertical_hi, 4, 6, 4, b0, b1, b2, w +DECLARE_REG_TMP 3,4,5 + +mova m3, [pd_1] +mov t2d, wd +and wd, ~(mmsize/4 - 1) +shl wd, 2 +add b0q, wq +add b1q, wq +add b2q, wq +neg wq + +ALIGN 16 +.loop: +mova m0, [b0q + wq] +mova m1, [b1q + wq] +mova m2, [b2q + wq] +paddd m0, m2 +paddd m0, m3 +psrad m0, 1 +paddd m1, m0 +mova [b1q + wq], m1 +add wq, mmsize +jl 
.loop + +and t2d, mmsize/4 - 1 +jz .end +.loop_scalar: +mov t0d, [b0q] +mov t1d, [b1q] +add t0d, [b2q] +add t0d, 1 +sar t0d, 1 +add t1d, t0d +mov [b1q], t1d + +add b0q, 4 +add b1q, 4 +add b2q, 4 +sub t2d, 1 +jg .loop_scalar + +.end: +RET + +%endmacro + INIT_XMM sse2 HAAR_HORIZONTAL HAAR_VERTICAL +LEGALL53_VERTICAL_HI +LEGALL53_VERTICAL_LO INIT_XMM avx HAAR_HORIZONTAL @@ -158,3 +259,5 @@ HAAR_VERTICAL INIT_YMM avx2 HAAR_HORIZONTAL HAAR_VERTICAL +LEGALL53_VERTICAL_HI +LEGALL53_VERTICAL_LO diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c index 289862d728..d1234efac5 100644 --- a/libavcodec/x86/dirac_dwt_init_10bit.c +++ b/libavcodec/x86/dirac_dwt_init_10bit.c @@ -23,6 +23,11 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/dirac_dwt.h" +void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); +void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); +void ff_legall53_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int width); +void ff_legall53_vertical_lo_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int width); + void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align); void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align); void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align); @@ -38,6 +43,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type) if (EXTERNAL_SSE2(cpu_flags)) { switch (type) { +case DWT_DIRAC_LEGALL5_3: +d->vertical_compose_h0 = (void*)ff_legall53_vertical_hi_sse2; +d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_sse2; +break; case DWT_DIRAC_HAAR0: d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_sse2; break; @@ -62,6 +71,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type) if (EXTERNAL_AVX2(cpu_flags)) { switch (type) { +case DWT_DIRAC_LEGALL5_3: +d->vertical_compose_h0 = 
(void*)ff_legall53_vertical_hi_avx2; +d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_avx2; +break; case DWT_DIRAC_HAAR0: d->vertical_compose = (void*)ff_vertical_compose_haar_10b
Re: [FFmpeg-devel] [PATCH 0/6] x86 SIMD for dirac 10-bit wavelet transforms
On 2018-07-19 17:23, Rostislav Pehlivanov wrote: > Could you provide standard overall transform results using START/STOP_TIMER > rather than overall decoding speed? Ask and ye shall receive. > haar horizontal compose > sse2: 3.67x faster (45248±108.1 vs. 12328±21.1 decicycles) compared with > none > avx: 3.74x faster (45248±108.1 vs. 12091±11.0 decicycles) compared with > none > avx2: 5.14x faster (45248±108.1 vs. 8805±15.6 decicycles) compared with > none > haar vertical compose > sse2: 1.57x faster (31771±459.9 vs. 20179±786.2 decicycles) compared with > none > avx: 1.62x faster (31771±459.9 vs. 19572±253.1 decicycles) compared with > none > avx2: 1.73x faster (31771±459.9 vs. 18337±827.9 decicycles) compared with > none > > legall vertical hi > sse2: 3.68x faster (20506±46.2 vs. 5574±29.7 decicycles) compared with > none > avx2: 5.96x faster (20506±46.2 vs. 3442±32.7 decicycles) compared with > none > legall vertical lo > sse2: 1.52x faster (28360±178.6 vs. 18603±114.8 decicycles) compared with > none > avx2: 1.64x faster (28360±178.6 vs. 17255±372.3 decicycles) compared with > none > > dd97 vertical hi > sse2: 2.76x faster (31975±103.0 vs. 11570±247.5 decicycles) compared with > none > avx: 2.82x faster (31975±103.0 vs. 11346±179.0 decicycles) compared with > none > avx2: 3.83x faster (31975±103.0 vs. 8357±219.6 decicycles) compared with > none > dd97 vertical lo > sse2: 1.52x faster (29476±335.8 vs. 19429±518.7 decicycles) compared with > none > avx2: 1.62x faster (29476±335.8 vs. 18246±559.8 decicycles) compared with > none Here "none" refers to the C functions, from "-cpuflags none" option. I also have the results of removing the C wrappers from these functions, except dd97. They aren't that much better. > haar horizontal compose > sse2: 3.68x faster (45143±36.4 vs. 12279±16.4 decicycles) compared with > none > avx: 3.68x faster (45143±36.4 vs. 12275±9.2 decicycles) compared with > none > avx2: 5.16x faster (45143±36.4 vs. 
8742±12.3 decicycles) compared with > none > haar vertical compose > sse2: 1.64x faster (31792±367.5 vs. 19377±271.7 decicycles) compared with > none > avx: 1.58x faster (31792±367.5 vs. 20090±593.9 decicycles) compared with > none > avx2: 1.66x faster (31792±367.5 vs. 19157±1352.4 decicycles) compared > with none > > legall vertical hi > sse2: 3.86x faster (20201±26.5 vs. 5231±39.0 decicycles) compared with > none > avx2: 6.70x faster (20201±26.5 vs. 3014±39.1 decicycles) compared with > none > legall vertical lo > sse2: 1.50x faster (28345±206.6 vs. 18908±440.3 decicycles) compared with > none > avx2: 1.63x faster (28345±206.6 vs. 17361±637.9 decicycles) compared with > none I will squash patches, update commit messages, and send a new patch thread. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 3/6] diracdec: add 10-bit Deslauriers-Dubuc 9, 7 (9_7) vertical high-pass function
On 2018-07-19 17:26, Rostislav Pehlivanov wrote: > On 19 July 2018 at 15:52, James Darnley wrote: > >> int32_t *b1, int32_t *b2, int >> b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); >> } >> >> +static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, >> + int32_t *b3, int32_t *b4, int width) >> +{ >> +int i = width & ~3; >> +ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i); >> +for(; i<width; i++) >> +b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); >> + >> +} >> > > > This, along with the rest of the patchset: what's up with the hybrid > implementations? Couldn't you put the second part in the asm code as well? > Now there are 2 function calls instead of 1. The 8-bit code does this and I just followed its lead. I believe this is done because we cannot write junk data beyond what we think is the end of the line because this might be one of the higher depths and the coeffs for the next level sit beyond the end of the line. But now it has just occurred to me that maybe you meant "why didn't you do the scalar operations in SIMD?", is that what you meant? Answer is because it didn't occur to me at the time. Aside from that I always write do-while loops in assembly because I can usually guarantee 1 run of the block. I can certainly look at making that change. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 0/6] x86 SIMD for dirac 10-bit wavelet transforms
On 2018-07-19 17:23, Rostislav Pehlivanov wrote: > > Could you provide standard overall transform results using START/STOP_TIMER > rather than overall decoding speed? > Coefficient sizes, and therefore Golomb unpacking speed, change with > respect to the transform so potentially there could be somewhat of a > bottleneck on decoding before the inverse transform. Ah, you are right about that. Should I limit the depth to 1 so that the functions operate on the same width all the time? Anyway, I will get the timers in there. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 5/6] diracdec: avx2 dd97
--- libavcodec/x86/dirac_dwt_10bit.asm| 3 ++- libavcodec/x86/dirac_dwt_init_10bit.c | 13 + 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm index ae110d2945..2e039e11ea 100644 --- a/libavcodec/x86/dirac_dwt_10bit.asm +++ b/libavcodec/x86/dirac_dwt_10bit.asm @@ -25,7 +25,7 @@ SECTION_RODATA cextern pd_1 pd_2: times 8 dd 2 -pd_8: times 4 dd 8 +pd_8: times 8 dd 8 SECTION .text @@ -202,6 +202,7 @@ HAAR_HORIZONTAL HAAR_VERTICAL INIT_YMM avx2 +DD97_VERTICAL_HI HAAR_HORIZONTAL HAAR_VERTICAL LEGALL53_VERTICAL_HI diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c index 51d6eeae93..f103a56176 100644 --- a/libavcodec/x86/dirac_dwt_init_10bit.c +++ b/libavcodec/x86/dirac_dwt_init_10bit.c @@ -24,6 +24,7 @@ #include "libavcodec/dirac_dwt.h" void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t *b3, int32_t *b4, int width); +void ff_dd97_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t *b3, int32_t *b4, int width); void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); @@ -137,7 +138,15 @@ static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i); for(; ivertical_compose_h0 = (void*)dd97_vertical_hi_avx2; +d->vertical_compose_l0 = (void*)legall53_vertical_lo_avx2; +break; case DWT_DIRAC_LEGALL5_3: d->vertical_compose_h0 = (void*)legall53_vertical_hi_avx2; d->vertical_compose_l0 = (void*)legall53_vertical_lo_avx2; -- 2.17.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 4/6] diracdec: avx2 legall
--- libavcodec/x86/dirac_dwt_10bit.asm| 4 +++- libavcodec/x86/dirac_dwt_init_10bit.c | 22 ++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm index 681de5e1df..ae110d2945 100644 --- a/libavcodec/x86/dirac_dwt_10bit.asm +++ b/libavcodec/x86/dirac_dwt_10bit.asm @@ -24,7 +24,7 @@ SECTION_RODATA cextern pd_1 -pd_2: times 4 dd 2 +pd_2: times 8 dd 2 pd_8: times 4 dd 8 SECTION .text @@ -204,3 +204,5 @@ HAAR_VERTICAL INIT_YMM avx2 HAAR_HORIZONTAL HAAR_VERTICAL +LEGALL53_VERTICAL_HI +LEGALL53_VERTICAL_LO diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c index e7e7534050..51d6eeae93 100644 --- a/libavcodec/x86/dirac_dwt_init_10bit.c +++ b/libavcodec/x86/dirac_dwt_init_10bit.c @@ -27,6 +27,8 @@ void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t *b3 void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); +void ff_legall53_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int width); +void ff_legall53_vertical_lo_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int width); void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align); void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align); @@ -112,6 +114,22 @@ static void legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); } +static void legall53_vertical_lo_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int width) +{ +int i = width & ~7; +ff_legall53_vertical_lo_avx2(b0, b1, b2, i); +for(; ivertical_compose_h0 = (void*)legall53_vertical_hi_avx2; +d->vertical_compose_l0 = (void*)legall53_vertical_lo_avx2; +break; case DWT_DIRAC_HAAR0: d->vertical_compose = (void*)vertical_compose_haar_avx2; break; -- 2.17.1 ___ ffmpeg-devel mailing list 
ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 3/6] diracdec: add 10-bit Deslauriers-Dubuc 9, 7 (9_7) vertical high-pass function
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the relevant transform. C: 84fps SSE2: 111fps AVX2: 115fps --- libavcodec/x86/dirac_dwt_10bit.asm| 38 +++ libavcodec/x86/dirac_dwt_init_10bit.c | 16 +++ 2 files changed, 54 insertions(+) diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm index c00de32bfe..681de5e1df 100644 --- a/libavcodec/x86/dirac_dwt_10bit.asm +++ b/libavcodec/x86/dirac_dwt_10bit.asm @@ -25,6 +25,7 @@ SECTION_RODATA cextern pd_1 pd_2: times 4 dd 2 +pd_8: times 4 dd 8 SECTION .text @@ -153,7 +154,44 @@ RET %endmacro +%macro DD97_VERTICAL_HI 0 + +cglobal dd97_vertical_hi, 6, 6, 8, b0, b1, b2, b3, b4, w +mova m7, [pd_8] +shl wd, 2 +add b0q, wq +add b1q, wq +add b2q, wq +add b3q, wq +add b4q, wq +neg wq + +ALIGN 16 +.loop: +mova m0, [b0q + wq] +mova m1, [b1q + wq] +mova m2, [b2q + wq] +mova m3, [b3q + wq] +mova m4, [b4q + wq] +pslld m5, m1, 3 +pslld m6, m3, 3 +paddd m5, m1 +paddd m6, m3 +psubd m5, m0 +psubd m6, m4 +paddd m5, m7 +paddd m5, m6 +psrad m5, 4 +paddd m2, m5 +mova [b2q + wq], m2 +add wq, mmsize +jl .loop +RET + +%endmacro + INIT_XMM sse2 +DD97_VERTICAL_HI HAAR_HORIZONTAL HAAR_VERTICAL LEGALL53_VERTICAL_HI diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c index 88cf267d14..e7e7534050 100644 --- a/libavcodec/x86/dirac_dwt_init_10bit.c +++ b/libavcodec/x86/dirac_dwt_init_10bit.c @@ -23,6 +23,8 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/dirac_dwt.h" +void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t *b3, int32_t *b4, int width); + void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); @@ -110,6 +112,16 @@ static void legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); } +static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t 
*b2, + int32_t *b3, int32_t *b4, int width) +{ +int i = width & ~3; +ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i); +for(; ivertical_compose_h0 = (void*)dd97_vertical_hi_sse2; +d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2; +break; case DWT_DIRAC_LEGALL5_3: d->vertical_compose_h0 = (void*)legall53_vertical_hi_sse2; d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2; -- 2.17.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 1/6] diracdec: add 10-bit Haar SIMD functions
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the relevant transform. C:119fps SSE2: 204fps AVX: 206fps AVX2: 221fps --- libavcodec/dirac_dwt.c| 7 +- libavcodec/dirac_dwt.h| 1 + libavcodec/x86/Makefile | 6 +- libavcodec/x86/dirac_dwt_10bit.asm| 113 + libavcodec/x86/dirac_dwt_init_10bit.c | 136 ++ 5 files changed, 260 insertions(+), 3 deletions(-) create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c index cc08f8865a..86bee5bb9b 100644 --- a/libavcodec/dirac_dwt.c +++ b/libavcodec/dirac_dwt.c @@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type, return AVERROR_INVALIDDATA; } -if (ARCH_X86 && bit_depth == 8) +#if ARCH_X86 +if (bit_depth == 8) ff_spatial_idwt_init_x86(d, type); +else if (bit_depth == 10) +ff_spatial_idwt_init_10bit_x86(d, type); +#endif + return 0; } diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h index 994dc21d70..1ad7b9a821 100644 --- a/libavcodec/dirac_dwt.h +++ b/libavcodec/dirac_dwt.h @@ -88,6 +88,7 @@ enum dwt_type { int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type, int decomposition_count, int bit_depth); void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type); +void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type); void ff_spatial_idwt_slice2(DWTContext *d, int y); diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 2350c8bbee..590d83c167 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP)+= x86/blockdsp_init.o OBJS-$(CONFIG_BSWAPDSP)+= x86/bswapdsp_init.o OBJS-$(CONFIG_DCT) += x86/dct_init.o OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \ - x86/dirac_dwt_init.o + x86/dirac_dwt_init.o \ + x86/dirac_dwt_init_10bit.o OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o OBJS-$(CONFIG_FFT) += x86/fft_init.o OBJS-$(CONFIG_FLACDSP) += 
x86/flacdsp_init.o @@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)+= x86/diracdsp.o\ - x86/dirac_dwt.o + x86/dirac_dwt.o \ + x86/dirac_dwt_10bit.o X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER)+= x86/dnxhdenc.o X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm new file mode 100644 index 00..dc3830615e --- /dev/null +++ b/libavcodec/x86/dirac_dwt_10bit.asm @@ -0,0 +1,113 @@ +;** +;* x86 optimized discrete 10-bit wavelet trasnform +;* Copyright (c) 2018 James Darnley +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pd_1 + +SECTION .text + +%macro HAAR_VERTICAL 0 + +cglobal vertical_compose_haar_10bit, 3, 3, 4, b0, b1, w +mova m2, [pd_1] +shl wd, 2 +add b0q, wq +add b1q, wq +neg wq + +ALIGN 16 +.loop: +mova m0, [b0q + wq] +mova m1, [b1q + wq] +paddd m3, m1, m2 +psrad m3, 1 +psubd m0, m3 +paddd m1, m0 +mova [b0q + wq], m0 +mova [b1q + wq], m1 +add wq, mmsize +jl .loop +RET + +%endmacro + +%macro HAAR_HORIZONTAL 0 + +cglobal horizontal_compose_haar_10bit, 3, 6, 4, b, temp_, w,
[FFmpeg-devel] [PATCH 2/6] diracdec: add 10-bit Legall 5, 3 (5_3) SIMD functions
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the relevant transform. C: 94fps SSE2: 118fps AVX2: 121fps --- libavcodec/x86/dirac_dwt_10bit.asm| 55 +++ libavcodec/x86/dirac_dwt_init_10bit.c | 23 +++ 2 files changed, 78 insertions(+) diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm index dc3830615e..c00de32bfe 100644 --- a/libavcodec/x86/dirac_dwt_10bit.asm +++ b/libavcodec/x86/dirac_dwt_10bit.asm @@ -24,6 +24,7 @@ SECTION_RODATA cextern pd_1 +pd_2: times 4 dd 2 SECTION .text @@ -100,9 +101,63 @@ REP_RET %endmacro +%macro LEGALL53_VERTICAL_LO 0 + +cglobal legall53_vertical_lo, 4, 4, 4, b0, b1, b2, w +mova m3, [pd_2] +shl wd, 2 +add b0q, wq +add b1q, wq +add b2q, wq +neg wq + +ALIGN 16 +.loop: +mova m0, [b0q + wq] +mova m1, [b1q + wq] +mova m2, [b2q + wq] +paddd m0, m2 +paddd m0, m3 +psrad m0, 2 +psubd m1, m0 +mova [b1q + wq], m1 +add wq, mmsize +jl .loop +RET + +%endmacro + +%macro LEGALL53_VERTICAL_HI 0 + +cglobal legall53_vertical_hi, 4, 4, 4, b0, b1, b2, w +mova m3, [pd_1] +shl wd, 2 +add b0q, wq +add b1q, wq +add b2q, wq +neg wq + +ALIGN 16 +.loop: +mova m0, [b0q + wq] +mova m1, [b1q + wq] +mova m2, [b2q + wq] +paddd m0, m2 +paddd m0, m3 +psrad m0, 1 +paddd m1, m0 +mova [b1q + wq], m1 +add wq, mmsize +jl .loop +RET + +%endmacro + INIT_XMM sse2 HAAR_HORIZONTAL HAAR_VERTICAL +LEGALL53_VERTICAL_HI +LEGALL53_VERTICAL_LO INIT_XMM avx HAAR_HORIZONTAL diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c index 939950e3ff..88cf267d14 100644 --- a/libavcodec/x86/dirac_dwt_init_10bit.c +++ b/libavcodec/x86/dirac_dwt_init_10bit.c @@ -23,6 +23,9 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/dirac_dwt.h" +void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); +void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); + void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align); void 
ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align); void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align); @@ -91,6 +94,22 @@ static void horizontal_compose_haar_avx2(int32_t *b, int32_t *tmp, int width) } } +static void legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width) +{ +int i = width & ~3; +ff_legall53_vertical_lo_sse2(b0, b1, b2, i); +for(; ivertical_compose_h0 = (void*)legall53_vertical_hi_sse2; +d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2; +break; case DWT_DIRAC_HAAR0: d->vertical_compose = (void*)vertical_compose_haar_sse2; break; -- 2.17.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel