Re: [FFmpeg-devel] [PATCH 1/6] avcodec/h264: mmx2, sse2, avx 10-bit h chroma deblock/loop filter
On 2016-12-02 00:31, Carl Eugen Hoyos wrote: > 2016-12-01 17:57 GMT+01:00 James Darnley: >> Yorkfield: >> - mmx2: 2.44x faster (278 vs. 114 cycles) >> - sse2: 3.35x faster (278 vs. 83 cycles) >> >> Skylake: >> - mmx2: 1.69x faster (169 vs. 100 cycles) >> - sse2: 2.34x faster (169 vs. 72 cycles) > > Is it expected (or possible) that the speed impact is so > different for different Intel hardware? Yes. Intel's Core branded processors introduced a much better micro-architecture (the generation after the Yorkfield) which will cause the scalar C code to be quite a bit faster. The SIMD on the other hand was already so quick it didn't gain much. (At least I think I remember this being the story.) >> - avx: 2.32x faster (169 vs. 73 cycles) > > Don't you agree that if this is true (I don't know if it is) > the patch should not be applied as is? I do agree and I wouldn't (deliberately) apply anything that made the decoder slower, or not as fast as it could be. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 1/6] avcodec/h264: mmx2, sse2, avx 10-bit h chroma deblock/loop filter
On 2016-12-01 23:16, Michael Niedermayer wrote: > On Thu, Dec 01, 2016 at 05:57:44PM +0100, James Darnley wrote: >> Yorkfield: >> - mmx2: 2.44x faster (278 vs. 114 cycles) >> - sse2: 3.35x faster (278 vs. 83 cycles) >> >> Skylake: >> - mmx2: 1.69x faster (169 vs. 100 cycles) >> - sse2: 2.34x faster (169 vs. 72 cycles) >> - avx: 2.32x faster (169 vs. 73 cycles) >> --- >> libavcodec/x86/h264_deblock_10bit.asm | 118 >> ++ >> libavcodec/x86/h264dsp_init.c | 9 +++ >> 2 files changed, 127 insertions(+) > > breaks build on linux x86-32 > > YASM libavcodec/x86/h264_deblock_10bit.o > src/libavcodec/x86/h264_deblock_10bit.asm:1039: warning: `bpl' is a register > in 64-bit mode > src/libavcodec/x86/h264_deblock_10bit.asm:1039: error: undefined symbol `bpl' > (first use) > src/libavcodec/x86/h264_deblock_10bit.asm:1039: error: (Each undefined > symbol is reported only once.) > src/libavcodec/x86/h264_deblock_10bit.asm:1039: warning: `bpl' is a register > in 64-bit mode Ah. I shouldn't do clever things like trying to use the byte-sized registers. It isn't needed and causes problems like this. Changed locally. Also changed in the 4:2:0 chroma intra patch. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 1/6] avcodec/h264: mmx2, sse2, avx 10-bit h chroma deblock/loop filter
2016-12-01 17:57 GMT+01:00 James Darnley: > Yorkfield: > - mmx2: 2.44x faster (278 vs. 114 cycles) > - sse2: 3.35x faster (278 vs. 83 cycles) > > Skylake: > - mmx2: 1.69x faster (169 vs. 100 cycles) > - sse2: 2.34x faster (169 vs. 72 cycles) Is it expected (or possible) that the speed impact is so different for different Intel hardware? > - avx: 2.32x faster (169 vs. 73 cycles) Don't you agree that if this is true (I don't know if it is) the patch should not be applied as is? Carl Eugen ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 1/6] avcodec/h264: mmx2, sse2, avx 10-bit h chroma deblock/loop filter
On Thu, Dec 01, 2016 at 05:57:44PM +0100, James Darnley wrote: > Yorkfield: > - mmx2: 2.44x faster (278 vs. 114 cycles) > - sse2: 3.35x faster (278 vs. 83 cycles) > > Skylake: > - mmx2: 1.69x faster (169 vs. 100 cycles) > - sse2: 2.34x faster (169 vs. 72 cycles) > - avx: 2.32x faster (169 vs. 73 cycles) > --- > libavcodec/x86/h264_deblock_10bit.asm | 118 > ++ > libavcodec/x86/h264dsp_init.c | 9 +++ > 2 files changed, 127 insertions(+) breaks build on linux x86-32 YASM libavcodec/x86/h264_deblock_10bit.o src/libavcodec/x86/h264_deblock_10bit.asm:1039: warning: `bpl' is a register in 64-bit mode src/libavcodec/x86/h264_deblock_10bit.asm:1039: error: undefined symbol `bpl' (first use) src/libavcodec/x86/h264_deblock_10bit.asm:1039: error: (Each undefined symbol is reported only once.) src/libavcodec/x86/h264_deblock_10bit.asm:1039: warning: `bpl' is a register in 64-bit mode [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB In a rich man's house there is no place to spit but his face. -- Diogenes of Sinope signature.asc Description: Digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 1/6] avcodec/h264: mmx2, sse2, avx 10-bit h chroma deblock/loop filter
Yorkfield: - mmx2: 2.44x faster (278 vs. 114 cycles) - sse2: 3.35x faster (278 vs. 83 cycles) Skylake: - mmx2: 1.69x faster (169 vs. 100 cycles) - sse2: 2.34x faster (169 vs. 72 cycles) - avx: 2.32x faster (169 vs. 73 cycles) --- libavcodec/x86/h264_deblock_10bit.asm | 118 ++ libavcodec/x86/h264dsp_init.c | 9 +++ 2 files changed, 127 insertions(+) diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm index ebf8a3f..e91b1c6 100644 --- a/libavcodec/x86/h264_deblock_10bit.asm +++ b/libavcodec/x86/h264_deblock_10bit.asm @@ -843,6 +843,88 @@ DEBLOCK_LUMA_INTRA mova [r0+2*r1], m2 %endmacro +; expands to [base],...,[base+7*stride] +%define PASS8ROWS(base, base3, stride, stride3) \ +[base], [base+stride], [base+stride*2], [base3], \ +[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] + +; in: 8 rows of 4 words in %4..%11 +; out: 4 rows of 8 words in m0..m3 +%macro TRANSPOSE4x8W_LOAD 8 +movq m0, %1 +movq m2, %2 +movq m1, %3 +movq m3, %4 + +punpcklwdm0, m2 +punpcklwdm1, m3 +punpckhdqm2, m0, m1 +punpckldqm0, m1 + +movq m4, %5 +movq m6, %6 +movq m5, %7 +movq m3, %8 + +punpcklwdm4, m6 +punpcklwdm5, m3 +punpckhdqm6, m4, m5 +punpckldqm4, m5 + +punpckhqdq m1, m0, m4 +punpcklqdq m0, m4 +punpckhqdq m3, m2, m6 +punpcklqdq m2, m6 +%endmacro + +; in: 4 rows of 8 words in m0..m3 +; out: 8 rows of 4 words in %1..%8 +%macro TRANSPOSE8x4W_STORE 8 +TRANSPOSE4x4W 0, 1, 2, 3, 4 +movq %1, m0 +movhps %2, m0 +movq %3, m1 +movhps %4, m1 +movq %5, m2 +movhps %6, m2 +movq %7, m3 +movhps %8, m3 +%endmacro + +; %1 = base + 3*stride +; %2 = 3*stride (unused on mmx) +; %3, %4 = place to store p1 and q1 values +%macro CHROMA_H_LOAD 4 +%if mmsize == 8 +movq m0, [pix_q - 4] +movq m1, [pix_q + stride_q - 4] +movq m2, [pix_q + 2*stride_q - 4] +movq m3, [%1 - 4] +TRANSPOSE4x4W 0, 1, 2, 3, 4 +%else +TRANSPOSE4x8W_LOAD PASS8ROWS(pix_q-4, %1-4, stride_q, %2) +%endif +mova %3, m0 +mova %4, m3 +%endmacro + +; %1 = base + 3*stride +; %2 = 3*stride (unused on 
mmx) +; %3, %4 = place to load p1 and q1 values +%macro CHROMA_H_STORE 4 +mova m0, %3 +mova m3, %4 +%if mmsize == 8 +TRANSPOSE4x4W 0, 1, 2, 3, 4 +movq [pix_q - 4], m0 +movq [pix_q + stride_q - 4], m1 +movq [pix_q + 2*stride_q - 4], m2 +movq [%1 - 4], m3 +%else +TRANSPOSE8x4W_STORE PASS8ROWS(pix_q-4, %1-4, stride_q, %2) +%endif +%endmacro + %macro CHROMA_V_LOAD_TC 2 movd%1, [%2] punpcklbw %1, %1 @@ -914,6 +996,42 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16) %else RET %endif + +;- +; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta, +; int8_t *tc0) +;- +cglobal deblock_h_chroma_10, 5, 7, 8, 2*mmsize, pix_, stride_, alpha_, beta_, tc0_ +shl alpha_d, 2 +shl beta_d, 2 +mov r5, pix_q +lea r6, [3*stride_q] +add r5, r6 +%if mmsize == 8 +mov r6b, 2 +.loop: +%endif + +CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize] +LOAD_AB m4, m5, alpha_d, beta_d +LOAD_MASKm0, m1, m2, m3, m4, m5, m7, m6, m4 +pxor m4, m4 +CHROMA_V_LOAD_TC m6, tc0_q +psubwm6, [pw_3] +pmaxsw m6, m4 +pand m7, m6 +DEBLOCK_P0_Q0m1, m2, m0, m3, m7, m5, m6 +CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize] + +%if mmsize == 8 +lea pix_q, [pix_q + 4*stride_q] +lea r5,[r5 + 4*stride_q] +add tc0_q, 2 +dec r6b +jg .loop +%endif +RET + %endmacro %if ARCH_X86_64 == 0 diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c index c6c643a..c568762 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -310,6 +310,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, #if ARCH_X86_32 c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext; c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext; +if (chroma_format_idc <= 1) { +c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext; +} c->h264_v_loop_filter_luma