Re: [FFmpeg-devel] [PATCH 1/6] avcodec/h264: mmx2, sse2, avx 10-bit h chroma deblock/loop filter

2016-12-01 Thread James Darnley
On 2016-12-02 00:31, Carl Eugen Hoyos wrote:
> 2016-12-01 17:57 GMT+01:00 James Darnley :
>> Yorkfield:
>>  - mmx2: 2.44x faster (278 vs. 114 cycles)
>>  - sse2: 3.35x faster (278 vs.  83 cycles)
>>
>> Skylake:
>>  - mmx2: 1.69x faster (169 vs. 100 cycles)
>>  - sse2: 2.34x faster (169 vs.  72 cycles)
> 
> Is it expected (or possible) that the speed impact is so
> different for different Intel hardware?

Yes.  Intel's Core branded processors introduced a much better
micro-architecture (the generation after the Yorkfield) which will cause
the scalar C code to be quite a bit faster.  The SIMD on the other hand
was already so quick it didn't gain much.

(At least I think I remember this being the story.)

>>  - avx:  2.32x faster (169 vs.  73 cycles)
> 
> Don't you agree that if this is true (I don't know if it is)
> the patch should not be applied as is?

I do agree and I wouldn't (deliberately) apply anything that made the
decoder slower, or not as fast as it could be.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 1/6] avcodec/h264: mmx2, sse2, avx 10-bit h chroma deblock/loop filter

2016-12-01 Thread James Darnley
On 2016-12-01 23:16, Michael Niedermayer wrote:
> On Thu, Dec 01, 2016 at 05:57:44PM +0100, James Darnley wrote:
>> Yorkfield:
>>  - mmx2: 2.44x faster (278 vs. 114 cycles)
>>  - sse2: 3.35x faster (278 vs.  83 cycles)
>>
>> Skylake:
>>  - mmx2: 1.69x faster (169 vs. 100 cycles)
>>  - sse2: 2.34x faster (169 vs.  72 cycles)
>>  - avx:  2.32x faster (169 vs.  73 cycles)
>> ---
>>  libavcodec/x86/h264_deblock_10bit.asm | 118 
>> ++
>>  libavcodec/x86/h264dsp_init.c |   9 +++
>>  2 files changed, 127 insertions(+)
> 
> breaks build on linux x86-32
> 
> YASMlibavcodec/x86/h264_deblock_10bit.o
> src/libavcodec/x86/h264_deblock_10bit.asm:1039: warning: `bpl' is a register 
> in 64-bit mode
> src/libavcodec/x86/h264_deblock_10bit.asm:1039: error: undefined symbol `bpl' 
> (first use)
> src/libavcodec/x86/h264_deblock_10bit.asm:1039: error:  (Each undefined 
> symbol is reported only once.)
> src/libavcodec/x86/h264_deblock_10bit.asm:1039: warning: `bpl' is a register 
> in 64-bit mode

Ah.  I shouldn't do clever things like trying to use the byte-sized
registers.  It isn't needed and causes problems like this.  Changed
locally.  Also changed in the 4:2:0 chroma intra patch.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 1/6] avcodec/h264: mmx2, sse2, avx 10-bit h chroma deblock/loop filter

2016-12-01 Thread Carl Eugen Hoyos
2016-12-01 17:57 GMT+01:00 James Darnley :
> Yorkfield:
>  - mmx2: 2.44x faster (278 vs. 114 cycles)
>  - sse2: 3.35x faster (278 vs.  83 cycles)
>
> Skylake:
>  - mmx2: 1.69x faster (169 vs. 100 cycles)
>  - sse2: 2.34x faster (169 vs.  72 cycles)

Is it expected (or possible) that the speed impact is so
different for different Intel hardware?

>  - avx:  2.32x faster (169 vs.  73 cycles)

Don't you agree that if this is true (I don't know if it is)
the patch should not be applied as is?

Carl Eugen
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 1/6] avcodec/h264: mmx2, sse2, avx 10-bit h chroma deblock/loop filter

2016-12-01 Thread Michael Niedermayer
On Thu, Dec 01, 2016 at 05:57:44PM +0100, James Darnley wrote:
> Yorkfield:
>  - mmx2: 2.44x faster (278 vs. 114 cycles)
>  - sse2: 3.35x faster (278 vs.  83 cycles)
> 
> Skylake:
>  - mmx2: 1.69x faster (169 vs. 100 cycles)
>  - sse2: 2.34x faster (169 vs.  72 cycles)
>  - avx:  2.32x faster (169 vs.  73 cycles)
> ---
>  libavcodec/x86/h264_deblock_10bit.asm | 118 
> ++
>  libavcodec/x86/h264dsp_init.c |   9 +++
>  2 files changed, 127 insertions(+)

breaks build on linux x86-32

YASMlibavcodec/x86/h264_deblock_10bit.o
src/libavcodec/x86/h264_deblock_10bit.asm:1039: warning: `bpl' is a register in 
64-bit mode
src/libavcodec/x86/h264_deblock_10bit.asm:1039: error: undefined symbol `bpl' 
(first use)
src/libavcodec/x86/h264_deblock_10bit.asm:1039: error:  (Each undefined symbol 
is reported only once.)
src/libavcodec/x86/h264_deblock_10bit.asm:1039: warning: `bpl' is a register in 
64-bit mode

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

In a rich man's house there is no place to spit but his face.
-- Diogenes of Sinope


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 1/6] avcodec/h264: mmx2, sse2, avx 10-bit h chroma deblock/loop filter

2016-12-01 Thread James Darnley
Yorkfield:
 - mmx2: 2.44x faster (278 vs. 114 cycles)
 - sse2: 3.35x faster (278 vs.  83 cycles)

Skylake:
 - mmx2: 1.69x faster (169 vs. 100 cycles)
 - sse2: 2.34x faster (169 vs.  72 cycles)
 - avx:  2.32x faster (169 vs.  73 cycles)
---
 libavcodec/x86/h264_deblock_10bit.asm | 118 ++
 libavcodec/x86/h264dsp_init.c |   9 +++
 2 files changed, 127 insertions(+)

diff --git a/libavcodec/x86/h264_deblock_10bit.asm 
b/libavcodec/x86/h264_deblock_10bit.asm
index ebf8a3f..e91b1c6 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -843,6 +843,88 @@ DEBLOCK_LUMA_INTRA
 mova [r0+2*r1], m2
 %endmacro
 
+; expands to [base],...,[base+7*stride]
+%define PASS8ROWS(base, base3, stride, stride3) \
+[base], [base+stride], [base+stride*2], [base3], \
+[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
+
+; in: 8 rows of 4 words in %4..%11
+; out: 4 rows of 8 words in m0..m3
+%macro TRANSPOSE4x8W_LOAD 8
+movq m0, %1
+movq m2, %2
+movq m1, %3
+movq m3, %4
+
+punpcklwdm0, m2
+punpcklwdm1, m3
+punpckhdqm2, m0, m1
+punpckldqm0, m1
+
+movq m4, %5
+movq m6, %6
+movq m5, %7
+movq m3, %8
+
+punpcklwdm4, m6
+punpcklwdm5, m3
+punpckhdqm6, m4, m5
+punpckldqm4, m5
+
+punpckhqdq   m1, m0, m4
+punpcklqdq   m0, m4
+punpckhqdq   m3, m2, m6
+punpcklqdq   m2, m6
+%endmacro
+
+; in: 4 rows of 8 words in m0..m3
+; out: 8 rows of 4 words in %1..%8
+%macro TRANSPOSE8x4W_STORE 8
+TRANSPOSE4x4W 0, 1, 2, 3, 4
+movq %1, m0
+movhps   %2, m0
+movq %3, m1
+movhps   %4, m1
+movq %5, m2
+movhps   %6, m2
+movq %7, m3
+movhps   %8, m3
+%endmacro
+
+; %1 = base + 3*stride
+; %2 = 3*stride (unused on mmx)
+; %3, %4 = place to store p1 and q1 values
+%macro CHROMA_H_LOAD 4
+%if mmsize == 8
+movq m0, [pix_q - 4]
+movq m1, [pix_q +   stride_q - 4]
+movq m2, [pix_q + 2*stride_q - 4]
+movq m3, [%1 - 4]
+TRANSPOSE4x4W 0, 1, 2, 3, 4
+%else
+TRANSPOSE4x8W_LOAD PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
+%endif
+mova %3, m0
+mova %4, m3
+%endmacro
+
+; %1 = base + 3*stride
+; %2 = 3*stride (unused on mmx)
+; %3, %4 = place to load p1 and q1 values
+%macro CHROMA_H_STORE 4
+mova m0, %3
+mova m3, %4
+%if mmsize == 8
+TRANSPOSE4x4W 0, 1, 2, 3, 4
+movq [pix_q - 4],  m0
+movq [pix_q +   stride_q - 4], m1
+movq [pix_q + 2*stride_q - 4], m2
+movq [%1 - 4], m3
+%else
+TRANSPOSE8x4W_STORE PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
+%endif
+%endmacro
+
 %macro CHROMA_V_LOAD_TC 2
 movd%1, [%2]
 punpcklbw   %1, %1
@@ -914,6 +996,42 @@ cglobal deblock_v_chroma_intra_10, 
4,6-(mmsize/16),8*(mmsize/16)
 %else
 RET
 %endif
+
+;-
+; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
+; int8_t *tc0)
+;-
+cglobal deblock_h_chroma_10, 5, 7, 8, 2*mmsize, pix_, stride_, alpha_, beta_, 
tc0_
+shl alpha_d,  2
+shl beta_d,   2
+mov r5,   pix_q
+lea r6,  [3*stride_q]
+add r5,   r6
+%if mmsize == 8
+mov r6b,  2
+.loop:
+%endif
+
+CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize]
+LOAD_AB  m4,  m5, alpha_d, beta_d
+LOAD_MASKm0,  m1, m2, m3, m4, m5, m7, m6, m4
+pxor m4,  m4
+CHROMA_V_LOAD_TC m6,  tc0_q
+psubwm6, [pw_3]
+pmaxsw   m6,  m4
+pand m7,  m6
+DEBLOCK_P0_Q0m1,  m2, m0, m3, m7, m5, m6
+CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize]
+
+%if mmsize == 8
+lea pix_q, [pix_q + 4*stride_q]
+lea r5,[r5 + 4*stride_q]
+add tc0_q,  2
+dec r6b
+jg .loop
+%endif
+RET
+
 %endmacro
 
 %if ARCH_X86_64 == 0
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index c6c643a..c568762 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -310,6 +310,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const 
int bit_depth,
 #if ARCH_X86_32
 c->h264_v_loop_filter_chroma   = ff_deblock_v_chroma_10_mmxext;
 c->h264_v_loop_filter_chroma_intra = 
ff_deblock_v_chroma_intra_10_mmxext;
+if (chroma_format_idc <= 1) {
+c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
+}
 c->h264_v_loop_filter_luma