Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16, 32}_8
On Thu, Aug 21, 2014 at 12:42 AM, James Almer jamr...@gmail.com wrote: * Reduced xmm register count to 7 (As such they are now enabled for x86_32). * Removed four movdqa (affects the sse2 version only). * pxor is now used to clear m0 only once. ~5% faster. Signed-off-by: James Almer jamr...@gmail.com --- Good job, faster and 32-bit compat! libavcodec/x86/hevc_res_add.asm | 122 libavcodec/x86/hevcdsp_init.c | 10 ++-- 2 files changed, 51 insertions(+), 81 deletions(-) diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm index feea50c..7238fb3 100644 --- a/libavcodec/x86/hevc_res_add.asm +++ b/libavcodec/x86/hevc_res_add.asm @@ -88,71 +88,41 @@ cglobal hevc_transform_add4_8, 3, 4, 6 movhps [r0+r3 ], m1 %endmacro -%macro TR_ADD_INIT_SSE_8 0 -pxor m0, m0 - -mova m4, [r1] -mova m1, [r1+16] -psubw m2, m0, m1 -psubw m5, m0, m4 -packuswb m4, m1 -packuswb m5, m2 - -mova m6, [r1+32] -mova m1, [r1+48] -psubw m2, m0, m1 -psubw m7, m0, m6 -packuswb m6, m1 -packuswb m7, m2 - -mova m8, [r1+64] -mova m1, [r1+80] -psubw m2, m0, m1 -psubw m9, m0, m8 -packuswb m8, m1 -packuswb m9, m2 - -mova m10, [r1+96] -mova m1, [r1+112] -psubw m2, m0, m1 -psubwm11, m0, m10 -packuswb m10, m1 -packuswb m11, m2 -%endmacro - - -%macro TR_ADD_SSE_16_8 0 -TR_ADD_INIT_SSE_8 - -paddusb m0, m4, [r0 ] -paddusb m1, m6, [r0+r2 ] -paddusb m2, m8, [r0+r2*2] -paddusb m3, m10,[r0+r3 ] -psubusb m0, m5 -psubusb m1, m7 -psubusb m2, m9 -psubusb m3, m11 -mova [r0 ], m0 -mova [r0+r2 ], m1 -mova [r0+2*r2], m2 -mova [r0+r3 ], m3 -%endmacro - -%macro TR_ADD_SSE_32_8 0 -TR_ADD_INIT_SSE_8 - -paddusb m0, m4, [r0 ] -paddusb m1, m6, [r0+16 ] -paddusb m2, m8, [r0+r2 ] -paddusb m3, m10,[r0+r2+16] -psubusb m0, m5 -psubusb m1, m7 -psubusb m2, m9 -psubusb m3, m11 -mova [r0 ], m0 -mova [r0+16 ], m1 -mova [r0+r2 ], m2 -mova [r0+r2+16], m3 +%macro TR_ADD_SSE_16_32_8 3 +mova m2, [r1+%1 ] +mova m6, [r1+%1+16] +%if cpuflag(avx) +psubw m1, m0, m2 +psubw m5, m0, m6 +%else +mova m1, m0 +mova m5, m0 +psubw m1, m2 
+psubw m5, m6 +%endif I was wondering about these blocks - doesn't the x264asm layer automatically add the mova's when you just use the 3-arg form on sse2? Or is there a speed benefit grouping the mov's? - Hendrik ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16, 32}_8
On 21/08/14 10:03 AM, Hendrik Leppkes wrote: On Thu, Aug 21, 2014 at 12:42 AM, James Almer jamr...@gmail.com wrote: * Reduced xmm register count to 7 (As such they are now enabled for x86_32). * Removed four movdqa (affects the sse2 version only). * pxor is now used to clear m0 only once. ~5% faster. Signed-off-by: James Almer jamr...@gmail.com --- Good job, faster and 32-bit compat! libavcodec/x86/hevc_res_add.asm | 122 libavcodec/x86/hevcdsp_init.c | 10 ++-- 2 files changed, 51 insertions(+), 81 deletions(-) diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm index feea50c..7238fb3 100644 --- a/libavcodec/x86/hevc_res_add.asm +++ b/libavcodec/x86/hevc_res_add.asm @@ -88,71 +88,41 @@ cglobal hevc_transform_add4_8, 3, 4, 6 movhps [r0+r3 ], m1 %endmacro -%macro TR_ADD_INIT_SSE_8 0 -pxor m0, m0 - -mova m4, [r1] -mova m1, [r1+16] -psubw m2, m0, m1 -psubw m5, m0, m4 -packuswb m4, m1 -packuswb m5, m2 - -mova m6, [r1+32] -mova m1, [r1+48] -psubw m2, m0, m1 -psubw m7, m0, m6 -packuswb m6, m1 -packuswb m7, m2 - -mova m8, [r1+64] -mova m1, [r1+80] -psubw m2, m0, m1 -psubw m9, m0, m8 -packuswb m8, m1 -packuswb m9, m2 - -mova m10, [r1+96] -mova m1, [r1+112] -psubw m2, m0, m1 -psubwm11, m0, m10 -packuswb m10, m1 -packuswb m11, m2 -%endmacro - - -%macro TR_ADD_SSE_16_8 0 -TR_ADD_INIT_SSE_8 - -paddusb m0, m4, [r0 ] -paddusb m1, m6, [r0+r2 ] -paddusb m2, m8, [r0+r2*2] -paddusb m3, m10,[r0+r3 ] -psubusb m0, m5 -psubusb m1, m7 -psubusb m2, m9 -psubusb m3, m11 -mova [r0 ], m0 -mova [r0+r2 ], m1 -mova [r0+2*r2], m2 -mova [r0+r3 ], m3 -%endmacro - -%macro TR_ADD_SSE_32_8 0 -TR_ADD_INIT_SSE_8 - -paddusb m0, m4, [r0 ] -paddusb m1, m6, [r0+16 ] -paddusb m2, m8, [r0+r2 ] -paddusb m3, m10,[r0+r2+16] -psubusb m0, m5 -psubusb m1, m7 -psubusb m2, m9 -psubusb m3, m11 -mova [r0 ], m0 -mova [r0+16 ], m1 -mova [r0+r2 ], m2 -mova [r0+r2+16], m3 +%macro TR_ADD_SSE_16_32_8 3 +mova m2, [r1+%1 ] +mova m6, [r1+%1+16] +%if cpuflag(avx) +psubw m1, m0, m2 +psubw m5, m0, m6 
+%else +mova m1, m0 +mova m5, m0 +psubw m1, m2 +psubw m5, m6 +%endif I was wondering about these blocks - doesn't the x264asm layer automatically add the mova's when you just use the 3-arg form on sse2? Or is there a speed benefit grouping the mov's? - Hendrik It does that, but on older SSE2 cpus with not-so-good OOO execution grouping instructions like this might help reduce dependencies a bit. This trick is used all over the tree (including some macros from x86util), so it's certainly useful for some cpus. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16, 32}_8
Hi, 2014-08-21 0:42 GMT+02:00 James Almer jamr...@gmail.com: * Reduced xmm register count to 7 (As such they are now enabled for x86_32). * Removed four movdqa (affects the sse2 version only). * pxor is now used to clear m0 only once. OK. -- Christophe ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16, 32}_8
On 21/08/14 2:15 PM, Christophe Gisquet wrote: Hi, 2014-08-21 0:42 GMT+02:00 James Almer jamr...@gmail.com: * Reduced xmm register count to 7 (As such they are now enabled for x86_32). * Removed four movdqa (affects the sse2 version only). * pxor is now used to clear m0 only once. OK. Pushed. Thanks. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16, 32}_8
It does that, but on older SSE2 cpus with not-so-good OOO execution grouping instructions like this might help reduce dependencies a bit. Are any older SSE2 CPUs actually capable of decoding reasonable HEVC? ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16, 32}_8
On 21/08/14 5:40 PM, Kieran Kunhya wrote: It does that, but on older SSE2 cpus with not-so-good OOO execution grouping instructions like this might help reduce dependencies a bit. Are any older SSE2 CPUs actually capable of decoding reasonable HEVC? Probably not (at least nothing above DVD resolution), but aside from uglifying the asm a bit it doesn't hurt. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16, 32}_8
On 21.08.2014, at 22:40, Kieran Kunhya kier...@obe.tv wrote: It does that, but on older SSE2 cpus with not-so-good OOO execution grouping instructions like this might help reduce dependencies a bit. Are any older SSE2 CPUs actually capable of decoding reasonable HEVC? Of course they are. Not in real time, but people would use the same method that was used for H.264 back in the day: re-encode to a simpler format (and/or skip processing steps). So any speedup will still be welcome on these even if less critical. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16, 32}_8
* Reduced xmm register count to 7 (As such they are now enabled for x86_32). * Removed four movdqa (affects the sse2 version only). * pxor is now used to clear m0 only once. ~5% faster. Signed-off-by: James Almer jamr...@gmail.com --- libavcodec/x86/hevc_res_add.asm | 122 libavcodec/x86/hevcdsp_init.c | 10 ++-- 2 files changed, 51 insertions(+), 81 deletions(-) diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm index feea50c..7238fb3 100644 --- a/libavcodec/x86/hevc_res_add.asm +++ b/libavcodec/x86/hevc_res_add.asm @@ -88,71 +88,41 @@ cglobal hevc_transform_add4_8, 3, 4, 6 movhps [r0+r3 ], m1 %endmacro -%macro TR_ADD_INIT_SSE_8 0 -pxor m0, m0 - -mova m4, [r1] -mova m1, [r1+16] -psubw m2, m0, m1 -psubw m5, m0, m4 -packuswb m4, m1 -packuswb m5, m2 - -mova m6, [r1+32] -mova m1, [r1+48] -psubw m2, m0, m1 -psubw m7, m0, m6 -packuswb m6, m1 -packuswb m7, m2 - -mova m8, [r1+64] -mova m1, [r1+80] -psubw m2, m0, m1 -psubw m9, m0, m8 -packuswb m8, m1 -packuswb m9, m2 - -mova m10, [r1+96] -mova m1, [r1+112] -psubw m2, m0, m1 -psubwm11, m0, m10 -packuswb m10, m1 -packuswb m11, m2 -%endmacro - - -%macro TR_ADD_SSE_16_8 0 -TR_ADD_INIT_SSE_8 - -paddusb m0, m4, [r0 ] -paddusb m1, m6, [r0+r2 ] -paddusb m2, m8, [r0+r2*2] -paddusb m3, m10,[r0+r3 ] -psubusb m0, m5 -psubusb m1, m7 -psubusb m2, m9 -psubusb m3, m11 -mova [r0 ], m0 -mova [r0+r2 ], m1 -mova [r0+2*r2], m2 -mova [r0+r3 ], m3 -%endmacro - -%macro TR_ADD_SSE_32_8 0 -TR_ADD_INIT_SSE_8 - -paddusb m0, m4, [r0 ] -paddusb m1, m6, [r0+16 ] -paddusb m2, m8, [r0+r2 ] -paddusb m3, m10,[r0+r2+16] -psubusb m0, m5 -psubusb m1, m7 -psubusb m2, m9 -psubusb m3, m11 -mova [r0 ], m0 -mova [r0+16 ], m1 -mova [r0+r2 ], m2 -mova [r0+r2+16], m3 +%macro TR_ADD_SSE_16_32_8 3 +mova m2, [r1+%1 ] +mova m6, [r1+%1+16] +%if cpuflag(avx) +psubw m1, m0, m2 +psubw m5, m0, m6 +%else +mova m1, m0 +mova m5, m0 +psubw m1, m2 +psubw m5, m6 +%endif +packuswb m2, m6 +packuswb m1, m5 + +mova m4, [r1+%1+32] +mova m6, [r1+%1+48] +%if 
cpuflag(avx) +psubw m3, m0, m4 +psubw m5, m0, m6 +%else +mova m3, m0 +mova m5, m0 +psubw m3, m4 +psubw m5, m6 +%endif +packuswb m4, m6 +packuswb m3, m5 + +paddusb m2, [%2] +paddusb m4, [%3] +psubusb m2, m1 +psubusb m4, m3 +mova[%2], m2 +mova[%3], m4 %endmacro @@ -166,30 +136,32 @@ cglobal hevc_transform_add8_8, 3, 4, 8 TR_ADD_SSE_8_8 RET -%if ARCH_X86_64 ; void ff_hevc_transform_add16_8_opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) -cglobal hevc_transform_add16_8, 3, 4, 12 +cglobal hevc_transform_add16_8, 3, 4, 7 +pxor m0, m0 lea r3, [r2*3] -TR_ADD_SSE_16_8 +TR_ADD_SSE_16_32_8 0, r0, r0+r2 +TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3 %rep 3 addr1, 128 lear0, [r0+r2*4] -TR_ADD_SSE_16_8 +TR_ADD_SSE_16_32_8 0, r0, r0+r2 +TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3 %endrep RET ; void ff_hevc_transform_add32_8_opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) -cglobal hevc_transform_add32_8, 3, 4, 12 - -TR_ADD_SSE_32_8 +cglobal hevc_transform_add32_8, 3, 4, 7 +pxor m0, m0 +TR_ADD_SSE_16_32_8 0, r0,r0+16 +TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16 %rep 15 addr1, 128 lear0, [r0+r2*2] -TR_ADD_SSE_32_8 +TR_ADD_SSE_16_32_8 0, r0,r0+16 +TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16 %endrep RET - -%endif ;ARCH_X86_64 %endmacro INIT_XMM sse2 diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c index f6f0a4b..0709158 100644 --- a/libavcodec/x86/hevcdsp_init.c +++ b/libavcodec/x86/hevcdsp_init.c @@ -477,15 +477,14 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) if (ARCH_X86_64) {