[FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE4.1 optimization for divide
--- The reason why this function uses SSE4.1 is the roundps instruction. Would love to find a way to truncate a float to integer in SSE2. --- libavfilter/x86/vf_blend.asm | 32 libavfilter/x86/vf_blend_init.c | 6 ++ 2 files changed, 38 insertions(+) diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm index a5ea74c..dac04d7 100644 --- a/libavfilter/x86/vf_blend.asm +++ b/libavfilter/x86/vf_blend.asm @@ -24,6 +24,7 @@ SECTION_RODATA +ps_255: times 4 dd 255.0 pw_1: times 8 dw 1 pw_128: times 8 dw 128 pw_255: times 8 dw 255 @@ -285,3 +286,34 @@ INIT_XMM sse2 BLEND_ABS INIT_XMM ssse3 BLEND_ABS + +INIT_XMM sse4 +BLEND_INIT divide, 4 +pxor m2, m2 +mova m3, [ps_255] +.nextrow: +mov xq, widthq + +.loop: +movd m0, [topq + xq] ; 00xx +movd m1, [bottomq + xq] +punpcklbw m0, m2 ; 0x0x +punpcklbw m1, m2 +punpcklwd m0, m2 ; 000x000x +punpcklwd m1, m2 + +cvtdq2ps m0, m0 +cvtdq2ps m1, m1 +divps m0, m1 ; a / b +mulps m0, m3 ; a / b * 255 +roundps m0, m0, 3 ; truncate +minps m0, m3 +cvtps2dq m0, m0 + +packusdw m0, m0 ; 0x0x +packuswb m0, m0 ; 00xx +movd [dstq + xq], m0 +add xq, mmsize / 4 + +jl .loop +BLEND_END diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c index a6baf94..f542870 100644 --- a/libavfilter/x86/vf_blend_init.c +++ b/libavfilter/x86/vf_blend_init.c @@ -48,6 +48,7 @@ BLEND_FUNC(difference, sse2) BLEND_FUNC(difference, ssse3) BLEND_FUNC(negation, sse2) BLEND_FUNC(negation, ssse3) +BLEND_FUNC(divide, sse4) av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) { @@ -79,4 +80,9 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; break; } } +if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1 && !is_16bit) { +switch (param->mode) { +case BLEND_DIVIDE: param->blend = ff_blend_divide_sse4; break; +} +} } -- 2.1.4 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE4.1 optimization for divide
I've already answered these on IRC but for the sake of completion I'll include the answers here as well. On Sat, Feb 13, 2016 at 10:26:58PM -0300, James Almer wrote: > On 2/13/2016 9:27 PM, Timothy Gu wrote: > > --- > > > > The reason why this function uses SSE4.1 is the roundps instruction. Would > > love to find a way to truncate a float to integer in SSE2. CVTTPS2DQ—Convert with Truncation Packed Single-Precision FP Values to Packed Dword Integers > > +punpcklwd m0, m2 ; 000x000x > > +punpcklwd m1, m2 > > + > > +cvtdq2ps m0, m0 > > +cvtdq2ps m1, m1 > > +divps m0, m1 ; a / b > > +mulps m0, m3 ; a / b * 255 > > +roundps m0, m0, 3 ; truncate > > +minps m0, m3 > > Are these two really needed? After a quick glance GCC seems to simply > generate more > or less the same code you're using here sans these two. (convert to float, > div, mul, > convert to int, saturate to uint8_t). roundps becomes unnecessary after cvttps2dq. minps is needed for divide-by-0 cases. Timothy ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE4.1 optimization for divide
On 2/13/2016 9:27 PM, Timothy Gu wrote: > --- > > The reason why this function uses SSE4.1 is the roundps instruction. Would > love to find a way to truncate a float to integer in SSE2. > > --- > libavfilter/x86/vf_blend.asm | 32 > libavfilter/x86/vf_blend_init.c | 6 ++ > 2 files changed, 38 insertions(+) > > diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm > index a5ea74c..dac04d7 100644 > --- a/libavfilter/x86/vf_blend.asm > +++ b/libavfilter/x86/vf_blend.asm > @@ -24,6 +24,7 @@ > > SECTION_RODATA > > +ps_255: times 4 dd 255.0 > pw_1: times 8 dw 1 > pw_128: times 8 dw 128 > pw_255: times 8 dw 255 > @@ -285,3 +286,34 @@ INIT_XMM sse2 > BLEND_ABS > INIT_XMM ssse3 > BLEND_ABS > + > +INIT_XMM sse4 > +BLEND_INIT divide, 4 > +pxor m2, m2 > +mova m3, [ps_255] > +.nextrow: > +mov xq, widthq > + > +.loop: > +movd m0, [topq + xq] ; 00xx > +movd m1, [bottomq + xq] > +punpcklbw m0, m2 ; 0x0x > +punpcklbw m1, m2 Assuming you keep using sse4, you could instead do pmovzxbd m0, [topq + xq] pmovzxbd m1, [bottomq + xq] > +punpcklwd m0, m2 ; 000x000x > +punpcklwd m1, m2 > + > +cvtdq2ps m0, m0 > +cvtdq2ps m1, m1 > +divps m0, m1 ; a / b > +mulps m0, m3 ; a / b * 255 > +roundps m0, m0, 3 ; truncate > +minps m0, m3 Are these two really needed? After a quick glance GCC seems to simply generate more or less the same code you're using here sans these two. (convert to float, div, mul, convert to int, saturate to uint8_t). 
> +cvtps2dq m0, m0 > + > +packusdw m0, m0 ; 0x0x > +packuswb m0, m0 ; 00xx > +movd [dstq + xq], m0 > +add xq, mmsize / 4 > + > +jl .loop > +BLEND_END > diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c > index a6baf94..f542870 100644 > --- a/libavfilter/x86/vf_blend_init.c > +++ b/libavfilter/x86/vf_blend_init.c > @@ -48,6 +48,7 @@ BLEND_FUNC(difference, sse2) > BLEND_FUNC(difference, ssse3) > BLEND_FUNC(negation, sse2) > BLEND_FUNC(negation, ssse3) > +BLEND_FUNC(divide, sse4) > > av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) > { > @@ -79,4 +80,9 @@ av_cold void ff_blend_init_x86(FilterParams *param, int > is_16bit) > case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; > break; > } > } > +if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1 && !is_16bit) { > +switch (param->mode) { > +case BLEND_DIVIDE: param->blend = ff_blend_divide_sse4; break; > +} > +} > } > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel