On 2/13/2016 9:27 PM, Timothy Gu wrote: > --- > > The reason why this function uses SSE4.1 is the roundps instruction. Would > love to find a way to truncate a float to integer in SSE2. > > --- > libavfilter/x86/vf_blend.asm | 32 ++++++++++++++++++++++++++++++++ > libavfilter/x86/vf_blend_init.c | 6 ++++++ > 2 files changed, 38 insertions(+) > > diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm > index a5ea74c..dac04d7 100644 > --- a/libavfilter/x86/vf_blend.asm > +++ b/libavfilter/x86/vf_blend.asm > @@ -24,6 +24,7 @@ > > SECTION_RODATA > > +ps_255: times 4 dd 255.0 > pw_1: times 8 dw 1 > pw_128: times 8 dw 128 > pw_255: times 8 dw 255 > @@ -285,3 +286,34 @@ INIT_XMM sse2 > BLEND_ABS > INIT_XMM ssse3 > BLEND_ABS > + > +INIT_XMM sse4 > +BLEND_INIT divide, 4 > + pxor m2, m2 > + mova m3, [ps_255] > +.nextrow: > + mov xq, widthq > + > + .loop: > + movd m0, [topq + xq] ; 000000xx > + movd m1, [bottomq + xq] > + punpcklbw m0, m2 ; 00000x0x > + punpcklbw m1, m2
Assuming you keep using sse4, you could instead do: pmovzxbd m0, [topq + xq] pmovzxbd m1, [bottomq + xq] > + punpcklwd m0, m2 ; 000x000x > + punpcklwd m1, m2 > + > + cvtdq2ps m0, m0 > + cvtdq2ps m1, m1 > + divps m0, m1 ; a / b > + mulps m0, m3 ; a / b * 255 > + roundps m0, m0, 3 ; truncate > + minps m0, m3 Are these two instructions (roundps and minps) really needed? After a quick glance, GCC seems to generate more or less the same code you're using here, sans these two (convert to float, div, mul, convert to int, saturate to uint8_t). > + cvtps2dq m0, m0 > + > + packusdw m0, m0 ; 00000x0x > + packuswb m0, m0 ; 000000xx > + movd [dstq + xq], m0 > + add xq, mmsize / 4 > + > + jl .loop > +BLEND_END > diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c > index a6baf94..f542870 100644 > --- a/libavfilter/x86/vf_blend_init.c > +++ b/libavfilter/x86/vf_blend_init.c > @@ -48,6 +48,7 @@ BLEND_FUNC(difference, sse2) > BLEND_FUNC(difference, ssse3) > BLEND_FUNC(negation, sse2) > BLEND_FUNC(negation, ssse3) > +BLEND_FUNC(divide, sse4) > > av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) > { > @@ -79,4 +80,9 @@ av_cold void ff_blend_init_x86(FilterParams *param, int > is_16bit) > case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; > break; > } > } > + if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1 && !is_16bit) { > + switch (param->mode) { > + case BLEND_DIVIDE: param->blend = ff_blend_divide_sse4; break; > + } > + } > } > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel