Re: [FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE2 optimization for divide
On Sun, Feb 14, 2016 at 03:45:11PM +0100, Henrik Gramner wrote: > You could try doing 8 or 16 bytes per iteration instead of 4, it might > be faster depending on how good your cpu is at OOE. As discussed on IRC, no observable difference was seen with such changes, mainly because the bottleneck is in the division itself. Therefore, patch applied without changes. Timothy ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE2 optimization for divide
You could try doing 8 or 16 bytes per iteration instead of 4, it might be faster depending on how good your cpu is at OOE. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE2 optimization for divide
On 2/14/16, Timothy Gu wrote: > On Sat, Feb 13, 2016 at 07:21:25PM -0800, Timothy Gu wrote: >> --- >> libavfilter/x86/vf_blend.asm| 30 ++ >> libavfilter/x86/vf_blend_init.c | 2 ++ >> 2 files changed, 32 insertions(+) > > Locally added commit message: > > 4.5x faster than C float version with autovectorization > 10 x faster than C int version > 25 x faster than C float version without autovectorization > > Timothy > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > ok ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE2 optimization for divide
On Sat, Feb 13, 2016 at 07:21:25PM -0800, Timothy Gu wrote: > --- > libavfilter/x86/vf_blend.asm| 30 ++ > libavfilter/x86/vf_blend_init.c | 2 ++ > 2 files changed, 32 insertions(+) Locally added commit message: 4.5x faster than C float version with autovectorization 10 x faster than C int version 25 x faster than C float version without autovectorization Timothy ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE2 optimization for divide
--- libavfilter/x86/vf_blend.asm| 30 ++ libavfilter/x86/vf_blend_init.c | 2 ++ 2 files changed, 32 insertions(+) diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm index a5ea74c..303ea3a 100644 --- a/libavfilter/x86/vf_blend.asm +++ b/libavfilter/x86/vf_blend.asm @@ -24,6 +24,7 @@ SECTION_RODATA +ps_255: times 4 dd 255.0 pw_1: times 8 dw 1 pw_128: times 8 dw 128 pw_255: times 8 dw 255 @@ -219,6 +220,35 @@ BLEND_INIT hardmix, 5 jl .loop BLEND_END +BLEND_INIT divide, 4 +pxor m2, m2 +mova m3, [ps_255] +.nextrow: +movxq, widthq + +.loop: +movdm0, [topq + xq] ; 00xx +movdm1, [bottomq + xq] +punpcklbw m0, m2 ; 0x0x +punpcklbw m1, m2 +punpcklwd m0, m2 ; 000x000x +punpcklwd m1, m2 + +cvtdq2psm0, m0 +cvtdq2psm1, m1 +divps m0, m1 ; a / b +mulps m0, m3 ; a / b * 255 +minps m0, m3 +cvttps2dq m0, m0 + +packssdwm0, m0 ; 0x0x +packuswbm0, m0 ; 00xx +movd [dstq + xq], m0 +add xq, mmsize / 4 + +jl .loop +BLEND_END + BLEND_INIT phoenix, 4 mova m3, [pb_255] .nextrow: diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c index a6baf94..677e759 100644 --- a/libavfilter/x86/vf_blend_init.c +++ b/libavfilter/x86/vf_blend_init.c @@ -39,6 +39,7 @@ BLEND_FUNC(difference128, sse2) BLEND_FUNC(multiply, sse2) BLEND_FUNC(screen, sse2) BLEND_FUNC(hardmix, sse2) +BLEND_FUNC(divide, sse2) BLEND_FUNC(lighten, sse2) BLEND_FUNC(or, sse2) BLEND_FUNC(phoenix, sse2) @@ -61,6 +62,7 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) case BLEND_AVERAGE: param->blend = ff_blend_average_sse2; break; case BLEND_DARKEN: param->blend = ff_blend_darken_sse2; break; case BLEND_DIFFERENCE128: param->blend = ff_blend_difference128_sse2; break; +case BLEND_DIVIDE: param->blend = ff_blend_divide_sse2; break; case BLEND_HARDMIX: param->blend = ff_blend_hardmix_sse2; break; case BLEND_LIGHTEN: param->blend = ff_blend_lighten_sse2; break; case BLEND_MULTIPLY: param->blend = ff_blend_multiply_sse2; break; -- 2.1.4 ___ ffmpeg-devel mailing list 
ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel