Re: [FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE2 optimization for divide

2016-02-28 Thread Timothy Gu
On Sun, Feb 14, 2016 at 03:45:11PM +0100, Henrik Gramner wrote:
> You could try doing 8 or 16 bytes per iteration instead of 4, it might
> be faster depending on how good your cpu is at OOE.

As discussed on IRC, no measurable difference was observed with such
changes, mainly because the bottleneck is in the division itself.

Therefore, patch applied without changes.

Timothy
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE2 optimization for divide

2016-02-14 Thread Henrik Gramner
You could try doing 8 or 16 bytes per iteration instead of 4, it might
be faster depending on how good your cpu is at OOE.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE2 optimization for divide

2016-02-14 Thread Paul B Mahol
On 2/14/16, Timothy Gu  wrote:
> On Sat, Feb 13, 2016 at 07:21:25PM -0800, Timothy Gu wrote:
>> ---
>>  libavfilter/x86/vf_blend.asm| 30 ++
>>  libavfilter/x86/vf_blend_init.c |  2 ++
>>  2 files changed, 32 insertions(+)
>
> Locally added commit message:
>
>  4.5x faster than C float version with autovectorization
> 10  x faster than C int version
> 25  x faster than C float version without autovectorization
>
> Timothy
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>

ok
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE2 optimization for divide

2016-02-13 Thread Timothy Gu
On Sat, Feb 13, 2016 at 07:21:25PM -0800, Timothy Gu wrote:
> ---
>  libavfilter/x86/vf_blend.asm| 30 ++
>  libavfilter/x86/vf_blend_init.c |  2 ++
>  2 files changed, 32 insertions(+)

Locally added commit message:

 4.5x faster than C float version with autovectorization
10  x faster than C int version
25  x faster than C float version without autovectorization

Timothy
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE2 optimization for divide

2016-02-13 Thread Timothy Gu
---
 libavfilter/x86/vf_blend.asm| 30 ++
 libavfilter/x86/vf_blend_init.c |  2 ++
 2 files changed, 32 insertions(+)

diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index a5ea74c..303ea3a 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -24,6 +24,7 @@
 
 SECTION_RODATA
 
+ps_255: times 4 dd 255.0
 pw_1:   times 8 dw 1
 pw_128: times 8 dw 128
 pw_255: times 8 dw 255
@@ -219,6 +220,35 @@ BLEND_INIT hardmix, 5
 jl .loop
 BLEND_END
 
+BLEND_INIT divide, 4                          ; dst = min(top / bottom * 255, 255), computed in single-precision floats
+    pxor        m2, m2                        ; m2 = 0, zero source for the unpacks below
+    mova        m3, [ps_255]                  ; m3 = 4 x 255.0: scale factor and clamp limit
+.nextrow:
+    mov         xq, widthq                    ; restart the column index for this row
+
+.loop:
+    movd        m0, [topq + xq]               ; 00xx
+    movd        m1, [bottomq + xq]
+    punpcklbw   m0, m2                        ; 0x0x
+    punpcklbw   m1, m2
+    punpcklwd   m0, m2                        ; 000x000x
+    punpcklwd   m1, m2
+
+    cvtdq2ps    m0, m0                        ; int32 -> float
+    cvtdq2ps    m1, m1
+    divps       m0, m1                        ; a / b (b == 0 gives +inf, or NaN for 0 / 0)
+    mulps       m0, m3                        ; a / b * 255
+    minps       m0, m3                        ; clamp to 255; minps returns the 2nd source (m3) on NaN, so b == 0 -> 255
+    cvttps2dq   m0, m0                        ; float -> int32, truncating
+
+    packssdw    m0, m0                        ; 0x0x
+    packuswb    m0, m0                        ; 00xx
+    movd        [dstq + xq], m0
+    add         xq, mmsize / 4                ; one dword lane per pixel, so mmsize / 4 pixels per iteration
+
+    jl .loop                                  ; flags from the add: loop while xq < 0 (presumably BLEND_INIT makes widthq negative; verify against the macro)
+BLEND_END
+
 BLEND_INIT phoenix, 4
 mova   m3, [pb_255]
 .nextrow:
diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
index a6baf94..677e759 100644
--- a/libavfilter/x86/vf_blend_init.c
+++ b/libavfilter/x86/vf_blend_init.c
@@ -39,6 +39,7 @@ BLEND_FUNC(difference128, sse2)
 BLEND_FUNC(multiply, sse2)
 BLEND_FUNC(screen, sse2)
 BLEND_FUNC(hardmix, sse2)
+BLEND_FUNC(divide, sse2)
 BLEND_FUNC(lighten, sse2)
 BLEND_FUNC(or, sse2)
 BLEND_FUNC(phoenix, sse2)
@@ -61,6 +62,7 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
 case BLEND_AVERAGE:  param->blend = ff_blend_average_sse2;  break;
 case BLEND_DARKEN:   param->blend = ff_blend_darken_sse2;   break;
 case BLEND_DIFFERENCE128: param->blend = ff_blend_difference128_sse2; break;
+case BLEND_DIVIDE:   param->blend = ff_blend_divide_sse2;   break;
 case BLEND_HARDMIX:  param->blend = ff_blend_hardmix_sse2;  break;
 case BLEND_LIGHTEN:  param->blend = ff_blend_lighten_sse2;  break;
 case BLEND_MULTIPLY: param->blend = ff_blend_multiply_sse2; break;
-- 
2.1.4

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel