[FFmpeg-devel] [PATCH] vf_blend: Add SSE2 optimization for multiply

2016-02-08 Thread Timothy Gu
5 times faster than C, 3 times overall.
---

Removes constant.c changes in accordance with James's comments

---
 libavfilter/x86/vf_blend.asm    | 29 +++++++++++++++++++++++++++++
 libavfilter/x86/vf_blend_init.c |  2 ++
 2 files changed, 31 insertions(+)

diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index 730be77..9388a74 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -24,6 +24,7 @@
 
 SECTION_RODATA
 
+pw_1:   times 8 dw 1
 pw_128: times 8 dw 128
 pw_255: times 8 dw 255
 pb_127: times 16 db 127
@@ -101,6 +102,34 @@ BLEND_INIT difference128, 4
 jl .loop
 BLEND_END
 
+BLEND_INIT multiply, 4
+pxor   m2, m2
+mova   m3, [pw_1]
+.nextrow:
+mov    xq, widthq
+
+.loop:
+ ; word
+ ; |--|
+movh    m0, [topq + xq]  ; 
+movh    m1, [bottomq + xq]
+punpcklbw   m0, m2   ; 00xx00xx
+punpcklbw   m1, m2
+
+pmullw  m0, m1   ;   a * b
+paddw   m0, m3
+mova    m1, m0
+psrlw   m1, 8
+paddw   m0, m1
+psrlw   m0, 8; 00xx00xx  a * b / 255
+
+packuswb    m0, m0   ; 
+movh   [dstq + xq], m0
+add xq, mmsize / 2
+
+jl .loop
+BLEND_END
+
 BLEND_INIT average, 3
 pxor   m2, m2
 .nextrow:
diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
index dc29547..8ac526a 100644
--- a/libavfilter/x86/vf_blend_init.c
+++ b/libavfilter/x86/vf_blend_init.c
@@ -36,6 +36,7 @@ BLEND_FUNC(average, sse2)
 BLEND_FUNC(and, sse2)
 BLEND_FUNC(darken, sse2)
 BLEND_FUNC(difference128, sse2)
+BLEND_FUNC(multiply, sse2)
 BLEND_FUNC(hardmix, sse2)
 BLEND_FUNC(lighten, sse2)
 BLEND_FUNC(or, sse2)
@@ -61,6 +62,7 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
 case BLEND_DIFFERENCE128: param->blend = ff_blend_difference128_sse2; break;
 case BLEND_HARDMIX:  param->blend = ff_blend_hardmix_sse2;  break;
 case BLEND_LIGHTEN:  param->blend = ff_blend_lighten_sse2;  break;
+case BLEND_MULTIPLY: param->blend = ff_blend_multiply_sse2; break;
 case BLEND_OR:   param->blend = ff_blend_or_sse2;   break;
 case BLEND_PHOENIX:  param->blend = ff_blend_phoenix_sse2;  break;
 case BLEND_SUBTRACT: param->blend = ff_blend_subtract_sse2; break;
-- 
1.9.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] vf_blend: Add SSE2 optimization for multiply

2016-02-08 Thread Timothy Gu
On Mon, Feb 08, 2016 at 09:51:55PM +0100, Paul B Mahol wrote:
> On 2/8/16, Timothy Gu  wrote:
> > 5 times faster than C, 3 times overall.
> > ---
> >
> > Removes constant.c changes in accordance with James's comments
> >
> > ---
> >  libavfilter/x86/vf_blend.asm    | 29 +++++++++++++++++++++++++++++
> >  libavfilter/x86/vf_blend_init.c |  2 ++
> >  2 files changed, 31 insertions(+)
> >
> 
> If output is always bitexact with C, should be OK for me.

Yes.

Pushed, thanks.

Timothy
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel