[FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE4.1 optimization for divide

2016-02-13 Thread Timothy Gu
---

The reason why this function uses SSE4.1 is the roundps instruction. Would
love to find a way to truncate a float to integer in SSE2.

---
 libavfilter/x86/vf_blend.asm    | 32 ++++++++++++++++++++++++++++++++
 libavfilter/x86/vf_blend_init.c |  6 ++++++
 2 files changed, 38 insertions(+)

diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index a5ea74c..dac04d7 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -24,6 +24,7 @@
 
 SECTION_RODATA
 
+ps_255: times 4 dd 255.0
 pw_1:   times 8 dw 1
 pw_128: times 8 dw 128
 pw_255: times 8 dw 255
@@ -285,3 +286,34 @@ INIT_XMM sse2
 BLEND_ABS
 INIT_XMM ssse3
 BLEND_ABS
+
+INIT_XMM sse4                                ; SSE4.1 is needed for roundps and packusdw
+BLEND_INIT divide, 4                         ; NOTE(review): 4 is presumably the XMM register count (m0-m3) -- confirm against BLEND_INIT
+    pxor       m2, m2                        ; m2 = 0, zero source for the unpacks below
+    mova       m3, [ps_255]                  ; m3 = 255.0 in each of the four float lanes
+.nextrow:
+    mov        xq, widthq                    ; NOTE(review): jl below implies xq starts negative; assumes BLEND_INIT negates widthq -- confirm
+
+    .loop:                                   ; each iteration handles the 4 bytes loaded by movd
+        movd            m0, [topq + xq]      ; 00xx
+        movd            m1, [bottomq + xq]
+        punpcklbw       m0, m2               ; 0x0x
+        punpcklbw       m1, m2
+        punpcklwd       m0, m2               ; 000x000x
+        punpcklwd       m1, m2
+
+        cvtdq2ps        m0, m0               ; int32 -> float (top)
+        cvtdq2ps        m1, m1               ; int32 -> float (bottom)
+        divps           m0, m1               ; a / b
+        mulps           m0, m3               ; a / b * 255
+        roundps         m0, m0, 3            ; truncate
+        minps           m0, m3               ; clamp to 255.0; needed for divide-by-0 cases
+        cvtps2dq        m0, m0               ; float -> int32 (value is already integral)
+
+        packusdw        m0, m0               ; 0x0x
+        packuswb        m0, m0               ; 00xx
+        movd   [dstq + xq], m0               ; store the 4 result bytes
+        add             xq, mmsize / 4       ; advance by the number of bytes consumed per iteration
+
+    jl .loop                                 ; continue while xq is still negative
+BLEND_END
diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
index a6baf94..f542870 100644
--- a/libavfilter/x86/vf_blend_init.c
+++ b/libavfilter/x86/vf_blend_init.c
@@ -48,6 +48,7 @@ BLEND_FUNC(difference, sse2)
 BLEND_FUNC(difference, ssse3)
 BLEND_FUNC(negation, sse2)
 BLEND_FUNC(negation, ssse3)
+BLEND_FUNC(divide, sse4)
 
 av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
 {
@@ -79,4 +80,9 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
 case BLEND_NEGATION:   param->blend = ff_blend_negation_ssse3;   break;
 }
 }
+if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1 && !is_16bit) {
+switch (param->mode) {
+case BLEND_DIVIDE:   param->blend = ff_blend_divide_sse4;   break;
+}
+}
 }
-- 
2.1.4

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE4.1 optimization for divide

2016-02-13 Thread Timothy Gu
I've already answered these on IRC, but for the sake of completeness I'll
include the answers here as well.

On Sat, Feb 13, 2016 at 10:26:58PM -0300, James Almer wrote:
> On 2/13/2016 9:27 PM, Timothy Gu wrote:
> > ---
> > 
> > The reason why this function uses SSE4.1 is the roundps instruction. Would
> > love to find a way to truncate a float to integer in SSE2.

SSE2 already provides this: CVTTPS2DQ — Convert with Truncation Packed
Single-Precision FP Values to Packed Dword Integers.

> > +punpcklwd   m0, m2   ; 000x000x
> > +punpcklwd   m1, m2
> > +
> > +cvtdq2psm0, m0
> > +cvtdq2psm1, m1
> > +divps   m0, m1   ; a / b
> > +mulps   m0, m3   ; a / b * 255
> > +roundps m0, m0, 3; truncate
> > +minps   m0, m3
> 
> Are these two really needed? After a quick glance, GCC seems to simply
> generate more or less the same code you're using here sans these two
> (convert to float, div, mul, convert to int, saturate to uint8_t).

roundps becomes unnecessary once cvtps2dq is replaced with cvttps2dq, which
truncates during the conversion. minps is still needed for divide-by-0 cases.

Timothy
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE4.1 optimization for divide

2016-02-13 Thread James Almer
On 2/13/2016 9:27 PM, Timothy Gu wrote:
> ---
> 
> The reason why this function uses SSE4.1 is the roundps instruction. Would
> love to find a way to truncate a float to integer in SSE2.
> 
> ---
>  libavfilter/x86/vf_blend.asm| 32 
>  libavfilter/x86/vf_blend_init.c |  6 ++
>  2 files changed, 38 insertions(+)
> 
> diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
> index a5ea74c..dac04d7 100644
> --- a/libavfilter/x86/vf_blend.asm
> +++ b/libavfilter/x86/vf_blend.asm
> @@ -24,6 +24,7 @@
>  
>  SECTION_RODATA
>  
> +ps_255: times 4 dd 255.0
>  pw_1:   times 8 dw 1
>  pw_128: times 8 dw 128
>  pw_255: times 8 dw 255
> @@ -285,3 +286,34 @@ INIT_XMM sse2
>  BLEND_ABS
>  INIT_XMM ssse3
>  BLEND_ABS
> +
> +INIT_XMM sse4
> +BLEND_INIT divide, 4
> +pxor   m2, m2
> +mova   m3, [ps_255]
> +.nextrow:
> +movxq, widthq
> +
> +.loop:
> +movdm0, [topq + xq]  ; 00xx
> +movdm1, [bottomq + xq]
> +punpcklbw   m0, m2   ; 0x0x
> +punpcklbw   m1, m2

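Assuming you keep using sse4, you could instead zero-extend the bytes straight
to dwords on load, replacing the movd/punpcklbw pairs above and the punpcklwd
pair below: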

pmovzxbd m0, [topq + xq]
pmovzxbd m1, [bottomq + xq]

> +punpcklwd   m0, m2   ; 000x000x
> +punpcklwd   m1, m2
> +
> +cvtdq2psm0, m0
> +cvtdq2psm1, m1
> +divps   m0, m1   ; a / b
> +mulps   m0, m3   ; a / b * 255
> +roundps m0, m0, 3; truncate
> +minps   m0, m3

Are these two really needed? After a quick glance, GCC seems to simply generate
more or less the same code you're using here sans these two (convert to float,
div, mul, convert to int, saturate to uint8_t).

> +cvtps2dqm0, m0
> +
> +packusdwm0, m0   ; 0x0x
> +packuswbm0, m0   ; 00xx
> +movd   [dstq + xq], m0
> +add xq, mmsize / 4
> +
> +jl .loop
> +BLEND_END
> diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
> index a6baf94..f542870 100644
> --- a/libavfilter/x86/vf_blend_init.c
> +++ b/libavfilter/x86/vf_blend_init.c
> @@ -48,6 +48,7 @@ BLEND_FUNC(difference, sse2)
>  BLEND_FUNC(difference, ssse3)
>  BLEND_FUNC(negation, sse2)
>  BLEND_FUNC(negation, ssse3)
> +BLEND_FUNC(divide, sse4)
>  
>  av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
>  {
> @@ -79,4 +80,9 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
>  case BLEND_NEGATION:   param->blend = ff_blend_negation_ssse3;   break;
>  }
>  }
> +if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1 && !is_16bit) {
> +switch (param->mode) {
> +case BLEND_DIVIDE:   param->blend = ff_blend_divide_sse4;   break;
> +}
> +}
>  }
> 

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel