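Provides a single SPLATD macro that can either splat the low dword of a
register in-place or load/splat from a 32-bit value in memory. It replaces
the separate SPLATD_MMX, SPLATD_SSE and SPLATD_SSE2 macros and picks the
instruction sequence from mmsize and the active cpuflags.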
---
 libavcodec/x86/dsputil_yasm.asm |   10 +++-----
 libavcodec/x86/fmtconvert.asm   |   28 ++++++++++++--------------
 libavutil/x86/x86util.asm       |   40 ++++++++++++++++++++++++--------------
 libswscale/x86/output.asm       |    2 +-
 4 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 746ba69..4e662d9 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1062,14 +1062,14 @@ cglobal vector_clip_int32, 5,5,11, dst, src, min, max, len
 %if notcpuflag(sse4) && cpuflag(sse2) && notcpuflag(atom)
     cvtsi2ss   m4, minm
     cvtsi2ss   m5, maxm
+    SPLATD     m4
+    SPLATD     m5
     %assign is_float 1
 %else
-    movd      m4, minm
-    movd      m5, maxm
+    SPLATD     m4, minm
+    SPLATD     m5, maxm
     %assign is_float 0
 %endif
-    SPLATD    m4
-    SPLATD    m5
 .loop:
 %assign %%i 1
 %rep %1
@@ -1113,10 +1113,8 @@ cglobal vector_clip_int32, 5,5,11, dst, src, min, max, len
 %endmacro
 
 INIT_MMX mmx
-%define SPLATD SPLATD_MMX
 VECTOR_CLIP_INT32 1, 0
 INIT_XMM sse2,atom
-%define SPLATD SPLATD_SSE2
 VECTOR_CLIP_INT32 1, 0
 INIT_XMM sse2
 VECTOR_CLIP_INT32 2, 0
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 4916e7a..2660361 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -27,18 +27,20 @@ SECTION_TEXT
 ;---------------------------------------------------------------------------------
 ; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
 ;---------------------------------------------------------------------------------
-%macro INT32_TO_FLOAT_FMUL_SCALAR 2
+%macro INT32_TO_FLOAT_FMUL_SCALAR 0
 %if UNIX64
-cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
+cglobal int32_to_float_fmul_scalar, 3,3,5, dst, src, len
 %else
-cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
+cglobal int32_to_float_fmul_scalar, 4,4,5, dst, src, mul, len
 %endif
-%if WIN64
+%if ARCH_X86_32
+    SPLATD    m0, mulm
+%else
+    %if WIN64
     SWAP 0, 2
-%elif ARCH_X86_32
-    movss   m0, mulm
+    %endif
+    SPLATD    m0
 %endif
-    SPLATD  m0
     shl     lenq, 2
     add     srcq, lenq
     add     dstq, lenq
@@ -64,14 +66,10 @@ cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
     REP_RET
 %endmacro
 
-INIT_XMM
-%define SPLATD SPLATD_SSE
-%define movdqa movaps
-INT32_TO_FLOAT_FMUL_SCALAR sse, 5
-%undef movdqa
-%define SPLATD SPLATD_SSE2
-INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
-%undef SPLATD
+INIT_XMM sse
+INT32_TO_FLOAT_FMUL_SCALAR
+INIT_XMM sse2
+INT32_TO_FLOAT_FMUL_SCALAR
 
 
 ;------------------------------------------------------------------------------
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index a33858b..654aba4 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -559,24 +559,34 @@
 %endif
 %endmacro
 
-%macro SPLATD 2-3 0
-%if mmsize == 16
-    pshufd %1, %2, (%3)*0x55
-%else
-    pshufw %1, %2, (%3)*0x11 + ((%3)+1)*0x44
+; splat low dword or m32 to all dwords
+; %1 = dst/src mmreg, %2 = src m32 (optional)
+%macro SPLATD 1-2
+%if %0 > 1 && notcpuflag(avx)
+    %if mmsize == 8 || cpuflag(sse2)
+    movd   %1, %2
+    %else
+    movss  %1, %2
+    %endif
 %endif
-%endmacro
-
-%macro SPLATD_MMX 1
+%if mmsize == 8
     punpckldq  %1, %1
-%endmacro
-
-%macro SPLATD_SSE 1
-    shufps  %1, %1, 0
-%endmacro
-
-%macro SPLATD_SSE2 1
+%elif mmsize == 16
+    %if %0 > 1 && cpuflag(avx)
+    vbroadcastss %1, %2
+    %elif cpuflag(sse2)
     pshufd  %1, %1, 0
+    %else ; sse
+    shufps  %1, %1, 0
+    %endif
+%elif mmsize == 32
+    %if %0 > 1
+    vbroadcastss %1, %2
+    %else
+    vperm2f128 %1, %1, %1, 0
+    shufps     %1, %1, 0
+    %endif
+%endif
 %endmacro
 
 %macro CLIPW 3 ;(dst, min, max)
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index 9b0b012..b970792 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -188,7 +188,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
 %else ; %1 == 10/9/8
     punpcklwd       m5,  m3,  m4
     punpckhwd       m3,  m4
-    SPLATD          m0,  m0
+    SPLATD          m0
 
     pmaddwd         m5,  m0
     pmaddwd         m3,  m0
-- 
1.7.1

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to