---
 libavutil/x86/float_dsp.asm    |   47 ++++++++++++++++++++++++++++++++++++++++
 libavutil/x86/float_dsp_init.c |   42 ++---------------------------------
 2 files changed, 50 insertions(+), 39 deletions(-)

diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index ae9b0e4..e9019fd 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -212,3 +212,50 @@ INIT_XMM sse2
 VECTOR_FMUL_WINDOW
 INIT_YMM avx
 VECTOR_FMUL_WINDOW
+
+;------------------------------------------------------------------------------
+; void ff_vector_clipf(float *dst, const float *src, float min, float max,
+;                      int len)
+;------------------------------------------------------------------------------
+; dst[i] = min(max(src[i], min), max); aligned moves, 4 vectors per iteration
+INIT_XMM sse
+%if ARCH_X86_32 || WIN64 ; Win64 args are positional, so len is the 5th argument
+cglobal vector_clipf, 5,5,6, dst, src, min, max, len
+%else                    ; remaining case: min/max use no GPR slot, len is 3rd
+cglobal vector_clipf, 3,3,6, dst, src, len
+%endif
+%if ARCH_X86_32
+    ; caller puts min/max on the stack
+    movss    m0, minm
+    movss    m1, maxm
+%elif WIN64
+    ; caller puts min/max in xmm2/xmm3; renaming makes them m0/m1, no copy
+    SWAP      0, 2
+    SWAP      1, 3
+%endif ; remaining case: min/max are expected in xmm0/xmm1 already (SysV ABI)
+    shufps   m0, m0, 0                  ; broadcast min to every lane
+    shufps   m1, m1, 0                  ; broadcast max to every lane
+    lea    lenq, [4*lend]               ; length in bytes (4 per float)
+    add    dstq, lenq                   ; point both buffers at their end ...
+    add    srcq, lenq
+    neg    lenq                         ; ... and index with a negative offset
+.loop: ; runs before any check: assumes len > 0 and len*4 % (4*mmsize) == 0
+    mova     m2, [srcq+lenq+0*mmsize]
+    mova     m3, [srcq+lenq+1*mmsize]
+    mova     m4, [srcq+lenq+2*mmsize]
+    mova     m5, [srcq+lenq+3*mmsize]
+    maxps    m2, m0                     ; raise values below min up to min
+    maxps    m3, m0
+    maxps    m4, m0
+    maxps    m5, m0
+    minps    m2, m1                     ; lower values above max down to max
+    minps    m3, m1
+    minps    m4, m1
+    minps    m5, m1
+    mova [dstq+lenq+0*mmsize], m2
+    mova [dstq+lenq+1*mmsize], m3
+    mova [dstq+lenq+2*mmsize], m4
+    mova [dstq+lenq+3*mmsize], m5
+    add    lenq, 4*mmsize               ; advance by the four vectors just done
+    jl .loop                            ; offset still negative: more to process
+    REP_RET
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index 7b0711e..38e3dfe 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -21,7 +21,6 @@
 #include "libavutil/cpu.h"
 #include "libavutil/float_dsp.h"
 #include "cpu.h"
-#include "asm.h"
 
 extern void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
                                int len);
@@ -48,41 +47,8 @@ extern void ff_vector_fmul_window_avx(float *dst, const float *src0,
                                       const float *src1, const float *win,
                                       int len);
 
-#if HAVE_INLINE_ASM
-static void vector_clipf_sse(float *dst, const float *src, float min,
-                             float max, int len)
-{
-    x86_reg i = (len - 16) * 4;
-    __asm__ volatile (
-        "movss          %3, %%xmm4      \n\t"
-        "movss          %4, %%xmm5      \n\t"
-        "shufps $0, %%xmm4, %%xmm4      \n\t"
-        "shufps $0, %%xmm5, %%xmm5      \n\t"
-        "1:                             \n\t"
-        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
-        "movaps 16(%2, %0), %%xmm1      \n\t"
-        "movaps 32(%2, %0), %%xmm2      \n\t"
-        "movaps 48(%2, %0), %%xmm3      \n\t"
-        "maxps      %%xmm4, %%xmm0      \n\t"
-        "maxps      %%xmm4, %%xmm1      \n\t"
-        "maxps      %%xmm4, %%xmm2      \n\t"
-        "maxps      %%xmm4, %%xmm3      \n\t"
-        "minps      %%xmm5, %%xmm0      \n\t"
-        "minps      %%xmm5, %%xmm1      \n\t"
-        "minps      %%xmm5, %%xmm2      \n\t"
-        "minps      %%xmm5, %%xmm3      \n\t"
-        "movaps     %%xmm0,   (%1, %0)  \n\t"
-        "movaps     %%xmm1, 16(%1, %0)  \n\t"
-        "movaps     %%xmm2, 32(%1, %0)  \n\t"
-        "movaps     %%xmm3, 48(%1, %0)  \n\t"
-        "sub           $64, %0          \n\t"
-        "jge            1b              \n\t"
-        : "+&r"(i)
-        : "r"(dst), "r"(src), "m"(min), "m"(max)
-        : "memory"
-    );
-}
-#endif /* HAVE_INLINE_ASM */
+extern void ff_vector_clipf_sse(float *dst, const float *src, float min,
+                                float max, int len);
 
 void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
 {
@@ -92,9 +58,7 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
         fdsp->vector_fmul = ff_vector_fmul_sse;
         fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
         fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;
-#if HAVE_INLINE_ASM
-        fdsp->vector_clipf = vector_clipf_sse;
-#endif
+        fdsp->vector_clipf       = ff_vector_clipf_sse;
     }
     if (EXTERNAL_SSE2(mm_flags)) {
         fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
-- 
1.7.1

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to