---
libavutil/x86/float_dsp.asm | 47 ++++++++++++++++++++++++++++++++++++++++
libavutil/x86/float_dsp_init.c | 42 ++---------------------------------
2 files changed, 50 insertions(+), 39 deletions(-)
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index ae9b0e4..e9019fd 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -212,3 +212,50 @@ INIT_XMM sse2
VECTOR_FMUL_WINDOW
INIT_YMM avx
VECTOR_FMUL_WINDOW
+
+;------------------------------------------------------------------------------
+; void ff_vector_clipf(float *dst, const float *src, float min, float max,
+; int len)
+;------------------------------------------------------------------------------
+
+INIT_XMM sse
+%if UNIX64
+cglobal vector_clipf, 3,3,6, dst, src, len ; min/max are already in xmm0/xmm1, so len is the 3rd integer arg
+%else
+cglobal vector_clipf, 5,5,6, dst, src, min, max, len ; x86-32 and Win64: args are positional, len is the 5th
+%endif
+%if ARCH_X86_32
+    ; caller puts min/max on the stack
+    movss   m0, minm
+    movss   m1, maxm
+%elif WIN64
+    ; caller puts min/max in xmm2/xmm3
+    SWAP 0, 2
+    SWAP 1, 3
+%endif
+    shufps  m0, m0, 0                      ; broadcast min to all 4 lanes
+    shufps  m1, m1, 0                      ; broadcast max to all 4 lanes
+    lea     lenq, [4*lend]                 ; lenq = len * sizeof(float), i.e. the byte count
+    add     dstq, lenq                     ; point both buffers past their end ...
+    add     srcq, lenq
+    neg     lenq                           ; ... and index with a negative offset counting up to 0
+.loop:
+    mova    m2, [srcq+lenq+0*mmsize]       ; aligned loads: 4 vectors (16 floats) per iteration
+    mova    m3, [srcq+lenq+1*mmsize]
+    mova    m4, [srcq+lenq+2*mmsize]
+    mova    m5, [srcq+lenq+3*mmsize]
+    maxps   m2, m0                         ; clamp from below: x = max(x, min)
+    maxps   m3, m0
+    maxps   m4, m0
+    maxps   m5, m0
+    minps   m2, m1                         ; clamp from above: x = min(x, max)
+    minps   m3, m1
+    minps   m4, m1
+    minps   m5, m1
+    mova    [dstq+lenq+0*mmsize], m2       ; aligned stores
+    mova    [dstq+lenq+1*mmsize], m3
+    mova    [dstq+lenq+2*mmsize], m4
+    mova    [dstq+lenq+3*mmsize], m5
+    add     lenq, 4*mmsize                 ; NOTE(review): assumes len is a non-zero multiple of 16 - confirm with callers
+    jl .loop
+    REP_RET
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index 7b0711e..38e3dfe 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -21,7 +21,6 @@
#include "libavutil/cpu.h"
#include "libavutil/float_dsp.h"
#include "cpu.h"
-#include "asm.h"
extern void ff_vector_fmul_sse(float *dst, const float *src0, const float
*src1,
int len);
@@ -48,41 +47,8 @@ extern void ff_vector_fmul_window_avx(float *dst, const
float *src0,
const float *src1, const float *win,
int len);
-#if HAVE_INLINE_ASM
-static void vector_clipf_sse(float *dst, const float *src, float min,
- float max, int len)
-{
- x86_reg i = (len - 16) * 4;
- __asm__ volatile (
- "movss %3, %%xmm4 \n\t"
- "movss %4, %%xmm5 \n\t"
- "shufps $0, %%xmm4, %%xmm4 \n\t"
- "shufps $0, %%xmm5, %%xmm5 \n\t"
- "1: \n\t"
- "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
- "movaps 16(%2, %0), %%xmm1 \n\t"
- "movaps 32(%2, %0), %%xmm2 \n\t"
- "movaps 48(%2, %0), %%xmm3 \n\t"
- "maxps %%xmm4, %%xmm0 \n\t"
- "maxps %%xmm4, %%xmm1 \n\t"
- "maxps %%xmm4, %%xmm2 \n\t"
- "maxps %%xmm4, %%xmm3 \n\t"
- "minps %%xmm5, %%xmm0 \n\t"
- "minps %%xmm5, %%xmm1 \n\t"
- "minps %%xmm5, %%xmm2 \n\t"
- "minps %%xmm5, %%xmm3 \n\t"
- "movaps %%xmm0, (%1, %0) \n\t"
- "movaps %%xmm1, 16(%1, %0) \n\t"
- "movaps %%xmm2, 32(%1, %0) \n\t"
- "movaps %%xmm3, 48(%1, %0) \n\t"
- "sub $64, %0 \n\t"
- "jge 1b \n\t"
- : "+&r"(i)
- : "r"(dst), "r"(src), "m"(min), "m"(max)
- : "memory"
- );
-}
-#endif /* HAVE_INLINE_ASM */
+extern void ff_vector_clipf_sse(float *dst, const float *src, float min,
+ float max, int len);
void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
@@ -92,9 +58,7 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
fdsp->vector_fmul = ff_vector_fmul_sse;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;
-#if HAVE_INLINE_ASM
- fdsp->vector_clipf = vector_clipf_sse;
-#endif
+ fdsp->vector_clipf = ff_vector_clipf_sse;
}
if (EXTERNAL_SSE2(mm_flags)) {
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
--
1.7.1
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel