1.5x-1.8x faster on sandybridge
---
 libavutil/lls.c          |  3 +++
 libavutil/lls.h          |  1 +
 libavutil/x86/lls.asm    | 31 +++++++++++++++++++++++++++++++
 libavutil/x86/lls_init.c |  6 +++++-
 4 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/libavutil/lls.c b/libavutil/lls.c
index eb500af..8f1aff1 100644
--- a/libavutil/lls.c
+++ b/libavutil/lls.c
@@ -119,6 +119,9 @@ double avpriv_evaluate_lls(LLSModel *m, double *param, int order)
     int i;
     double out = 0;
 
+    if (m->evaluate_lls)
+        return m->evaluate_lls(m->coeff[order], param, order);
+
     for (i = 0; i <= order; i++)
         out += param[i] * m->coeff[order][i];
 
diff --git a/libavutil/lls.h b/libavutil/lls.h
index 76ff10c..8a4d318 100644
--- a/libavutil/lls.h
+++ b/libavutil/lls.h
@@ -39,6 +39,7 @@ typedef struct LLSModel {
     double variance[MAX_VARS];
     int indep_count;
     void (*update_lls)(struct LLSModel *m, double *var, int order);
+    double (*evaluate_lls)(double *coefs, double *var, int order);
 } LLSModel;
 
 void avpriv_init_lls(LLSModel *m, int indep_count);
diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
index f44f7ae..b5e04d9 100644
--- a/libavutil/x86/lls.asm
+++ b/libavutil/x86/lls.asm
@@ -182,3 +182,56 @@ cglobal update_lls, 3,6,8, covar, var, count, i, j, count2
     jle .loop2x1
 .ret:
     REP_RET
+
+
+;-----------------------------------------------------------------------------
+; double ff_evaluate_lls_sse2(double *coefs, double *var, int order)
+;
+; Returns the dot product of coefs[0..order] and var[0..order] (order + 1
+; terms), accumulated two doubles at a time.
+;
+; Requirements visible from the code below:
+;   - order >= 3: the first pair and one .loop iteration are unconditional,
+;     so at least four elements of each array are always read.
+;   - coefs is used as a mulpd memory operand, so it must be 16-byte aligned.
+;     var is loaded in 8-byte halves and may have any alignment.
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal evaluate_lls, 3,3,3, coefs, var, i
+    ; The third argument is a C int; the 64-bit ABIs leave the upper half of
+    ; its register unspecified, so widen it before using iq for addressing.
+    movsxdifnidn iq, id
+    ; This function is often called on the same buffer as update_lls, but with
+    ; an offset. They can't both be aligned.
+    movsd   m0, [varq]
+    movhpd  m0, [varq+8]
+    mulpd   m0, [coefsq]          ; m0 = {var[0]*coefs[0], var[1]*coefs[1]}
+    ; Point both bases at element [order] and count iq upwards from
+    ; 2 - order, so [base+iq*8] starts at element 2 and the add that
+    ; advances iq also sets the flags for the loop exit.
+    lea coefsq, [coefsq+iq*8]
+    lea   varq, [varq+iq*8]
+    neg     iq
+    add     iq, 2
+.loop:
+    movsd   m1, [varq+iq*8]
+    movhpd  m1, [varq+iq*8+8]
+    mulpd   m1, [coefsq+iq*8]
+    addpd   m0, m1
+    add     iq, 2
+    jl .loop
+    ; iq is now 0 (element [order] is still pending) or 1 (all done).
+    jg .skip1
+    movsd   m1, [varq+iq*8]       ; loads the low half and zeroes the high half
+    mulsd   m1, [coefsq+iq*8]
+    addpd   m0, m1
+.skip1:
+    movhlps m1, m0                ; horizontal add of the two partial sums
+    addsd   m0, m1
+%if ARCH_X86_32
+    ; 32-bit calling conventions return doubles in st0; bounce through the
+    ; stack slot of the first argument.
+    movsd  r0m, m0
+    fld   qword r0m
+%endif
+    RET
diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
index 1215b14..d65ddc8 100644
--- a/libavutil/x86/lls_init.c
+++ b/libavutil/x86/lls_init.c
@@ -25,12 +25,16 @@
 
 void ff_update_lls_sse2(LLSModel *m, double *var, int order);
 void ff_update_lls_avx(LLSModel *m, double *var, int order);
+double ff_evaluate_lls_sse2(double *coefs, double *var, int order);
 
 void avpriv_init_lls_x86(LLSModel *m)
 {
     int cpu_flags = av_get_cpu_flags();
-    if (EXTERNAL_SSE2(cpu_flags))
+    if (EXTERNAL_SSE2(cpu_flags)) {
         m->update_lls = ff_update_lls_sse2;
+        if (m->indep_count >= 4)
+            m->evaluate_lls = ff_evaluate_lls_sse2;
+    }
     if (EXTERNAL_AVX(cpu_flags))
         m->update_lls = ff_update_lls_avx;
 }
-- 
1.8.1.5

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to