---
It doesn't gain much, but it helps a little.

athlon64:
c    - 1060
sse  -  303
sse3 -  298

sandybridge:
c    -  738
sse  -  215
sse3 -  217
avx  -  208

 libavcodec/x86/dsputil_mmx.c    |   12 ++++++++++--
 libavcodec/x86/dsputil_yasm.asm |   37 ++++++++++++++++++++++++++++++-------
 2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index be0ac2e..ed80ab6 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2413,7 +2413,9 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, 
const uint8_t *top, const
 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int 
w, int left);
 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, 
int left);
 
-float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
+float ff_scalarproduct_float_sse_aligned (const float *v1, const float *v2, 
int order);
+float ff_scalarproduct_float_sse3_aligned(const float *v1, const float *v2, 
int order);
+float ff_scalarproduct_float_avx         (const float *v1, const float *v2, 
int order);
 
 void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src, int32_t 
min,
                                    int32_t max, unsigned int len);
@@ -2867,7 +2869,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext 
*avctx)
 #endif
             c->vector_clipf = vector_clipf_sse;
 #if HAVE_YASM
-            c->scalarproduct_float = ff_scalarproduct_float_sse;
+            c->scalarproduct_float = ff_scalarproduct_float_sse_aligned;
 
             if (!high_bit_depth)
                 c->emulated_edge_mc = emulated_edge_mc_sse;
@@ -2894,6 +2896,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext 
*avctx)
             }
 #endif
         }
+        if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE3) {
+#if HAVE_YASM
+            c->scalarproduct_float = ff_scalarproduct_float_sse3_aligned;
+#endif
+        }
         if (mm_flags & AV_CPU_FLAG_SSSE3) {
 #if HAVE_YASM
             if (mm_flags & AV_CPU_FLAG_ATOM) {
@@ -2925,6 +2932,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext 
*avctx)
                 c->put_h264_chroma_pixels_tab[0]= 
ff_put_h264_chroma_mc8_10_avx;
                 c->avg_h264_chroma_pixels_tab[0]= 
ff_avg_h264_chroma_mc8_10_avx;
             }
+            c->scalarproduct_float = ff_scalarproduct_float_avx;
         }
 #endif
     }
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 5244362..25eb19f 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -456,29 +456,52 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, 
w, left
     ADD_HFYU_LEFT_LOOP 0
 
 
-; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
-cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
+;------------------------------------------------------------------------------
+; float ff_scalarproduct_float(const float *v1, const float *v2, int len)
+;------------------------------------------------------------------------------
+
+%macro SCALARPRODUCT_FLOAT 0
+cglobal scalarproduct_float, 3,3,3, v1, v2, offset
     neg offsetq
     shl offsetq, 2
     sub v1q, offsetq
     sub v2q, offsetq
-    xorps xmm0, xmm0
-    .loop:
-        movaps   xmm1, [v1q+offsetq]
-        mulps    xmm1, [v2q+offsetq]
-        addps    xmm0, xmm1
-        add      offsetq, 16
+    xorps           m0, m0, m0            ; zero the full-width accumulator (m0 is ymm0 under INIT_YMM)
+.loop:
+    movu            m1, [v1q+offsetq]     ; offset is negative here; v1/v2 were advanced past the end
+    mulps           m1, m1, [v2q+offsetq]
+    addps           m0, m0, m1            ; m0 += v1[i..] * v2[i..], one partial sum per float lane
+    add        offsetq, mmsize            ; NOTE(review): consumes mmsize/4 floats per pass - confirm callers' len is a multiple of that
         js       .loop
+%if cpuflag(avx)
+    vextractf128  xmm1, ymm0, 1           ; high lane first: a VEX write to xmm0 would clear ymm0[255:128]
+    vzeroupper                            ; xmm0 keeps the low lane; clear upper halves before returning to non-VEX code
+    addps         xmm0, xmm1              ; fold the 8 partial sums down to 4
+%endif
+%if cpuflag(sse3)
+    haddps        xmm0, xmm0              ; 4 partial sums -> 2
+    haddps        xmm0, xmm0              ; 2 -> 1, total in the low float of xmm0
+%else
     movhlps xmm1, xmm0
     addps   xmm0, xmm1
     movss   xmm1, xmm0
     shufps  xmm0, xmm0, 1
     addss   xmm0, xmm1
+%endif
 %ifndef ARCH_X86_64
     movd    r0m,  xmm0
     fld     dword r0m
 %endif
     RET
+%endmacro
+
+INIT_XMM sse, aligned                    ; 128-bit build; C side declares ff_scalarproduct_float_sse_aligned
+SCALARPRODUCT_FLOAT
+INIT_XMM sse3, aligned                   ; as above, but cpuflag(sse3) selects the haddps reduction
+SCALARPRODUCT_FLOAT
+INIT_YMM avx                             ; 256-bit build (mmsize-wide loop); no `aligned` flag, so presumably unaligned loads - confirm in x86inc
+SCALARPRODUCT_FLOAT
+
 
 ; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg 
linesize,
 ;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
-- 
1.7.1

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to