The only user so far is 24-bit wmalossless. The few samples tested show an
order of 8, so further unrolling or an AVX2 version would not make sense.

Timings: 72 -> 49 cycles
---
 libavcodec/x86/lossless_audiodsp.asm    | 31 +++++++++++++++++++++++++------
 libavcodec/x86/lossless_audiodsp_init.c |  7 +++++++
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/libavcodec/x86/lossless_audiodsp.asm 
b/libavcodec/x86/lossless_audiodsp.asm
index 5597dad..d00869b 100644
--- a/libavcodec/x86/lossless_audiodsp.asm
+++ b/libavcodec/x86/lossless_audiodsp.asm
@@ -22,13 +22,17 @@
 
 SECTION .text
 
-%macro SCALARPRODUCT 0
+%macro SCALARPRODUCT 1
 ; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
 ;                                     int order, int mul)
-cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
-    shl orderq, 1
+; int ff_scalarproduct_and_madd_int32(int32_t *v1, int32_t *v2, int32_t *v3,
+;                                     int order, int mul)
+cglobal scalarproduct_and_madd_int %+ %1, 4,4,8, v1, v2, v3, order, mul
+    shl orderq, (%1/16)
     movd    m7, mulm
-%if mmsize == 16
+%if %1 == 32
+    SPLATD  m7
+%elif mmsize == 16
     pshuflw m7, m7, 0
     punpcklqdq m7, m7
 %else
@@ -46,14 +50,26 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, 
order, mul
     mova    m5, [v1q + orderq + mmsize]
     movu    m2, [v3q + orderq]
     movu    m3, [v3q + orderq + mmsize]
+%if %1 == 32
+    pmulld  m0, m4
+    pmulld  m1, m5
+    pmulld  m2, m7
+    pmulld  m3, m7
+%else
     pmaddwd m0, m4
     pmaddwd m1, m5
     pmullw  m2, m7
     pmullw  m3, m7
+%endif
     paddd   m6, m0
     paddd   m6, m1
+%if %1 == 32
+    paddd   m2, m4
+    paddd   m3, m5
+%else
     paddw   m2, m4
     paddw   m3, m5
+%endif
     mova    [v1q + orderq], m2
     mova    [v1q + orderq + mmsize], m3
     add     orderq, mmsize*2
@@ -64,9 +80,12 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, 
order, mul
 %endmacro
 
 INIT_MMX mmxext
-SCALARPRODUCT
+SCALARPRODUCT 16
 INIT_XMM sse2
-SCALARPRODUCT
+SCALARPRODUCT 16
+
+INIT_XMM sse4
+SCALARPRODUCT 32
 
 %macro SCALARPRODUCT_LOOP 1
 align 16
diff --git a/libavcodec/x86/lossless_audiodsp_init.c 
b/libavcodec/x86/lossless_audiodsp_init.c
index 197173c..85306cb 100644
--- a/libavcodec/x86/lossless_audiodsp_init.c
+++ b/libavcodec/x86/lossless_audiodsp_init.c
@@ -31,6 +31,10 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, 
const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
 
+int32_t ff_scalarproduct_and_madd_int32_sse4(int32_t *v1, const int32_t *v2,
+                                             const int32_t *v3,
+                                             int order, int mul);
+
 av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
 {
 #if HAVE_YASM
@@ -45,5 +49,8 @@ av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
     if (EXTERNAL_SSSE3(cpu_flags) &&
         !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
         c->scalarproduct_and_madd_int16 = 
ff_scalarproduct_and_madd_int16_ssse3;
+
+    if (EXTERNAL_SSE4(cpu_flags))
+        c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
 #endif
 }
-- 
2.8.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to