The only user so far is wmalossless with 24-bit samples. The few samples tested show an order of 8, so more unrolling or an AVX2 version does not make sense.
Timings: 72 -> 49 cycles --- libavcodec/x86/lossless_audiodsp.asm | 31 +++++++++++++++++++++++++------ libavcodec/x86/lossless_audiodsp_init.c | 7 +++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/libavcodec/x86/lossless_audiodsp.asm b/libavcodec/x86/lossless_audiodsp.asm index 5597dad..d00869b 100644 --- a/libavcodec/x86/lossless_audiodsp.asm +++ b/libavcodec/x86/lossless_audiodsp.asm @@ -22,13 +22,17 @@ SECTION .text -%macro SCALARPRODUCT 0 +%macro SCALARPRODUCT 1 ; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, ; int order, int mul) -cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul - shl orderq, 1 +; int ff_scalarproduct_and_madd_int32(int32_t *v1, int32_t *v2, int32_t *v3, +; int order, int mul) +cglobal scalarproduct_and_madd_int %+ %1, 4,4,8, v1, v2, v3, order, mul + shl orderq, (%1/16) movd m7, mulm -%if mmsize == 16 +%if %1 == 32 + SPLATD m7 +%elif mmsize == 16 pshuflw m7, m7, 0 punpcklqdq m7, m7 %else @@ -46,14 +50,26 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul mova m5, [v1q + orderq + mmsize] movu m2, [v3q + orderq] movu m3, [v3q + orderq + mmsize] +%if %1 == 32 + pmulld m0, m4 + pmulld m1, m5 + pmulld m2, m7 + pmulld m3, m7 +%else pmaddwd m0, m4 pmaddwd m1, m5 pmullw m2, m7 pmullw m3, m7 +%endif paddd m6, m0 paddd m6, m1 +%if %1 == 32 + paddd m2, m4 + paddd m3, m5 +%else paddw m2, m4 paddw m3, m5 +%endif mova [v1q + orderq], m2 mova [v1q + orderq + mmsize], m3 add orderq, mmsize*2 @@ -64,9 +80,12 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul %endmacro INIT_MMX mmxext -SCALARPRODUCT +SCALARPRODUCT 16 INIT_XMM sse2 -SCALARPRODUCT +SCALARPRODUCT 16 + +INIT_XMM sse4 +SCALARPRODUCT 32 %macro SCALARPRODUCT_LOOP 1 align 16 diff --git a/libavcodec/x86/lossless_audiodsp_init.c b/libavcodec/x86/lossless_audiodsp_init.c index 197173c..85306cb 100644 --- a/libavcodec/x86/lossless_audiodsp_init.c +++ b/libavcodec/x86/lossless_audiodsp_init.c @@ 
-31,6 +31,10 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); +int32_t ff_scalarproduct_and_madd_int32_sse4(int32_t *v1, const int32_t *v2, + const int32_t *v3, + int order, int mul); + av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c) { #if HAVE_YASM @@ -45,5 +49,8 @@ av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c) if (EXTERNAL_SSSE3(cpu_flags) && !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; + + if (EXTERNAL_SSE4(cpu_flags)) + c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4; #endif } -- 2.8.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel