From 1.3 to 2.5 times faster. Runtime reduced by 4 to 58%. As with the 16-bit version, the speed-up generally increases with compression_level.
Also like the 16-bit version, it is not used with levels less than 3. --- libavcodec/x86/flac_dsp_gpl.asm | 101 +++++++++++++++++++++++++++++++++++++++ libavcodec/x86/flacdsp_init.c | 5 ++ 2 files changed, 106 insertions(+), 0 deletions(-) diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm index cedf083..7a49fae 100644 --- a/libavcodec/x86/flac_dsp_gpl.asm +++ b/libavcodec/x86/flac_dsp_gpl.asm @@ -22,6 +22,12 @@ %include "libavutil/x86/x86util.asm" +SECTION_RODATA + +pd_0_int_min: times 2 dd 0, -2147483648 +pq_int_min: times 2 dq -2147483648 +pq_int_max: times 2 dq 2147483647 + SECTION_TEXT INIT_XMM sse4 @@ -99,3 +105,98 @@ neg orderq sub length, (3*mmsize)/4 jg .looplen RET + +%macro PMINSQ 3 + pcmpgtq %3, %2, %1 + pand %1, %3 + pandn %3, %2 + por %1, %3 +%endmacro + +%macro PMAXSQ 3 + pcmpgtq %3, %1, %2 + pand %1, %3 + pandn %3, %2 + por %1, %3 +%endmacro + +%macro CLIPQ 4 ; reg, min, max, tmp + PMAXSQ %1, %2, %4 + PMINSQ %1, %3, %4 +%endmacro + +%macro HACK_PSRAQ 4 ; dst, src (shift), sign extend mask, tmp + pxor %4, %4 ; zero + pcmpgtq %4, %1 ; mask where 0 > dst + pand %4, %3 ; mask & sign extend mask + psrlq %1, %2 ; dst >>= shift + por %1, %4 ; dst | mask +%endmacro + +INIT_XMM sse42 +%if ARCH_X86_64 + cglobal flac_enc_lpc_32, 5, 7, 4, mmsize, res, smp, len, order, coefs + DECLARE_REG_TMP 5, 6 + %define length r2d + + movsxd orderq, orderd +%else + cglobal flac_enc_lpc_32, 5, 6, 4, mmsize, res, smp, len, order, coefs + DECLARE_REG_TMP 2, 5 + %define length r2mp +%endif + +; Here we assume that the maximum order value is 32. This means that we only +; need to copy a maximum of 32 samples. Therefore we let the preprocessor +; unroll this loop and copy all 32. 
+%assign iter 0 +%rep 32/(mmsize/4) + movu m0, [smpq+iter] + movu [resq+iter], m0 + %assign iter iter+mmsize +%endrep + +lea resq, [resq+orderq*4] +lea smpq, [smpq+orderq*4] +lea coefsq, [coefsq+orderq*4] +sub length, orderd +movd m3, r5m +neg orderq + +movu m4, [pd_0_int_min] ; load 1 bit +psrad m4, m3 ; turn that into shift+1 bits +pslld m4, 1 ; reduce that +mova [rsp], m4 ; save sign extend mask + +%define posj t0q +%define negj t1q + +.looplen: + pxor m0, m0 + mov posj, orderq + xor negj, negj + + .looporder: + movd m2, [coefsq+posj*4] ; c = coefs[j] + SPLATD m2 + movh m1, [smpq+negj*4-4] ; s = smp[i-j-1] + pshufd m1, m1, q3130 + pmuldq m1, m2 + paddq m0, m1 ; p += c * s + + dec negj + inc posj + jnz .looporder + + HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift + CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift) + pshufd m0, m0, q0020 ; pack into first 2 dwords + movh m1, [smpq] + psubd m1, m0 ; smp[i] - p + movh [resq], m1 ; res[i] = smp[i] - (p >> shift) + + add resq, mmsize/2 + add smpq, mmsize/2 + sub length, mmsize/8 +jg .looplen +RET diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c index ad88e5b..976ea2c 100644 --- a/libavcodec/x86/flacdsp_init.c +++ b/libavcodec/x86/flacdsp_init.c @@ -28,6 +28,7 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order, int qlevel, int len); void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int); +void ff_flac_enc_lpc_32_sse42(int32_t *, const int32_t *, int, int, const int32_t *,int); av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int bps) @@ -45,5 +46,9 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, if (bps > 16 && CONFIG_FLAC_DECODER) c->lpc = ff_flac_lpc_32_xop; } + if (EXTERNAL_SSE42(cpu_flags)) { + if (bps > 16 && CONFIG_FLAC_ENCODER && CONFIG_GPL) + c->lpc_encode = ff_flac_enc_lpc_32_sse42; + } #endif } -- 1.7.9 _______________________________________________ 
ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel