Around 1.1 times faster and reduces runtime by up to 6%.
---
 libavcodec/x86/flac_dsp_gpl.asm | 91 ++++++++++++++++++++++++++++++++---------
 1 file changed, 72 insertions(+), 19 deletions(-)
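For context: flac_enc_lpc_32 computes the LPC residual for each sample.
A minimal scalar sketch of the computation the vector code implements,
following the operand comments in the asm (an illustration, not
FFmpeg's exact C fallback):

    #include <stdint.h>

    static void flac_enc_lpc_32_ref(int32_t *res, const int32_t *smp,
                                    int len, int order,
                                    const int32_t *coefs, int shift)
    {
        for (int i = order; i < len; i++) {
            int64_t p = 0;
            for (int j = 0; j < order; j++)
                p += (int64_t)coefs[j] * smp[i - j - 1]; /* p += c * s  */
            p >>= shift;                                 /* p >>= shift */
            if (p > INT32_MAX) p = INT32_MAX;            /* clip qword  */
            if (p < INT32_MIN) p = INT32_MIN;            /* to int32    */
            res[i] = smp[i] - (int32_t)p;                /* residual    */
        }
    }

With SSE (mmsize = 16) each .looplen iteration now produces 12 residuals
(three 4-dword vectors) instead of 6, split into the two phases below.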
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index 952fc8b86b..91989ce560 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -152,13 +152,13 @@ RET
 %macro FUNCTION_BODY_32 0
 
 %if ARCH_X86_64
-cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
+cglobal flac_enc_lpc_32, 5, 7, 8, mmsize*4, res, smp, len, order, coefs
     DECLARE_REG_TMP 5, 6
     %define length r2d
 
     movsxd orderq, orderd
 %else
-cglobal flac_enc_lpc_32, 5, 6, 8, mmsize, res, smp, len, order, coefs
+cglobal flac_enc_lpc_32, 5, 6, 8, mmsize*4, res, smp, len, order, coefs
     DECLARE_REG_TMP 2, 5
     %define length r2mp
 %endif
@@ -189,18 +189,23 @@ mova [rsp], m4 ; save sign extend mask
 %define negj t1q
 
 .looplen:
+    ; process "odd" samples
     pxor m0, m0
     pxor m4, m4
     pxor m6, m6
     mov posj, orderq
     xor negj, negj
 
-.looporder:
+.looporder1:
     movd m2, [coefsq+posj*4] ; c = coefs[j]
     SPLATD m2
-    pmovzxdq m1, [smpq+negj*4-4] ; s = smp[i-j-1]
-    pmovzxdq m5, [smpq+negj*4-4+mmsize/2]
-    pmovzxdq m7, [smpq+negj*4-4+mmsize]
+    movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+    movu m5, [smpq+negj*4-4+mmsize]
+    movu m7, [smpq+negj*4-4+mmsize*2]
+    ; Rather than explicitly unpacking adjacent samples into qwords, we can
+    ; let the pmuldq instruction unpack the 0th and 2nd samples for us when
+    ; it does its multiply. This saves an unpack for every sample in the
+    ; inner loop, which should make it (much) quicker.
     pmuldq m1, m2
     pmuldq m5, m2
     pmuldq m7, m2
@@ -210,7 +215,7 @@ mova [rsp], m4 ; save sign extend mask
 
     dec negj
     inc posj
-    jnz .looporder
+    jnz .looporder1
 
     HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
     HACK_PSRAQ m4, m3, [rsp], m2
@@ -218,22 +223,70 @@ mova [rsp], m4 ; save sign extend mask
     CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
     CLIPQ m4, [pq_int_min], [pq_int_max], m2
     CLIPQ m6, [pq_int_min], [pq_int_max], m2
-    pshufd m0, m0, q0020 ; pack into first 2 dwords
-    pshufd m4, m4, q0020
-    pshufd m6, m6, q0020
-    movh m1, [smpq]
-    movh m5, [smpq+mmsize/2]
-    movh m7, [smpq+mmsize]
+    movu m1, [smpq]
+    movu m5, [smpq+mmsize]
+    movu m7, [smpq+mmsize*2]
     psubd m1, m0 ; smp[i] - p
     psubd m5, m4
     psubd m7, m6
-    movh [resq], m1 ; res[i] = smp[i] - (p >> shift)
-    movh [resq+mmsize/2], m5
-    movh [resq+mmsize], m7
+    mova [rsp+mmsize], m1 ; res[i] = smp[i] - (p >> shift)
+    mova [rsp+mmsize*2], m5
+    mova [rsp+mmsize*3], m7
+
+    ; process "even" samples
+    pxor m0, m0
+    pxor m4, m4
+    pxor m6, m6
+    mov posj, orderq
+    xor negj, negj
+
+.looporder2:
+    movd m2, [coefsq+posj*4] ; c = coefs[j]
+    SPLATD m2
+    movu m1, [smpq+negj*4] ; s = smp[i-j-1]
+    movu m5, [smpq+negj*4+mmsize]
+    movu m7, [smpq+negj*4+mmsize*2]
+    pmuldq m1, m2
+    pmuldq m5, m2
+    pmuldq m7, m2
+    paddq m0, m1 ; p += c * s
+    paddq m4, m5
+    paddq m6, m7
+
+    dec negj
+    inc posj
+    jnz .looporder2
+
+    HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
+    HACK_PSRAQ m4, m3, [rsp], m2
+    HACK_PSRAQ m6, m3, [rsp], m2
+    CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
+    CLIPQ m4, [pq_int_min], [pq_int_max], m2
+    CLIPQ m6, [pq_int_min], [pq_int_max], m2
+    movu m1, [smpq+4]
+    movu m5, [smpq+4+mmsize]
+    movu m7, [smpq+4+mmsize*2]
+    psubd m1, m0 ; smp[i] - p
+    psubd m5, m4
+    psubd m7, m6
+
+    ; interleave odd and even samples
+    pslldq m1, 4
+    pslldq m5, 4
+    pslldq m7, 4
+
+    pblendw m1, [rsp+mmsize], q0303
+    pblendw m5, [rsp+mmsize*2], q0303
+    pblendw m7, [rsp+mmsize*3], q0303
+
+    movu [resq], m1
+    movu [resq+mmsize], m5
+    movu [resq+mmsize*2], m7
+
+    add resq, 3*mmsize
+    add smpq, 3*mmsize
+    sub length, (3*mmsize)/4
 
-    add resq, (3*mmsize)/2
-    add smpq, (3*mmsize)/2
-    sub length, (3*mmsize)/8
     jg .looplen
 RET
 
--
2.15.0
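The core of the change: pmuldq reads only dwords 0 and 2 of each source
operand and sign-extends them to qwords itself, so a plain movu load
followed by pmuldq covers the 0th/2nd samples with no pmovzxdq, a second
pass over the data offset by one dword covers the 1st/3rd samples, and
pslldq+pblendw re-interleave the two results. A standalone SSE4.1
intrinsics sketch of that dataflow (illustrative only, not the patch's
code; one vector of four samples and a single coefficient; build with
-msse4.1):

    #include <stdint.h>
    #include <stdio.h>
    #include <smmintrin.h> /* SSE4.1: pmuldq, pblendw */

    int main(void)
    {
        /* Four consecutive 32-bit samples and a broadcast coefficient. */
        __m128i smp  = _mm_setr_epi32(10, -20, 30, -40);
        __m128i coef = _mm_set1_epi32(3);

        /* pmuldq multiplies only dwords 0 and 2, sign-extending them to
         * qwords internally -- no explicit unpack needed beforehand. */
        __m128i p02 = _mm_mul_epi32(smp, coef);
        __m128i p13 = _mm_mul_epi32(_mm_srli_si128(smp, 4), coef);

        /* The low dword of each qword product sits in dwords 0 and 2,
         * which is where the asm's shift/clip/psubd leave their 32-bit
         * results.  Re-interleave: shift one phase up a dword (pslldq),
         * then blend with mask 0x33 (the asm's q0303), which keeps
         * dwords 0 and 2 of the other phase (pblendw). */
        __m128i res = _mm_blend_epi16(_mm_slli_si128(p13, 4), p02, 0x33);

        int32_t out[4];
        _mm_storeu_si128((__m128i *)out, res);
        printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
        /* prints: 30 -60 90 -120, i.e. samples back in order */
        return 0;
    }

This also explains why the old code's pmovzxdq (zero-extension) was
safe to drop: pmuldq never looked at the extended upper dwords anyway.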