On Fri, Nov 30, 2012 at 6:58 AM, Christophe Gisquet <[email protected]> wrote: > 497 to 253 cycles under Win64. > Replacing the multiplication by s_m[m] by an andps and an xorps with > appropriate vectors is slower. Unrolling is a 15 cycles win. > --- > libavcodec/sbrdsp.c | 1 - > libavcodec/x86/sbrdsp.asm | 93 > ++++++++++++++++++++++++++++++++++++++++++ > libavcodec/x86/sbrdsp_init.c | 16 +++++++ > 3 files changed, 109 insertions(+), 1 deletions(-) > > diff --git a/libavcodec/sbrdsp.c b/libavcodec/sbrdsp.c > index 781ec83..d0a0b93 100644 > --- a/libavcodec/sbrdsp.c > +++ b/libavcodec/sbrdsp.c > @@ -175,7 +175,6 @@ static av_always_inline void sbr_hf_apply_noise(float > (*Y)[2], > int m_max) > { > int m; > - > for (m = 0; m < m_max; m++) { > float y0 = Y[m][0]; > float y1 = Y[m][1]; > diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm > index cfbd6e8..608dee6 100644 > --- a/libavcodec/x86/sbrdsp.asm > +++ b/libavcodec/x86/sbrdsp.asm > @@ -26,6 +26,12 @@ SECTION_RODATA > ps_mask times 2 dd 1<<31, 0 > ps_mask2 times 2 dd 0, 1<<31 > ps_neg times 4 dd 1<<31 > +ps_noise0 times 2 dd 1.0, 0.0, > +ps_noise2 times 2 dd -1.0, 0.0 > +ps_noise13 dd 0.0, 1.0, 0.0, -1.0 > + dd 0.0, -1.0, 0.0, 1.0 > + dd 0.0, 1.0, 0.0, -1.0 > +cextern sbr_noise_table > > SECTION_TEXT > > @@ -318,3 +324,90 @@ cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c > sub cq, 2*mmsize > jge .loop > REP_RET > + > +; r0q=Y r1q=s_m r2q=q_filt r3q=noise r4q=max_m > +cglobal hf_apply_noise_main > + dec r3q > + shl r4q, 2 > + lea r0q, [r0q + 2*r4q] > + add r1q, r4q > + add r2q, r4q > + shl r3q, 3 > + xorps m5, m5 > + neg r4q > +.loop: > + add r3q, 16 > + and r3q, 0x1ff<<3 > + movh m1, [r2q + r4q] > + movu m3, [r3q + sbr_noise_table] > + movh m2, [r2q + r4q + 8] > + add r3q, 16 > + and r3q, 0x1ff<<3 > + movu m4, [r3q + sbr_noise_table] > + unpcklps m1, m1 > + unpcklps m2, m2 > + mulps m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise] > + mulps m2, m4 ; m2 = q_filt[m] * 
ff_sbr_noise_table[noise] > + movh m3, [r1q + r4q] > + movh m4, [r1q + r4q + 8] > + unpcklps m3, m3 > + unpcklps m4, m4 > + mova m6, m3 > + mova m7, m4 > + mulps m3, m0 ; s_m[m] * phi_sign > + mulps m4, m0 ; s_m[m] * phi_sign > + cmpps m6, m5, 0 ; m1 == 0 > + cmpps m7, m5, 0 ; m1 == 0 > + andps m1, m6 > + andps m2, m7 > + movu m6, [r0q + 2*r4q] > + movu m7, [r0q + 2*r4q + 16] > + addps m6, m1 > + addps m7, m2 > + addps m6, m3 > + addps m7, m4
Maybe add m1/m2 to m3/m4 before adding them to m6/m7, to better hide the memory load? Jason _______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
