On Fri, 30 Nov 2012, Christophe Gisquet wrote:

> 497 to 253 cycles under Win64.

cpu is more relevant than os.

> +; r0q=Y r1q=s_m r2q=q_filt r3q=noise r4q=max_m
> +cglobal hf_apply_noise_main

You can invoke DEFINE_ARGS even if not generating a prologue.

> +    dec       r3q
> +    shl       r4q, 2
> +    lea       r0q, [r0q + 2*r4q]
> +    add       r1q, r4q
> +    add       r2q, r4q
> +    shl       r3q, 3
> +    xorps     m5, m5
> +    neg       r4q
> +.loop:
> +    add       r3q, 16
> +    and       r3q, 0x1ff<<3
> +    movh      m1, [r2q + r4q]
> +    movu      m3, [r3q + sbr_noise_table]
> +    movh      m2, [r2q + r4q + 8]
> +    add       r3q, 16
> +    and       r3q, 0x1ff<<3
> +    movu      m4, [r3q + sbr_noise_table]
> +    unpcklps  m1, m1
> +    unpcklps  m2, m2
> +    mulps     m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
> +    mulps     m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
> +    movh      m3, [r1q + r4q]
> +    movh      m4, [r1q + r4q + 8]

Can these be a single aligned load?

> +    unpcklps  m3, m3
> +    unpcklps  m4, m4
> +    mova      m6, m3
> +    mova      m7, m4
> +    mulps     m3, m0 ; s_m[m] * phi_sign
> +    mulps     m4, m0 ; s_m[m] * phi_sign
> +    cmpps     m6, m5, 0 ; m1 == 0
> +    cmpps     m7, m5, 0 ; m1 == 0

You mean m7 == 0?

> +    andps     m1, m6
> +    andps     m2, m7
> +    movu      m6, [r0q + 2*r4q]
> +    movu      m7, [r0q + 2*r4q + 16]
> +    addps     m6, m1
> +    addps     m7, m2
> +    addps     m6, m3
> +    addps     m7, m4
> +    movu      [r0q + 2*r4q], m6
> +    movu      [r0q + 2*r4q + 16], m7
> +    add       r4q, 16
> +    jl        .loop
> +    ret
> +
> +; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
> +;                      const float *q_filt, int noise,
> +;                      int kx, int m_max)
> +cglobal sbr_hf_apply_noise_0, 4,5,8, Y,s_m,q_filt,noise,kx,m_max
> +    mova      m0, [ps_noise0]
> +    mov       r4d, m_maxm
> +    call      hf_apply_noise_main
> +    RET

TAIL_CALL hf_apply_noise_main, 1

--Loren Merritt
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel
