On Fri, Nov 30, 2012 at 6:58 AM, Christophe Gisquet
<[email protected]> wrote:
> 497 to 253 cycles under Win64.
> Replacing the multiplication by s_m[m] by an andps and an xorps with
> appropriate vectors is slower. Unrolling is a 15 cycles win.
> ---
>  libavcodec/sbrdsp.c          |    1 -
>  libavcodec/x86/sbrdsp.asm    |   93 ++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/sbrdsp_init.c |   16 +++++++
>  3 files changed, 109 insertions(+), 1 deletions(-)
>
> diff --git a/libavcodec/sbrdsp.c b/libavcodec/sbrdsp.c
> index 781ec83..d0a0b93 100644
> --- a/libavcodec/sbrdsp.c
> +++ b/libavcodec/sbrdsp.c
> @@ -175,7 +175,6 @@ static av_always_inline void sbr_hf_apply_noise(float (*Y)[2],
>                                                  int m_max)
>  {
>      int m;
> -
>      for (m = 0; m < m_max; m++) {
>          float y0 = Y[m][0];
>          float y1 = Y[m][1];
> diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
> index cfbd6e8..608dee6 100644
> --- a/libavcodec/x86/sbrdsp.asm
> +++ b/libavcodec/x86/sbrdsp.asm
> @@ -26,6 +26,12 @@ SECTION_RODATA
>  ps_mask         times 2 dd 1<<31, 0
>  ps_mask2        times 2 dd 0, 1<<31
>  ps_neg          times 4 dd 1<<31
> +ps_noise0       times 2 dd  1.0,  0.0,
> +ps_noise2       times 2 dd -1.0,  0.0
> +ps_noise13      dd  0.0,  1.0, 0.0, -1.0
> +                dd  0.0, -1.0, 0.0,  1.0
> +                dd  0.0,  1.0, 0.0, -1.0
> +cextern         sbr_noise_table
>
>  SECTION_TEXT
>
> @@ -318,3 +324,90 @@ cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
>    sub        cq, 2*mmsize
>    jge     .loop
>    REP_RET
> +
> +; r0q=Y   r1q=s_m   r2q=q_filt   r3q=noise  r4q=max_m
> +cglobal hf_apply_noise_main
> +  dec       r3q
> +  shl       r4q, 2
> +  lea       r0q, [r0q + 2*r4q]
> +  add       r1q, r4q
> +  add       r2q, r4q
> +  shl       r3q, 3
> +  xorps      m5, m5
> +  neg       r4q
> +.loop:
> +  add       r3q, 16
> +  and       r3q, 0x1ff<<3
> +  movh       m1, [r2q + r4q]
> +  movu       m3, [r3q + sbr_noise_table]
> +  movh       m2, [r2q + r4q + 8]
> +  add       r3q, 16
> +  and       r3q, 0x1ff<<3
> +  movu       m4, [r3q + sbr_noise_table]
> +  unpcklps   m1, m1
> +  unpcklps   m2, m2
> +  mulps      m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
> +  mulps      m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
> +  movh       m3, [r1q + r4q]
> +  movh       m4, [r1q + r4q + 8]
> +  unpcklps   m3, m3
> +  unpcklps   m4, m4
> +  mova       m6, m3
> +  mova       m7, m4
> +  mulps      m3, m0 ; s_m[m] * phi_sign
> +  mulps      m4, m0 ; s_m[m] * phi_sign
> +  cmpps      m6, m5, 0 ; m6 = mask (s_m[m] == 0)
> +  cmpps      m7, m5, 0 ; m7 = mask (s_m[m] == 0)
> +  andps      m1, m6
> +  andps      m2, m7
> +  movu       m6, [r0q + 2*r4q]
> +  movu       m7, [r0q + 2*r4q + 16]
> +  addps      m6, m1
> +  addps      m7, m2
> +  addps      m6, m3
> +  addps      m7, m4

Maybe add m1/m2 to m3/m4 first, and only then add the result to m6/m7, to better hide the latency of the memory load?

Jason
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to