From 497 to 253 cycles under Win64.
Replacing the multiplication by s_m[m] with an andps and an xorps using
appropriate vectors is slower. Unrolling is a 15-cycle win.
---
 libavcodec/sbrdsp.c          |    1 -
 libavcodec/x86/sbrdsp.asm    |   93 ++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/sbrdsp_init.c |   16 +++++++
 3 files changed, 109 insertions(+), 1 deletions(-)

diff --git a/libavcodec/sbrdsp.c b/libavcodec/sbrdsp.c
index 781ec83..d0a0b93 100644
--- a/libavcodec/sbrdsp.c
+++ b/libavcodec/sbrdsp.c
@@ -175,7 +175,6 @@ static av_always_inline void sbr_hf_apply_noise(float 
(*Y)[2],
                                                 int m_max)
 {
     int m;
-
     for (m = 0; m < m_max; m++) {
         float y0 = Y[m][0];
         float y1 = Y[m][1];
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index cfbd6e8..608dee6 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -26,6 +26,12 @@ SECTION_RODATA
 ps_mask         times 2 dd 1<<31, 0
 ps_mask2        times 2 dd 0, 1<<31
 ps_neg          times 4 dd 1<<31
+ps_noise0       times 2 dd  1.0,  0.0    ; (1, 0) x2: phi_sign pattern loaded by sbr_hf_apply_noise_0
+ps_noise2       times 2 dd -1.0,  0.0    ; (-1, 0) x2: phi_sign pattern loaded by sbr_hf_apply_noise_2
+ps_noise13      dd  0.0,  1.0, 0.0, -1.0 ; three 16-byte rows with alternating sign;
+                dd  0.0, -1.0, 0.0,  1.0 ; sbr_hf_apply_noise_1 loads row (kx&1),
+                dd  0.0,  1.0, 0.0, -1.0 ; sbr_hf_apply_noise_3 loads row (kx&1)+1
+cextern         sbr_noise_table
 
 SECTION_TEXT
 
@@ -318,3 +324,90 @@ cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
   sub        cq, 2*mmsize
   jge     .loop
   REP_RET
+
+; shared loop, in: r0q=Y  r1q=s_m  r2q=q_filt  r3q=noise  r4q=m_max  m0=phi_sign pattern (set by caller)
+cglobal hf_apply_noise_main
+  dec       r3q                  ; with the shl below: r3q = (noise-1)*8, so the first +16 lands on entry noise+1
+  shl       r4q, 2               ; r4q = m_max * 4 = byte length of s_m / q_filt
+  lea       r0q, [r0q + 2*r4q]   ; r0q = end of Y (8 bytes per m)
+  add       r1q, r4q             ; r1q = end of s_m
+  add       r2q, r4q             ; r2q = end of q_filt
+  shl       r3q, 3               ; noise index -> byte offset (8 bytes per table entry)
+  xorps      m5, m5              ; m5 = 0.0, reference for the s_m[m] == 0 test
+  neg       r4q                  ; negative byte index, counts up towards 0
+.loop:                           ; each pass handles 4 values of m; NOTE(review): a tail of m_max % 4 is not handled separately - confirm buffers allow the overrun
+  add       r3q, 16              ; advance two table entries
+  and       r3q, 0x1ff<<3        ; wrap the byte offset (0x1ff entries mask, scaled by 8)
+  movh       m1, [r2q + r4q]     ; q_filt[m], q_filt[m+1]
+  movu       m3, [r3q + sbr_noise_table] ; two noise entries; NOTE(review): at offset 0x1ff<<3 this 16-byte load extends 8 bytes past the masked range - confirm; absolute+index form is also not position-independent on x86-64
+  movh       m2, [r2q + r4q + 8] ; q_filt[m+2], q_filt[m+3]
+  add       r3q, 16              ; next two table entries
+  and       r3q, 0x1ff<<3
+  movu       m4, [r3q + sbr_noise_table]
+  unpcklps   m1, m1              ; duplicate each q_filt value across its (re, im) pair
+  unpcklps   m2, m2
+  mulps      m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]   (m, m+1)
+  mulps      m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]   (m+2, m+3)
+  movh       m3, [r1q + r4q]     ; s_m[m], s_m[m+1]
+  movh       m4, [r1q + r4q + 8] ; s_m[m+2], s_m[m+3]
+  unpcklps   m3, m3              ; duplicate each s_m value across its (re, im) pair
+  unpcklps   m4, m4
+  mova       m6, m3              ; keep unscaled s_m copies for the zero test
+  mova       m7, m4
+  mulps      m3, m0 ; s_m[m] * phi_sign
+  mulps      m4, m0 ; s_m[m] * phi_sign
+  cmpps      m6, m5, 0 ; predicate 0 = equal: mask is all-ones where s_m[m] == 0
+  cmpps      m7, m5, 0 ; same mask for m+2, m+3
+  andps      m1, m6              ; keep the noise term only where s_m[m] == 0
+  andps      m2, m7
+  movu       m6, [r0q + 2*r4q]   ; Y[m], Y[m+1]
+  movu       m7, [r0q + 2*r4q + 16] ; Y[m+2], Y[m+3]
+  addps      m6, m1              ; Y += masked noise term
+  addps      m7, m2
+  addps      m6, m3              ; Y += s_m[m] * phi_sign (zero where the noise term was kept)
+  addps      m7, m4
+  movu    [r0q + 2*r4q], m6
+  movu    [r0q + 2*r4q + 16], m7
+  add       r4q, 16              ; 4 floats of s_m / q_filt consumed
+  jl      .loop                  ; loop while the index is still negative
+  ret
+
+; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_0, 4,5,8, Y,s_m,q_filt,noise,kx,m_max
+  mova       m0, [ps_noise0]     ; m0 = phi_sign pattern (1.0, 0.0) x2; kx is not read by this variant
+  mov       r4d, m_maxm          ; r4 = m_max, the register hf_apply_noise_main takes it in
+  call      hf_apply_noise_main  ; shared loop updates Y in place
+  RET
+
+; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_1, 5,5,8, Y,s_m,q_filt,noise,kx,m_max
+  and       kxq, 1               ; parity of kx
+  shl       kxq, 4               ; * 16 = byte offset of one 4-float row
+  mova       m0, [kxq + ps_noise13] ; row 0 (0, 1, 0, -1) for even kx, row 1 (0, -1, 0, 1) for odd kx
+  mov       r4d, m_maxm          ; r4 = m_max, the register hf_apply_noise_main takes it in (kx no longer needed)
+  call  hf_apply_noise_main      ; shared loop updates Y in place
+  RET
+
+; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_2, 4,5,8, Y,s_m,q_filt,noise,kx,m_max
+  mova       m0, [ps_noise2]     ; m0 = phi_sign pattern (-1.0, 0.0) x2; kx is not read by this variant
+  mov       r4d, m_maxm          ; r4 = m_max, the register hf_apply_noise_main takes it in
+  call  hf_apply_noise_main      ; shared loop updates Y in place
+  RET
+
+; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_3, 5,5,8, Y,s_m,q_filt,noise,kx,m_max
+  and       kxq, 1               ; parity of kx
+  shl       kxq, 4               ; * 16 = byte offset of one 4-float row
+  mova       m0, [kxq + ps_noise13 + 16] ; row 1 (0, -1, 0, 1) for even kx, row 2 (0, 1, 0, -1) for odd kx
+  mov       r4d, m_maxm          ; r4 = m_max, the register hf_apply_noise_main takes it in (kx no longer needed)
+  call  hf_apply_noise_main      ; shared loop updates Y in place
+  RET
diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c
index 5e3e131..9759314 100644
--- a/libavcodec/x86/sbrdsp_init.c
+++ b/libavcodec/x86/sbrdsp_init.c
@@ -36,6 +36,18 @@ void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float 
*z);
 void ff_sbr_qmf_pre_shuffle_sse(float *z);
 void ff_sbr_qmf_deint_neg_sse(float *v, const float *src);
 void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
+void ff_sbr_hf_apply_noise_0_sse(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_sse(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_sse(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_sse(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max);
 
 void ff_sbrdsp_init_x86(SBRDSPContext *s)
 {
@@ -51,5 +63,9 @@ void ff_sbrdsp_init_x86(SBRDSPContext *s)
         s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse;
         s->qmf_deint_neg = ff_sbr_qmf_deint_neg_sse;
         s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse;
+        s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse;
+        s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse;
+        s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse;
+        s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse;
     }
 }
-- 
1.7.7.msysgit.0

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to