497 to 253 cycles under Win64.
Replacing the multiplication by s_m[m] with an andps and an xorps using
appropriate vectors is slower. Unrolling is a 15-cycle win.
---
libavcodec/sbrdsp.c | 1 -
libavcodec/x86/sbrdsp.asm | 93 ++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/sbrdsp_init.c | 16 +++++++
3 files changed, 109 insertions(+), 1 deletions(-)
diff --git a/libavcodec/sbrdsp.c b/libavcodec/sbrdsp.c
index 781ec83..d0a0b93 100644
--- a/libavcodec/sbrdsp.c
+++ b/libavcodec/sbrdsp.c
@@ -175,7 +175,6 @@ static av_always_inline void sbr_hf_apply_noise(float
(*Y)[2],
int m_max)
{
int m;
-
for (m = 0; m < m_max; m++) {
float y0 = Y[m][0];
float y1 = Y[m][1];
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index cfbd6e8..608dee6 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -26,6 +26,12 @@ SECTION_RODATA
ps_mask times 2 dd 1<<31, 0
ps_mask2 times 2 dd 0, 1<<31
ps_neg times 4 dd 1<<31
+ps_noise0 times 2 dd 1.0, 0.0 ; phi_sign vector loaded by sbr_hf_apply_noise_0
+ps_noise2 times 2 dd -1.0, 0.0 ; phi_sign vector loaded by sbr_hf_apply_noise_2
+ps_noise13 dd 0.0, 1.0, 0.0, -1.0 ; row 0: sbr_hf_apply_noise_1, kx even
+ dd 0.0, -1.0, 0.0, 1.0 ; row 1: noise_1 with kx odd, noise_3 with kx even
+ dd 0.0, 1.0, 0.0, -1.0 ; row 2: sbr_hf_apply_noise_3, kx odd
+cextern sbr_noise_table
SECTION_TEXT
@@ -318,3 +324,90 @@ cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
sub cq, 2*mmsize
jge .loop
REP_RET
+
+; in: r0q=Y r1q=s_m r2q=q_filt r3q=noise r4q=m_max, m0=phi_sign vector set by the caller; handles 4 elements per iteration, at least once
+cglobal hf_apply_noise_main
+ dec r3q ; noise - 1, so the first +16 byte step below lands on entry noise + 1
+ shl r4q, 2 ; m_max * 4 = byte length of s_m / q_filt
+ lea r0q, [r0q + 2*r4q] ; end of Y (8 bytes per element)
+ add r1q, r4q ; end of s_m
+ add r2q, r4q ; end of q_filt
+ shl r3q, 3 ; noise index -> byte offset (x8)
+ xorps m5, m5 ; m5 = 0.0, reference for the s_m == 0 compare
+ neg r4q ; negative byte offset, counts up towards 0
+.loop:
+ add r3q, 16 ; advance by two noise entries
+ and r3q, 0x1ff<<3 ; wrap the index to 0..511
+ movh m1, [r2q + r4q] ; q_filt[m], q_filt[m+1]
+ movu m3, [r3q + sbr_noise_table] ; 16 bytes = two noise entries; NOTE(review): at offset 511*8 this reads 8 bytes past entry 511 - confirm table size/padding
+ movh m2, [r2q + r4q + 8] ; q_filt[m+2], q_filt[m+3]
+ add r3q, 16 ; advance by two more noise entries
+ and r3q, 0x1ff<<3 ; wrap again
+ movu m4, [r3q + sbr_noise_table] ; next two noise entries
+ unpcklps m1, m1 ; duplicate each q_filt value: (q0, q0, q1, q1)
+ unpcklps m2, m2 ; (q2, q2, q3, q3)
+ mulps m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
+ mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
+ movh m3, [r1q + r4q] ; s_m[m], s_m[m+1]
+ movh m4, [r1q + r4q + 8] ; s_m[m+2], s_m[m+3]
+ unpcklps m3, m3 ; duplicate each s_m value: (s0, s0, s1, s1)
+ unpcklps m4, m4 ; (s2, s2, s3, s3)
+ mova m6, m3 ; keep a copy of s_m for the zero test
+ mova m7, m4 ; keep a copy of s_m for the zero test
+ mulps m3, m0 ; s_m[m] * phi_sign
+ mulps m4, m0 ; s_m[m] * phi_sign
+ cmpps m6, m5, 0 ; m6 = mask, all ones where s_m[m] == 0
+ cmpps m7, m5, 0 ; m7 = mask, all ones where s_m[m] == 0
+ andps m1, m6 ; keep the noise term only where s_m[m] == 0
+ andps m2, m7 ; keep the noise term only where s_m[m] == 0
+ movu m6, [r0q + 2*r4q] ; Y[m], Y[m+1]
+ movu m7, [r0q + 2*r4q + 16] ; Y[m+2], Y[m+3]
+ addps m6, m1 ; Y += masked noise term
+ addps m7, m2 ; Y += masked noise term
+ addps m6, m3 ; Y += s_m[m] * phi_sign
+ addps m7, m4 ; Y += s_m[m] * phi_sign
+ movu [r0q + 2*r4q], m6 ; store Y[m], Y[m+1]
+ movu [r0q + 2*r4q + 16], m7 ; store Y[m+2], Y[m+3]
+ add r4q, 16 ; next 4 elements (16 bytes of s_m / q_filt)
+ jl .loop ; continue while the offset is still negative
+ ret
+
+; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_0, 4,5,8, Y,s_m,q_filt,noise,kx,m_max
+ mova m0, [ps_noise0] ; phi_sign = (1.0, 0.0, 1.0, 0.0); kx is not read here
+ mov r4d, m_maxm ; m_max goes in r4, where hf_apply_noise_main expects it
+ call hf_apply_noise_main ; shared loop, Y/s_m/q_filt/noise already in r0-r3
+ RET
+
+; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_1, 5,5,8, Y,s_m,q_filt,noise,kx,m_max
+ and kxq, 1 ; kx parity
+ shl kxq, 4 ; x16: byte offset of one 16-byte row
+ mova m0, [kxq + ps_noise13] ; phi_sign = row 0 (kx even) or row 1 (kx odd) of ps_noise13
+ mov r4d, m_maxm ; m_max goes in r4 for hf_apply_noise_main; kx (5th argument) is no longer needed
+ call hf_apply_noise_main ; shared loop, Y/s_m/q_filt/noise already in r0-r3
+ RET
+
+; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_2, 4,5,8, Y,s_m,q_filt,noise,kx,m_max
+ mova m0, [ps_noise2] ; phi_sign = (-1.0, 0.0, -1.0, 0.0); kx is not read here
+ mov r4d, m_maxm ; m_max goes in r4, where hf_apply_noise_main expects it
+ call hf_apply_noise_main ; shared loop, Y/s_m/q_filt/noise already in r0-r3
+ RET
+
+; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_3, 5,5,8, Y,s_m,q_filt,noise,kx,m_max
+ and kxq, 1 ; kx parity
+ shl kxq, 4 ; x16: byte offset of one 16-byte row
+ mova m0, [kxq + ps_noise13 + 16] ; phi_sign = row 1 (kx even) or row 2 (kx odd) of ps_noise13
+ mov r4d, m_maxm ; m_max goes in r4 for hf_apply_noise_main; kx (5th argument) is no longer needed
+ call hf_apply_noise_main ; shared loop, Y/s_m/q_filt/noise already in r0-r3
+ RET
diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c
index 5e3e131..9759314 100644
--- a/libavcodec/x86/sbrdsp_init.c
+++ b/libavcodec/x86/sbrdsp_init.c
@@ -36,6 +36,18 @@ void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float
*z);
void ff_sbr_qmf_pre_shuffle_sse(float *z);
void ff_sbr_qmf_deint_neg_sse(float *v, const float *src);
void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
+void ff_sbr_hf_apply_noise_0_sse(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_sse(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_sse(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_sse(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
void ff_sbrdsp_init_x86(SBRDSPContext *s)
{
@@ -51,5 +63,9 @@ void ff_sbrdsp_init_x86(SBRDSPContext *s)
s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse;
s->qmf_deint_neg = ff_sbr_qmf_deint_neg_sse;
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse;
+ s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse;
+ s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse;
+ s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse;
+ s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse;
}
}
--
1.7.7.msysgit.0
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel