Hi,

The first patch is a preparatory patch for the second one.

The amount of shuffling in the second one is unsatisfactory, but I
don't see a way to reduce it without worsening other things.

An SSE4 version using dpps might bring some improvement, but I don't
think it is worth the effort.

Christophe
From 990efcdca4ecd2393478bf1d3f3b9fc5a9b98c1b Mon Sep 17 00:00:00 2001
From: Christophe GISQUET <[email protected]>
Date: Fri, 24 Feb 2012 22:15:33 +0100
Subject: [PATCH 3/6] SBR DSP: change some parameters type to intptr_t

---
 libavcodec/arm/sbrdsp_init_arm.c |    2 +-
 libavcodec/sbrdsp.c              |    2 +-
 libavcodec/sbrdsp.h              |    2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/arm/sbrdsp_init_arm.c b/libavcodec/arm/sbrdsp_init_arm.c
index 04294cc..e0f8edc 100644
--- a/libavcodec/arm/sbrdsp_init_arm.c
+++ b/libavcodec/arm/sbrdsp_init_arm.c
@@ -33,7 +33,7 @@ void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2],
                            const float *g_filt, int m_max, intptr_t ixh);
 void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2],
                         const float alpha0[2], const float alpha1[2],
-                        float bw, int start, int end);
+                        float bw, intptr_t start, intptr_t end);
 void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]);
 
 void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m,
diff --git a/libavcodec/sbrdsp.c b/libavcodec/sbrdsp.c
index f942759..d52b9df 100644
--- a/libavcodec/sbrdsp.c
+++ b/libavcodec/sbrdsp.c
@@ -124,7 +124,7 @@ static void sbr_autocorrelate_c(const float x[40][2], float phi[3][2][2])
 
 static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
                          const float alpha0[2], const float alpha1[2],
-                         float bw, int start, int end)
+                         float bw, intptr_t start, intptr_t end)
 {
     float alpha[4];
     int i;
diff --git a/libavcodec/sbrdsp.h b/libavcodec/sbrdsp.h
index fe91957..5950092 100644
--- a/libavcodec/sbrdsp.h
+++ b/libavcodec/sbrdsp.h
@@ -34,7 +34,7 @@ typedef struct SBRDSPContext {
     void (*autocorrelate)(const float x[40][2], float phi[3][2][2]);
     void (*hf_gen)(float (*X_high)[2], const float (*X_low)[2],
                    const float alpha0[2], const float alpha1[2],
-                   float bw, int start, int end);
+                   float bw, intptr_t start, intptr_t end);
     void (*hf_g_filt)(float (*Y)[2], const float (*X_high)[40][2],
                       const float *g_filt, int m_max, intptr_t ixh);
     void (*hf_apply_noise[4])(float (*Y)[2], const float *s_m,
-- 
1.7.7.6

From 7d6cf6f02af7588a052514c5f6030a3b575257fb Mon Sep 17 00:00:00 2001
From: Christophe GISQUET <[email protected]>
Date: Fri, 24 Feb 2012 22:11:19 +0100
Subject: [PATCH 4/6] SBR DSP x86: implement SSE sbr_hf_gen

Start and end indices are multiples of 2, which guarantees aligned access.
This also allows generating 4 floats per loop iteration while keeping the
alignment throughout.

Timing:
- 32 bits: 326c -> 172c
- 64 bits: 323c -> 156c
---
 libavcodec/x86/sbrdsp.asm    |   90 +++++++++++++++++++++++++++++++++++++++++-
 libavcodec/x86/sbrdsp_init.c |    4 ++
 2 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index c165c52..92c81d7 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -22,8 +22,10 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
-;SECTION_RODATA
-SECTION .text
+SECTION_RODATA
+ps_m1p1m1p1:    times 2 dd -1.0, 1.0 ; {-1,+1,-1,+1}: mulps by this negates lanes 0 and 2
+
+SECTION_TEXT
 
 INIT_XMM sse
 cglobal sbr_sum_square, 2, 3, 6
@@ -112,3 +114,87 @@ cglobal sbr_hf_g_filt, 5, 6, 5
     jnz         .loop1
 .end:
     RET
+
+;
+; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
+;                          const float alpha0[2], const float alpha1[2],
+;                          float bw, intptr_t start, intptr_t end)
+; Generates X_high[start..end-1], 2 complex values per iteration: requires even
+; start/end with start < end; mova assumes 16-byte aligned X_high/X_low.
+;
+cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1
+
+    ; load alpha factors first: r2/r3 (alpha0/alpha1) get reused for start/end
+%define bw m0
+%if ARCH_X86_64 == 0
+    movss       bw, [rsp + (5+1)*4]
+%else
+    ; First float in xmm0 for x86_64 abis except win64, thus:
+    ; bw already loaded in xmm0 except for win64 where still on stack
+%if WIN64
+    movss       bw, [rsp +  88]
+%endif
+%endif
+    movlps      m2, [alpha1q]      ; movlps: SSE1 64-bit load (movq xmm is SSE2)
+    movlps      m1, [alpha0q]      ; upper halves are never used below
+    shufps      bw, bw, 0
+    mulps       m2, bw             ; (a1[0] a1[1])*bw
+    mulps       m1, bw             ; (a0[0] a0[1])*bw    = (a2 a3)
+    mulps       m2, bw             ; (a1[0] a1[1])*bw*bw = (a0 a1)
+    mova        m3, m1
+    mova        m4, m2
+
+    ; Set pointers (clobbers alpha0q/alpha1q on x86_32 and WIN64)
+%if ARCH_X86_64 == 0
+    ; start and end 6th and 7th args on stack
+    mov         r2, [rsp + 28]
+    mov         r3, [rsp + 32]
+%xdefine  start r2
+%xdefine  end   r3
+%else
+%if WIN64
+    ; 2 last args are on stack; xmm6/xmm7 saved there too - account for offset
+    mov         r2, [rsp +  96]
+    mov         r3, [rsp + 104]
+%xdefine  start r2
+%xdefine  end   r3
+%else
+    ; 6 args in 6 regs
+%xdefine  start r8
+%xdefine  end   r9
+%endif
+%endif
+    sub         start, end          ; neg num of loops
+    lea         X_highq, [X_highq + end*2*4]
+    lea         X_lowq, [X_lowq + end*2*4 - 2*2*4]
+    shl         start, 3              ; offset from num loops
+
+    mova        m7, [ps_m1p1m1p1]
+    mova        m0, [X_lowq + start]
+    movlhps     m1, m1             ; (a2 a3 a2 a3)
+    movlhps     m2, m2             ; (a0 a1 a0 a1)
+    shufps      m3, m3, 00010001b  ; (a3 a2 a3 a2)
+    shufps      m4, m4, 00010001b  ; (a1 a0 a1 a0)
+    mulps       m3, m7             ; (-a3 a2 -a3 a2)
+    mulps       m4, m7             ; (-a1 a0 -a1 a0)
+.loop2:
+    mova        m5, m0
+    mova        m6, m0
+    shufps      m0, m0, 10100000b ; {Xl[-2][0] x2, Xl[-1][0] x2}
+    shufps      m5, m5, 11110101b ; {Xl[-2][1] x2, Xl[-1][1] x2}
+    mulps       m0, m2
+    mulps       m5, m4
+    mova        m7, m6
+    addps       m5, m0
+    mova        m0, [X_lowq + start + 2*2*4] ; Xl[0], Xl[1]; reused next iteration
+    shufps      m6, m0, 00001010b ; {Xl[-1][0] x2, Xl[0][0] x2}
+    shufps      m7, m0, 01011111b ; {Xl[-1][1] x2, Xl[0][1] x2}
+    mulps       m6, m1
+    mulps       m7, m3
+    addps       m5, m6
+    addps       m7, m0
+    addps       m5, m7
+    movaps      [X_highq + start], m5
+    add         start, 16           ; 2 complex floats done; reaches 0 at end
+    jnz         .loop2
+    RET
diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c
index 0ffe5b9..b880df9 100644
--- a/libavcodec/x86/sbrdsp_init.c
+++ b/libavcodec/x86/sbrdsp_init.c
@@ -26,6 +26,9 @@
 float ff_sbr_sum_square_sse(float (*x)[2], int n);
 void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
                           const float *g_filt, int m_max, intptr_t ixh);
+void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
+                       const float alpha0[2], const float alpha1[2],
+                       float bw, intptr_t start, intptr_t end);
 
 void ff_sbrdsp_init_x86(SBRDSPContext *s)
 {
@@ -35,6 +38,7 @@ void ff_sbrdsp_init_x86(SBRDSPContext *s)
         if (mm_flags & AV_CPU_FLAG_SSE) {
             s->sum_square = ff_sbr_sum_square_sse;
             s->hf_g_filt = ff_sbr_hf_g_filt_sse;
+            s->hf_gen = ff_sbr_hf_gen_sse;
         }
     }
 }
-- 
1.7.7.6

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to