In particular, this allows the inline asm to be removed, which this
patch does for x86.

-- 
Christophe
From 85ec857da4251e3709f5c53213ddd5ec39d21895 Mon Sep 17 00:00:00 2001
From: Christophe Gisquet <[email protected]>
Date: Sat, 8 Feb 2014 17:48:43 +0100
Subject: [PATCH 8/9] dcadsp: move test out of loop and dspize

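The vector dequantization loop contains a test that prevents an effective
SIMD implementation. Moving the test out of the loop allows the whole loop
to be turned into a DSP function.

Therefore, replace the per-vector int8x8_fmul_int32 DSP function with a
decode_hf function that processes all the subbands.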

For x86 Arrandale:
        C  SSE SSE2 SSE4
win32: 260 162  119  104
win64: 242 N/A   89   72
---
 libavcodec/arm/dca.h             | 23 -----------------
 libavcodec/arm/dcadsp_init_arm.c | 42 ++++++++++++++++++++++++++++++
 libavcodec/dcadec.c              | 41 ++++++++++--------------------
 libavcodec/dcadsp.c              | 21 ++++++++++-----
 libavcodec/dcadsp.h              |  8 +++++-
 libavcodec/x86/dca.h             | 55 ----------------------------------------
 libavcodec/x86/dcadsp.asm        | 55 +++++++++++++++++++++++++++++-----------
 libavcodec/x86/dcadsp_init.c     | 18 ++++++++-----
 8 files changed, 129 insertions(+), 134 deletions(-)
 delete mode 100644 libavcodec/x86/dca.h

diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h
index 580bd75..4aed576 100644
--- a/libavcodec/arm/dca.h
+++ b/libavcodec/arm/dca.h
@@ -81,27 +81,4 @@ static inline int decode_blockcodes(int code1, int code2, int levels,
 
 #endif
 
-#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
-
-#define int8x8_fmul_int32 int8x8_fmul_int32
-static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
-                                     float *dst, const int8_t *src, int scale)
-{
-    __asm__ ("vcvt.f32.s32 %2,  %2,  #4         \n"
-             "vld1.8       {d0},     [%1,:64]   \n"
-             "vmovl.s8     q0,  d0              \n"
-             "vmovl.s16    q1,  d1              \n"
-             "vmovl.s16    q0,  d0              \n"
-             "vcvt.f32.s32 q0,  q0              \n"
-             "vcvt.f32.s32 q1,  q1              \n"
-             "vmul.f32     q0,  q0,  %y2        \n"
-             "vmul.f32     q1,  q1,  %y2        \n"
-             "vst1.32      {q0-q1},  [%m0,:128] \n"
-             : "=Um"(*(float (*)[8])dst)
-             : "r"(src), "x"(scale)
-             : "d0", "d1", "d2", "d3");
-}
-
-#endif
-
 #endif /* AVCODEC_ARM_DCA_H */
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
index 2ea1289..6359ec3 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -53,6 +53,45 @@ void ff_synth_filter_float_neon(FFTContext *imdct,
                                 float out[32], const float in[32],
                                 float scale);
 
+#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
+
+static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
+                                     float *dst, const int8_t *src, int scale)
+{
+    __asm__ ("vcvt.f32.s32 %2,  %2,  #4         \n" /* scale / 16 as float */
+             "vld1.8       {d0},     [%1,:64]   \n" /* load 8 int8 samples */
+             "vmovl.s8     q0,  d0              \n" /* widen int8 -> int16 */
+             "vmovl.s16    q1,  d1              \n" /* upper 4 -> int32 */
+             "vmovl.s16    q0,  d0              \n" /* lower 4 -> int32 */
+             "vcvt.f32.s32 q0,  q0              \n" /* int32 -> float */
+             "vcvt.f32.s32 q1,  q1              \n"
+             "vmul.f32     q0,  q0,  %y2        \n" /* apply the scale */
+             "vmul.f32     q1,  q1,  %y2        \n"
+             "vst1.32      {q0-q1},  [%m0,:128] \n" /* store 8 floats to dst */
+             : "=Um"(*(float (*)[8])dst)
+             : "r"(src), "x"(scale)
+             : "d0", "d1", "d2", "d3");
+}
+
+static void decode_hf_neon(float dst[DCA_SUBBANDS][8],
+                           const int32_t vq_num[DCA_SUBBANDS],
+                           const int8_t hf_vq[1024][32], intptr_t vq_offset,
+                           int32_t scale[DCA_SUBBANDS][2],
+                           intptr_t start, intptr_t end)
+{
+    int l;
+    for (l = start; l < end; l++) {
+        /* Each VQ vector holds 32 samples, but only the 8 samples starting
+         * at vq_offset are needed for this subsubframe. */
+        int hfvq = vq_num[l];
+        /* The helper's DCADSPContext argument is unused, so pass NULL. */
+        int8x8_fmul_int32(NULL, dst[l], hf_vq[hfvq] + vq_offset, scale[l][0]);
+    }
+}
+
+#endif
+
+
 av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -65,6 +104,9 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
     if (have_neon(cpu_flags)) {
         s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
         s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
+#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
+        s->decode_hf  = decode_hf_neon;
+#endif
     }
 }
 
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
index 076a225..6aee7bf 100644
--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -50,14 +50,11 @@
 #if ARCH_ARM
 #   include "arm/dca.h"
 #endif
-#if ARCH_X86
-#   include "x86/dca.h"
-#endif
 
 //#define TRACE
 
 #define DCA_PRIM_CHANNELS_MAX  (7)
-#define DCA_SUBBANDS          (32)
+// DCA_SUBBANDS defined in dcadsp.h
 #define DCA_ABITS_MAX         (32)      /* Should be 28 */
 #define DCA_SUBSUBFRAMES_MAX   (4)
 #define DCA_SUBFRAMES_MAX     (16)
@@ -340,7 +337,7 @@ typedef struct {
     int prediction_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];      ///< prediction VQ coefs
     int bitalloc[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];           ///< bit allocation index
     int transition_mode[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];    ///< transition mode (transients)
-    int scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2];    ///< scale factors (2 if transient)
+    int32_t scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2];///< scale factors (2 if transient)
     int joint_huff[DCA_PRIM_CHANNELS_MAX];                       ///< joint subband scale factors codebook
     int joint_scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< joint subband scale factors
     float downmix_coef[DCA_PRIM_CHANNELS_MAX + 1][2];            ///< stereo downmix coefficients
@@ -353,7 +350,7 @@ typedef struct {
     uint8_t  core_downmix_amode;                                 ///< audio channel arrangement of embedded downmix
     uint16_t core_downmix_codes[DCA_PRIM_CHANNELS_MAX + 1][4];   ///< embedded downmix coefficients (9-bit codes)
 
-    int high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];       ///< VQ encoded high frequency subbands
+    int32_t  high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];  ///< VQ encoded high frequency subbands
 
     float lfe_data[2 * DCA_LFE_MAX * (DCA_BLOCKS_MAX + 4)];      ///< Low frequency effect data
     int lfe_scale_factor;
@@ -1088,14 +1085,6 @@ static int decode_blockcodes(int code1, int code2, int levels, int32_t *values)
 static const uint8_t abits_sizes[7]  = { 7, 10, 12, 13, 15, 17, 19 };
 static const uint8_t abits_levels[7] = { 3,  5,  7,  9, 13, 17, 25 };
 
-#ifndef int8x8_fmul_int32
-static inline void int8x8_fmul_int32(DCADSPContext *dsp, float *dst,
-                                     const int8_t *src, int scale)
-{
-    dsp->int8x8_fmul_int32(dst, src, scale);
-}
-#endif
-
 static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
 {
     int k, l;
@@ -1215,21 +1204,17 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
         /*
          * Decode VQ encoded high frequencies
          */
-        for (l = s->vq_start_subband[k]; l < s->subband_activity[k]; l++) {
-            /* 1 vector -> 32 samples but we only need the 8 samples
-             * for this subsubframe. */
-            int hfvq = s->high_freq_vq[k][l];
-
-            if (!s->debug_flag & 0x01) {
-                av_log(s->avctx, AV_LOG_DEBUG,
-                       "Stream with high frequencies VQ coding\n");
-                s->debug_flag |= 0x01;
-            }
-
-            int8x8_fmul_int32(&s->dcadsp, subband_samples[k][l],
-                              &high_freq_vq[hfvq][subsubframe * 8],
-                              s->scale_factor[k][l][0]);
+        if (!(s->debug_flag & 0x01) &&   /* bit 0: message already logged */
+            s->subband_activity[k] > s->vq_start_subband[k]) {
+            av_log(s->avctx, AV_LOG_DEBUG,
+                   "Stream with high frequencies VQ coding\n");
+            s->debug_flag |= 0x01;
         }
+        /* Dequantize all VQ-coded subbands of channel k in one DSP call. */
+        s->dcadsp.decode_hf(subband_samples[k], s->high_freq_vq[k],
+                            high_freq_vq, subsubframe * 8,
+                            s->scale_factor[k], s->vq_start_subband[k],
+                            s->subband_activity[k]);
     }
 
     /* Check for DSYNC after subsubframe */
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
index e9c7682..1e09bd3 100644
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -24,12 +24,21 @@
 #include "libavutil/intreadwrite.h"
 #include "dcadsp.h"
 
-static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale)
+static void decode_hf_c(float dst[DCA_SUBBANDS][8],
+                        const int32_t vq_num[DCA_SUBBANDS],
+                        const int8_t hf_vq[1024][32], intptr_t vq_offset,
+                        int32_t scale[DCA_SUBBANDS][2],
+                        intptr_t start, intptr_t end)
 {
-    float fscale = scale / 16.0;
-    int i;
-    for (i = 0; i < 8; i++)
-        dst[i] = src[i] * fscale;
+    int l;
+    for (l = start; l < end; l++) {
+        /* Each VQ vector holds 32 samples, but only the 8 samples starting
+         * at vq_offset are needed for this subsubframe. */
+        int   i, hfvq = vq_num[l];          /* row of hf_vq for subband l */
+        float fscale = scale[l][0] / 16.0;  /* only scale[l][0] is used */
+        for (i = 0; i < 8; i++)
+            dst[l][i] = hf_vq[hfvq][vq_offset + i] * fscale;
+    }
 }
 
 static inline void
@@ -103,7 +112,7 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
     s->lfe_fir[0] = dca_lfe_fir0_c;
     s->lfe_fir[1] = dca_lfe_fir1_c;
     s->qmf_32_subbands = dca_qmf_32_subbands;
-    s->int8x8_fmul_int32 = int8x8_fmul_int32_c;
+    s->decode_hf = decode_hf_c;
     if (ARCH_ARM) ff_dcadsp_init_arm(s);
     if (ARCH_X86) ff_dcadsp_init_x86(s);
 }
diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
index 3e04426..15105f0 100644
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -22,6 +22,8 @@
 #include "avfft.h"
 #include "synth_filter.h"
 
+#define DCA_SUBBANDS          (32)
+
 typedef struct DCADSPContext {
     void (*lfe_fir[2])(float *out, const float *in, const float *coefs,
                        float scale);
@@ -31,7 +33,11 @@ typedef struct DCADSPContext {
                             int *synth_buf_offset, float synth_buf2[32],
                             const float window[512], float *samples_out,
                             float raXin[32], float scale);
-    void (*int8x8_fmul_int32)(float *dst, const int8_t *src, int scale);
+    void (*decode_hf)(float dst[DCA_SUBBANDS][8],
+                      const int32_t vq_num[DCA_SUBBANDS],
+                      const int8_t hf_vq[1024][32], intptr_t vq_offset,
+                      int32_t scale[DCA_SUBBANDS][2],
+                      intptr_t start, intptr_t end);
 } DCADSPContext;
 
 void ff_dcadsp_init(DCADSPContext *s);
diff --git a/libavcodec/x86/dca.h b/libavcodec/x86/dca.h
deleted file mode 100644
index ab175b3..0000000
--- a/libavcodec/x86/dca.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2012-2014 Christophe Gisquet <[email protected]>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-
-#if ARCH_X86_64 && HAVE_SSE2_INLINE
-# include "libavutil/x86/asm.h"
-# include "libavutil/mem.h"
-#include "libavcodec/dcadsp.h"
-
-# define int8x8_fmul_int32 int8x8_fmul_int32
-static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
-                                     float *dst, const int8_t *src, int scale)
-{
-    DECLARE_ALIGNED(16, static const uint32_t, inverse16) = 0x3D800000;
-    __asm__ volatile (
-        "cvtsi2ss        %2, %%xmm0 \n\t"
-        "mulss           %3, %%xmm0 \n\t"
-        "movq          (%1), %%xmm1 \n\t"
-        "punpcklbw   %%xmm1, %%xmm1 \n\t"
-        "movaps      %%xmm1, %%xmm2 \n\t"
-        "punpcklwd   %%xmm1, %%xmm1 \n\t"
-        "punpckhwd   %%xmm2, %%xmm2 \n\t"
-        "psrad          $24, %%xmm1 \n\t"
-        "psrad          $24, %%xmm2 \n\t"
-        "shufps  $0, %%xmm0, %%xmm0 \n\t"
-        "cvtdq2ps    %%xmm1, %%xmm1 \n\t"
-        "cvtdq2ps    %%xmm2, %%xmm2 \n\t"
-        "mulps       %%xmm0, %%xmm1 \n\t"
-        "mulps       %%xmm0, %%xmm2 \n\t"
-        "movaps      %%xmm1,  0(%0) \n\t"
-        "movaps      %%xmm2, 16(%0) \n\t"
-        :: "r"(dst), "r"(src), "m"(scale), "m"(inverse16)
-        XMM_CLOBBERS_ONLY("xmm0", "xmm1", "xmm2")
-    );
-}
-
-#endif /* ARCH_X86_64 && HAVE_SSE2_INLINE */
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 10dfaf6..f29d369 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -26,18 +26,37 @@ pf_inv16:  times 4 dd 0x3D800000 ; 1/16
 
 SECTION_TEXT
 
-; void int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale)
-%macro INT8X8_FMUL_INT32 0
-cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
-    cvtsi2ss    m0, scalem
+; decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS],
+;                     const int8_t hf_vq[1024][32], intptr_t vq_offset,
+;                     int32_t scale[DCA_SUBBANDS][2],
+;                     intptr_t start, intptr_t end)
+%macro DECODE_HF 0
+cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
+    cmp     startq, endm
+    jge       .end                   ; like the C loop: only run if start < end
+    lea       srcq, [srcq + offsetq] ; srcq = hf_vq + vq_offset
+    shl     startq, 2                ; start *= 4: byte offset into vq_num
+    mov    offsetd, endm
+%define DICT offsetq
+    shl    offsetq, 2                ; end *= 4: same unit as startq
+    mov       endm, offsetq          ; keep the scaled end in its stack slot
+.loop:
+%if ARCH_X86_64
+    mov    offsetd, [scaleq + 2*startq]
+    cvtsi2ss    m0, offsetd
+%else
+    cvtsi2ss    m0, [scaleq + 2*startq]
+%endif
+    mov    offsetd, [numq + startq]
     mulss       m0, [pf_inv16]
+    shl       DICT, 5
     shufps      m0, m0, 0
 %if cpuflag(sse2)
 %if cpuflag(sse4)
-    pmovsxbd    m1, [srcq+0]
-    pmovsxbd    m2, [srcq+4]
+    pmovsxbd    m1, [srcq + DICT + 0]
+    pmovsxbd    m2, [srcq + DICT + 4]
 %else
-    movq        m1, [srcq]
+    movq        m1, [srcq + DICT]
     punpcklbw   m1, m1
     mova        m2, m1
     punpcklwd   m1, m1
@@ -48,8 +67,8 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
     cvtdq2ps    m1, m1
     cvtdq2ps    m2, m2
 %else
-    movd       mm0, [srcq+0]
-    movd       mm1, [srcq+4]
+    movd       mm0, [srcq + DICT + 0]
+    movd       mm1, [srcq + DICT + 4]
     punpcklbw  mm0, mm0
     punpcklbw  mm1, mm1
     movq       mm2, mm0
@@ -67,27 +86,33 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
     cvtpi2ps    m3, mm2
     cvtpi2ps    m4, mm3
     shufps      m0, m0, 0
-    emms
     shufps      m1, m3, q1010
     shufps      m2, m4, q1010
 %endif
     mulps       m1, m0
     mulps       m2, m0
-    mova [dstq+ 0], m1
-    mova [dstq+16], m2
+    mova [dstq + 8*startq +  0], m1
+    mova [dstq + 8*startq + 16], m2
+    add     startq, 4
+    cmp     startq, endm
+    jl       .loop
+.end:
+%if cpuflag(sse2) == 0
+    emms
+%endif
     REP_RET
 %endmacro
 
 %if ARCH_X86_32
 INIT_XMM sse
-INT8X8_FMUL_INT32
+DECODE_HF
 %endif
 
 INIT_XMM sse2
-INT8X8_FMUL_INT32
+DECODE_HF
 
 INIT_XMM sse4
-INT8X8_FMUL_INT32
+DECODE_HF
 
 ; %1=v0/v1  %2=in1  %3=in2
 %macro FIR_LOOP 2-3
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 140965a..19cbf75 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -23,9 +23,15 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/dcadsp.h"
 
-void ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale);
-void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale);
-void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale);
+#define DECODE_HF_PROTO(opt)                                                 \
+void ff_decode_hf_##opt(float dst[DCA_SUBBANDS][8],                          \
+                        const int32_t vq_num[DCA_SUBBANDS],                  \
+                        const int8_t hf_vq[1024][32], intptr_t vq_offset,    \
+                        int32_t scale[DCA_SUBBANDS][2],                      \
+                        intptr_t start, intptr_t end)
+DECODE_HF_PROTO(sse);  /* same types as DCADSPContext.decode_hf */
+DECODE_HF_PROTO(sse2);
+DECODE_HF_PROTO(sse4);
 void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs,
                          float scale);
 void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs,
@@ -37,18 +43,18 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
 
     if (EXTERNAL_SSE(cpu_flags)) {
 #if ARCH_X86_32
-        s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse;
+        s->decode_hf = ff_decode_hf_sse;
 #endif
         s->lfe_fir[0]        = ff_dca_lfe_fir0_sse;
         s->lfe_fir[1]        = ff_dca_lfe_fir1_sse;
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
-        s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse2;
+        s->decode_hf = ff_decode_hf_sse2;
     }
 
     if (EXTERNAL_SSE4(cpu_flags)) {
-        s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse4;
+        s->decode_hf = ff_decode_hf_sse4;
     }
 }
 
-- 
1.8.0.msysgit.0

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to