[libav-devel] [PATCH 08/10] dcadsp: move test out of loop and dspize

Christophe Gisquet Fri, 14 Feb 2014 08:04:38 -0800

The vector dequantization loop contains a test that prevents an effective
SIMD implementation. Moving the test out of the loop allows the loop to be
DSPized.

Therefore, modify the current DSP implementation to process the whole loop.
In particular, the DSP implementation no longer has to handle null loop
sizes.
---
 libavcodec/arm/dca.h             | 23 ----------------------
 libavcodec/arm/dcadsp_init_arm.c | 41 ++++++++++++++++++++++++++++++++++++++++
 libavcodec/dcadec.c              | 30 +++++++----------------------
 libavcodec/dcadsp.c              | 21 ++++++++++++++------
 libavcodec/dcadsp.h              |  8 +++++++-
 5 files changed, 70 insertions(+), 53 deletions(-)

diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h
index 580bd75..4aed576 100644
--- a/libavcodec/arm/dca.h
+++ b/libavcodec/arm/dca.h
@@ -81,27 +81,4 @@ static inline int decode_blockcodes(int code1, int code2, int levels,
 
 #endif
 
-#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
-
-#define int8x8_fmul_int32 int8x8_fmul_int32
-static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
-                                     float *dst, const int8_t *src, int scale)
-{
-    __asm__ ("vcvt.f32.s32 %2,  %2,  #4         \n"
-             "vld1.8       {d0},     [%1,:64]   \n"
-             "vmovl.s8     q0,  d0              \n"
-             "vmovl.s16    q1,  d1              \n"
-             "vmovl.s16    q0,  d0              \n"
-             "vcvt.f32.s32 q0,  q0              \n"
-             "vcvt.f32.s32 q1,  q1              \n"
-             "vmul.f32     q0,  q0,  %y2        \n"
-             "vmul.f32     q1,  q1,  %y2        \n"
-             "vst1.32      {q0-q1},  [%m0,:128] \n"
-             : "=Um"(*(float (*)[8])dst)
-             : "r"(src), "x"(scale)
-             : "d0", "d1", "d2", "d3");
-}
-
-#endif
-
 #endif /* AVCODEC_ARM_DCA_H */
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
index 2ea1289..9942581 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -53,6 +53,44 @@ void ff_synth_filter_float_neon(FFTContext *imdct,
                                 float out[32], const float in[32],
                                 float scale);
 
+#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
+
+static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
+                                     float *dst, const int8_t *src, int scale)
+{
+    __asm__ ("vcvt.f32.s32 %2,  %2,  #4         \n"
+             "vld1.8       {d0},     [%1,:64]   \n"
+             "vmovl.s8     q0,  d0              \n"
+             "vmovl.s16    q1,  d1              \n"
+             "vmovl.s16    q0,  d0              \n"
+             "vcvt.f32.s32 q0,  q0              \n"
+             "vcvt.f32.s32 q1,  q1              \n"
+             "vmul.f32     q0,  q0,  %y2        \n"
+             "vmul.f32     q1,  q1,  %y2        \n"
+             "vst1.32      {q0-q1},  [%m0,:128] \n"
+             : "=Um"(*(float (*)[8])dst)
+             : "r"(src), "x"(scale)
+             : "d0", "d1", "d2", "d3");
+}
+
+static void decode_hf_neon(float dst[DCA_SUBBANDS][8],
+                           const int32_t vq_num[DCA_SUBBANDS],
+                           const int8_t hf_vq[1024][32], intptr_t vq_offset,
+                           int32_t scale[DCA_SUBBANDS][2],
+                           intptr_t start, intptr_t end)
+{
+    int l;
+    for (l = start; l < end; l++) {
+        /* 1 vector -> 32 samples but we only need the 8 samples
+         * for this subsubframe. */
+        int hfvq = vq_num[l];
+
+        int8x8_fmul_int32(dst[l], hf_vq[hfvq] + vq_offset, scale[l][0]);
+    }
+}
+
+#endif /* HAVE_NEON_INLINE && HAVE_ASM_MOD_Y */
+
 av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -65,6 +103,9 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
     if (have_neon(cpu_flags)) {
         s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
         s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
+#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
+        s->decode_hf  = decode_hf_neon;
+#endif
     }
 }
 
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
index 2d88cb4..371c838 100644
--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -50,14 +50,10 @@
 #if ARCH_ARM
 #   include "arm/dca.h"
 #endif
-#if ARCH_X86
-#   include "x86/dca.h"
-#endif
 
 //#define TRACE
 
 #define DCA_PRIM_CHANNELS_MAX  (7)
-#define DCA_SUBBANDS          (32)
 #define DCA_ABITS_MAX         (32)      /* Should be 28 */
 #define DCA_SUBSUBFRAMES_MAX   (4)
 #define DCA_SUBFRAMES_MAX     (16)
@@ -340,7 +336,7 @@ typedef struct {
     int prediction_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];      ///< prediction VQ coefs
     int bitalloc[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];           ///< bit allocation index
     int transition_mode[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];    ///< transition mode (transients)
-    int scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2];    ///< scale factors (2 if transient)
+    int32_t scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2];///< scale factors (2 if transient)
     int joint_huff[DCA_PRIM_CHANNELS_MAX];                       ///< joint subband scale factors codebook
     int joint_scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< joint subband scale factors
     float downmix_coef[DCA_PRIM_CHANNELS_MAX + 1][2];            ///< stereo downmix coefficients
@@ -353,7 +349,7 @@ typedef struct {
     uint8_t  core_downmix_amode;                                 ///< audio channel arrangement of embedded downmix
     uint16_t core_downmix_codes[DCA_PRIM_CHANNELS_MAX + 1][4];   ///< embedded downmix coefficients (9-bit codes)
 
-    int high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];       ///< VQ encoded high frequency subbands
+    int32_t  high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];  ///< VQ encoded high frequency subbands
 
     float lfe_data[2 * DCA_LFE_MAX * (DCA_BLOCKS_MAX + 4)];      ///< Low frequency effect data
     int lfe_scale_factor;
@@ -1088,14 +1084,6 @@ static int decode_blockcodes(int code1, int code2, int levels, int32_t *values)
 static const uint8_t abits_sizes[7]  = { 7, 10, 12, 13, 15, 17, 19 };
 static const uint8_t abits_levels[7] = { 3,  5,  7,  9, 13, 17, 25 };
 
-#ifndef int8x8_fmul_int32
-static inline void int8x8_fmul_int32(DCADSPContext *dsp, float *dst,
-                                     const int8_t *src, int scale)
-{
-    dsp->int8x8_fmul_int32(dst, src, scale);
-}
-#endif
-
 static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
 {
     int k, l;
@@ -1215,20 +1203,16 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
         /*
          * Decode VQ encoded high frequencies
          */
-        for (l = s->vq_start_subband[k]; l < s->subband_activity[k]; l++) {
-            /* 1 vector -> 32 samples but we only need the 8 samples
-             * for this subsubframe. */
-            int hfvq = s->high_freq_vq[k][l];
-
+        if (s->subband_activity[k] > s->vq_start_subband[k]) {
             if (!s->debug_flag & 0x01) {
                 av_log(s->avctx, AV_LOG_DEBUG,
                        "Stream with high frequencies VQ coding\n");
                 s->debug_flag |= 0x01;
             }
-
-            int8x8_fmul_int32(&s->dcadsp, subband_samples[k][l],
-                              &high_freq_vq[hfvq][subsubframe * 8],
-                              s->scale_factor[k][l][0]);
+            s->dcadsp.decode_hf(subband_samples[k], s->high_freq_vq[k],
+                                high_freq_vq, subsubframe * 8,
+                                s->scale_factor[k], s->vq_start_subband[k],
+                                s->subband_activity[k]);
         }
     }
 
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
index 30b732a..10fadd6 100644
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -24,12 +24,21 @@
 #include "libavutil/intreadwrite.h"
 #include "dcadsp.h"
 
-static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale)
+static void decode_hf_c(float dst[DCA_SUBBANDS][8],
+                        const int32_t vq_num[DCA_SUBBANDS],
+                        const int8_t hf_vq[1024][32], intptr_t vq_offset,
+                        int32_t scale[DCA_SUBBANDS][2],
+                        intptr_t start, intptr_t end)
 {
-    float fscale = scale / 16.0;
-    int i;
-    for (i = 0; i < 8; i++)
-        dst[i] = src[i] * fscale;
+    int l;
+    for (l = start; l < end; l++) {
+        /* 1 vector -> 32 samples but we only need the 8 samples
+         * for this subsubframe. */
+        int   i, hfvq = vq_num[l];
+        float fscale = scale[l][0] / 16.0;
+        for (i = 0; i < 8; i++)
+            dst[l][i] = hf_vq[hfvq][vq_offset + i] * fscale;
+    }
 }
 
 static inline void
@@ -97,7 +106,7 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
     s->lfe_fir[0] = dca_lfe_fir0_c;
     s->lfe_fir[1] = dca_lfe_fir1_c;
     s->qmf_32_subbands = dca_qmf_32_subbands;
-    s->int8x8_fmul_int32 = int8x8_fmul_int32_c;
+    s->decode_hf = decode_hf_c;
     if (ARCH_ARM) ff_dcadsp_init_arm(s);
     if (ARCH_X86) ff_dcadsp_init_x86(s);
 }
diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
index e4c1bc7..0fa75a5 100644
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -22,6 +22,8 @@
 #include "avfft.h"
 #include "synth_filter.h"
 
+#define DCA_SUBBANDS 32
+
 typedef struct DCADSPContext {
     void (*lfe_fir[2])(float *out, const float *in, const float *coefs);
     void (*qmf_32_subbands)(float samples_in[32][8], int sb_act,
@@ -30,7 +32,11 @@ typedef struct DCADSPContext {
                             int *synth_buf_offset, float synth_buf2[32],
                             const float window[512], float *samples_out,
                             float raXin[32], float scale);
-    void (*int8x8_fmul_int32)(float *dst, const int8_t *src, int scale);
+    void (*decode_hf)(float dst[DCA_SUBBANDS][8],
+                      const int32_t vq_num[DCA_SUBBANDS],
+                      const int8_t hf_vq[1024][32], intptr_t vq_offset,
+                      int32_t scale[DCA_SUBBANDS][2],
+                      intptr_t start, intptr_t end);
 } DCADSPContext;
 
 void ff_dcadsp_init(DCADSPContext *s);
-- 
1.8.0.msysgit.0

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 08/10] dcadsp: move test out of loop and dspize

Reply via email to