Hi,

Le 15 févr. 2014 18:31, "Janne Grunau" <[email protected]> a écrit :
>
> On 2014-02-14 16:03:13 +0000, Christophe Gisquet wrote:
> > The vector dequantization has a test in a loop preventing effective SIMD
> > implementation. By moving it out of the loop, this loop can be DSPized.
> >
> > Therefore, modify the current DSP implementation. In particular, the
> > DSP implementation no longer has to handle null loop sizes.
> > ---
> >  libavcodec/arm/dca.h             | 23 ----------------------
> >  libavcodec/arm/dcadsp_init_arm.c | 41
++++++++++++++++++++++++++++++++++++++++
> >  libavcodec/dcadec.c              | 30 +++++++----------------------
> >  libavcodec/dcadsp.c              | 21 ++++++++++++++------
> >  libavcodec/dcadsp.h              |  8 +++++++-
> >  5 files changed, 70 insertions(+), 53 deletions(-)
> >
> > diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h
> > index 580bd75..4aed576 100644
> > --- a/libavcodec/arm/dca.h
> > +++ b/libavcodec/arm/dca.h
> > @@ -81,27 +81,4 @@ static inline int decode_blockcodes(int code1, int
code2, int levels,
> >
> >  #endif
> >
> > -#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
> > -
> > -#define int8x8_fmul_int32 int8x8_fmul_int32
> > -static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
> > -                                     float *dst, const int8_t *src,
int scale)
> > -{
> > -    __asm__ ("vcvt.f32.s32 %2,  %2,  #4         \n"
> > -             "vld1.8       {d0},     [%1,:64]   \n"
> > -             "vmovl.s8     q0,  d0              \n"
> > -             "vmovl.s16    q1,  d1              \n"
> > -             "vmovl.s16    q0,  d0              \n"
> > -             "vcvt.f32.s32 q0,  q0              \n"
> > -             "vcvt.f32.s32 q1,  q1              \n"
> > -             "vmul.f32     q0,  q0,  %y2        \n"
> > -             "vmul.f32     q1,  q1,  %y2        \n"
> > -             "vst1.32      {q0-q1},  [%m0,:128] \n"
> > -             : "=Um"(*(float (*)[8])dst)
> > -             : "r"(src), "x"(scale)
> > -             : "d0", "d1", "d2", "d3");
> > -}
> > -
> > -#endif
> > -
> >  #endif /* AVCODEC_ARM_DCA_H */
> > diff --git a/libavcodec/arm/dcadsp_init_arm.c
b/libavcodec/arm/dcadsp_init_arm.c
> > index 2ea1289..9942581 100644
> > --- a/libavcodec/arm/dcadsp_init_arm.c
> > +++ b/libavcodec/arm/dcadsp_init_arm.c
> > @@ -53,6 +53,44 @@ void ff_synth_filter_float_neon(FFTContext *imdct,
> >                                  float out[32], const float in[32],
> >                                  float scale);
> >
> > +#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
> > +
> > +static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
> > +                                     float *dst, const int8_t *src,
int scale)
> > +{
> > +    __asm__ ("vcvt.f32.s32 %2,  %2,  #4         \n"
> > +             "vld1.8       {d0},     [%1,:64]   \n"
> > +             "vmovl.s8     q0,  d0              \n"
> > +             "vmovl.s16    q1,  d1              \n"
> > +             "vmovl.s16    q0,  d0              \n"
> > +             "vcvt.f32.s32 q0,  q0              \n"
> > +             "vcvt.f32.s32 q1,  q1              \n"
> > +             "vmul.f32     q0,  q0,  %y2        \n"
> > +             "vmul.f32     q1,  q1,  %y2        \n"
> > +             "vst1.32      {q0-q1},  [%m0,:128] \n"
> > +             : "=Um"(*(float (*)[8])dst)
> > +             : "r"(src), "x"(scale)
> > +             : "d0", "d1", "d2", "d3");
> > +}
> > +
> > +static void decode_hf_neon(float dst[DCA_SUBBANDS][8],
> > +                           const int32_t vq_num[DCA_SUBBANDS],
> > +                           const int8_t hf_vq[1024][32], intptr_t
vq_offset,
> > +                           int32_t scale[DCA_SUBBANDS][2],
> > +                           intptr_t start, intptr_t end)
> > +{
> > +    int l;
> > +    for (l = start; l < end; l++) {
> > +        /* 1 vector -> 32 samples but we only need the 8 samples
> > +         * for this subsubframe. */
> > +        int hfvq = vq_num[l];
> > +
> > +        int8x8_fmul_int32(dst[l], hf_vq[hfvq] + vq_offset,
scale[l][0]);
> > +    }
> > +}
> > +
> > +#endif /* HAVE_NEON_INLINE && HAVE_ASM_MOD_Y */
> > +
> >  av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
> >  {
> >      int cpu_flags = av_get_cpu_flags();
> > @@ -65,6 +103,9 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
> >      if (have_neon(cpu_flags)) {
> >          s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
> >          s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
> > +#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
> > +        s->decode_hf  = decode_hf_neon;
> > +#endif
>
> ok but not optimal, I'll rewrite it as external asm; this doesn't block the patch
>
> >      }
> >  }
> >
> > diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
> > index 2d88cb4..371c838 100644
> > --- a/libavcodec/dcadec.c
> > +++ b/libavcodec/dcadec.c
> > @@ -50,14 +50,10 @@
> >  #if ARCH_ARM
> >  #   include "arm/dca.h"
> >  #endif
> > -#if ARCH_X86
> > -#   include "x86/dca.h"
> > -#endif
> >
> >  //#define TRACE
> >
> >  #define DCA_PRIM_CHANNELS_MAX  (7)
> > -#define DCA_SUBBANDS          (32)
> >  #define DCA_ABITS_MAX         (32)      /* Should be 28 */
> >  #define DCA_SUBSUBFRAMES_MAX   (4)
> >  #define DCA_SUBFRAMES_MAX     (16)
> > @@ -340,7 +336,7 @@ typedef struct {
> >      int prediction_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];      ///<
prediction VQ coefs
> >      int bitalloc[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];           ///<
bit allocation index
> >      int transition_mode[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];    ///<
transition mode (transients)
> > -    int scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2];    ///<
scale factors (2 if transient)
> > +    int32_t scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2];///<
scale factors (2 if transient)
>
> Don't we assume that int is 32 bit wide? I guess much more would break
> if that assumption doesn't hold. change of course ok when using it in asm.

I think it's dangerous because:
- the natural register size on x86_64 is 64 bits, while int is only 32 bits
- win64 has some freedom to leave garbage in the upper 32 bits of 64-bit GPRs,
  which is why x86 asm sometimes uses intptr_t

But some audio DSP code had this potential issue without consequences.

>
> >      int joint_huff[DCA_PRIM_CHANNELS_MAX];                       ///<
joint subband scale factors codebook
> >      int joint_scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///<
joint subband scale factors
> >      float downmix_coef[DCA_PRIM_CHANNELS_MAX + 1][2];            ///<
stereo downmix coefficients
> > @@ -353,7 +349,7 @@ typedef struct {
> >      uint8_t  core_downmix_amode;                                 ///<
audio channel arrangement of embedded downmix
> >      uint16_t core_downmix_codes[DCA_PRIM_CHANNELS_MAX + 1][4];   ///<
embedded downmix coefficients (9-bit codes)
> >
> > -    int high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];       ///<
VQ encoded high frequency subbands
> > +    int32_t  high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];  ///<
VQ encoded high frequency subbands
> >
> >      float lfe_data[2 * DCA_LFE_MAX * (DCA_BLOCKS_MAX + 4)];      ///<
Low frequency effect data
> >      int lfe_scale_factor;
> > @@ -1088,14 +1084,6 @@ static int decode_blockcodes(int code1, int
code2, int levels, int32_t *values)
> >  static const uint8_t abits_sizes[7]  = { 7, 10, 12, 13, 15, 17, 19 };
> >  static const uint8_t abits_levels[7] = { 3,  5,  7,  9, 13, 17, 25 };
> >
> > -#ifndef int8x8_fmul_int32
> > -static inline void int8x8_fmul_int32(DCADSPContext *dsp, float *dst,
> > -                                     const int8_t *src, int scale)
> > -{
> > -    dsp->int8x8_fmul_int32(dst, src, scale);
> > -}
> > -#endif
> > -
> >  static int dca_subsubframe(DCAContext *s, int base_channel, int
block_index)
> >  {
> >      int k, l;
> > @@ -1215,20 +1203,16 @@ static int dca_subsubframe(DCAContext *s, int
base_channel, int block_index)
> >          /*
> >           * Decode VQ encoded high frequencies
> >           */
> > -        for (l = s->vq_start_subband[k]; l < s->subband_activity[k];
l++) {
> > -            /* 1 vector -> 32 samples but we only need the 8 samples
> > -             * for this subsubframe. */
> > -            int hfvq = s->high_freq_vq[k][l];
> > -
> > +        if (s->subband_activity[k] > s->vq_start_subband[k]) {
> >              if (!s->debug_flag & 0x01) {
> >                  av_log(s->avctx, AV_LOG_DEBUG,
> >                         "Stream with high frequencies VQ coding\n");
> >                  s->debug_flag |= 0x01;
> >              }
> > -
> > -            int8x8_fmul_int32(&s->dcadsp, subband_samples[k][l],
> > -                              &high_freq_vq[hfvq][subsubframe * 8],
> > -                              s->scale_factor[k][l][0]);
> > +            s->dcadsp.decode_hf(subband_samples[k], s->high_freq_vq[k],
> > +                                high_freq_vq, subsubframe * 8,
> > +                                s->scale_factor[k],
s->vq_start_subband[k],
> > +                                s->subband_activity[k]);
> >          }
> >      }
> >
> > diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
> > index 30b732a..10fadd6 100644
> > --- a/libavcodec/dcadsp.c
> > +++ b/libavcodec/dcadsp.c
> > @@ -24,12 +24,21 @@
> >  #include "libavutil/intreadwrite.h"
> >  #include "dcadsp.h"
> >
> > -static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int
scale)
> > +static void decode_hf_c(float dst[DCA_SUBBANDS][8],
> > +                        const int32_t vq_num[DCA_SUBBANDS],
> > +                        const int8_t hf_vq[1024][32], intptr_t
vq_offset,
> > +                        int32_t scale[DCA_SUBBANDS][2],
> > +                        intptr_t start, intptr_t end)
> >  {
> > -    float fscale = scale / 16.0;
> > -    int i;
> > -    for (i = 0; i < 8; i++)
> > -        dst[i] = src[i] * fscale;
> > +    int l;
> > +    for (l = start; l < end; l++) {
> > +        /* 1 vector -> 32 samples but we only need the 8 samples
> > +         * for this subsubframe. */
> > +        int   i, hfvq = vq_num[l];
> > +        float fscale = scale[l][0] / 16.0;
> > +        for (i = 0; i < 8; i++)
> > +            dst[l][i] = hf_vq[hfvq][vq_offset + i] * fscale;
> > +    }
> >  }
> >
> >  static inline void
> > @@ -97,7 +106,7 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
> >      s->lfe_fir[0] = dca_lfe_fir0_c;
> >      s->lfe_fir[1] = dca_lfe_fir1_c;
> >      s->qmf_32_subbands = dca_qmf_32_subbands;
> > -    s->int8x8_fmul_int32 = int8x8_fmul_int32_c;
> > +    s->decode_hf = decode_hf_c;
> >      if (ARCH_ARM) ff_dcadsp_init_arm(s);
> >      if (ARCH_X86) ff_dcadsp_init_x86(s);
> >  }
> > diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
> > index e4c1bc7..0fa75a5 100644
> > --- a/libavcodec/dcadsp.h
> > +++ b/libavcodec/dcadsp.h
> > @@ -22,6 +22,8 @@
> >  #include "avfft.h"
> >  #include "synth_filter.h"
> >
> > +#define DCA_SUBBANDS 32
> > +
> >  typedef struct DCADSPContext {
> >      void (*lfe_fir[2])(float *out, const float *in, const float
*coefs);
> >      void (*qmf_32_subbands)(float samples_in[32][8], int sb_act,
> > @@ -30,7 +32,11 @@ typedef struct DCADSPContext {
> >                              int *synth_buf_offset, float
synth_buf2[32],
> >                              const float window[512], float
*samples_out,
> >                              float raXin[32], float scale);
> > -    void (*int8x8_fmul_int32)(float *dst, const int8_t *src, int
scale);
> > +    void (*decode_hf)(float dst[DCA_SUBBANDS][8],
> > +                      const int32_t vq_num[DCA_SUBBANDS],
> > +                      const int8_t hf_vq[1024][32], intptr_t vq_offset,
> > +                      int32_t scale[DCA_SUBBANDS][2],
> > +                      intptr_t start, intptr_t end);
> >  } DCADSPContext;
> >
> >  void ff_dcadsp_init(DCADSPContext *s);
>
> ok and queued
>
> Janne
> _______________________________________________
> libav-devel mailing list
> [email protected]
> https://lists.libav.org/mailman/listinfo/libav-devel

Sent from a phone so please excuse terseness
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to