Re: [libav-devel] [PATCH] dca: change the core to work with integer coefficients.

Janne Grunau Tue, 15 Dec 2015 14:04:56 -0800

On 2015-12-15 15:53:47 +0100, Alexandra Hájková wrote:
> The DCA core decoder converts integer coefficients read from the
> bitstream to floats just after reading them (along with dequantization).
> All the other steps of the audio reconstruction are done with floats
> which makes the output for the DTS lossless extension (XLL)
> actually lossy.
> This patch changes the DCA core to work with integer coefficients
> until QMF. At this point the integer coefficients are converted to floats.
> The coefficients for the LFE channel (lfe_data) are not touched.
> This is the first step for the really lossless XLL decoding.
> ---
> Applied comments from Janne and Diego:
> mainly: some functions moved to dcadsp.c
>         int32_to_float function was added to fmtconvert.c.
> The patch was examined with perf record and the main slow downs
> are caused by dequantize() and int32_to_float().


I'm currently writing asm for those two functions.

> This patch breaks dca-xll but the waveforms of its sample was
> compared in audacity with "before patch" state and are the same,
> the samples also sounds the same for my ears.

The fate reference file for dca-xll needs to be updated then. Since 
there is no versioning of fate samples we have to add the new reference 
under a different filename. Please change the REF filename in 
tests/fate/audio.mak and generate the new reference with "make 
fate-dca-xll GEN=1" and then ask on IRC for instructions on how to upload 
the new file.

> The output coefficients
> may be slightly different because the conversion to float happens in
> the different conditions. I suggest to consider the change acceptable.
> 
>  libavcodec/dca.h        |  6 ++--
>  libavcodec/dcadec.c     | 95 
> +++++++++++++++++++++++++++----------------------
>  libavcodec/dcadsp.c     | 34 ++++++++++++++++++
>  libavcodec/dcadsp.h     |  6 ++++
>  libavcodec/fmtconvert.c |  9 +++++
>  libavcodec/fmtconvert.h |  7 ++++
>  6 files changed, 111 insertions(+), 46 deletions(-)
> 
> diff --git a/libavcodec/dca.h b/libavcodec/dca.h
> index 6548d75..d754287 100644
> --- a/libavcodec/dca.h
> +++ b/libavcodec/dca.h
> @@ -139,7 +139,7 @@ typedef struct DCAAudioHeader {
>      int scalefactor_huffman[DCA_PRIM_CHANNELS_MAX]; ///< scale factor code 
> book
>      int bitalloc_huffman[DCA_PRIM_CHANNELS_MAX];    ///< bit allocation 
> quantizer select
>      int quant_index_huffman[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX]; ///< 
> quantization index codebook select
> -    float scalefactor_adj[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX];   ///< 
> scale factor adjustment
> +    int scalefactor_adj[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX];     ///< 
> scale factor adjustment

this seems to hold only positive integers, please use uint32_t instead 
of int

>  
>      int subframes;              ///< number of subframes
>      int total_channels;         ///< number of channels including extensions
> @@ -147,10 +147,10 @@ typedef struct DCAAudioHeader {
>  } DCAAudioHeader;
>  
>  typedef struct DCAChan {
> -    DECLARE_ALIGNED(32, float, 
> subband_samples)[DCA_BLOCKS_MAX][DCA_SUBBANDS][8];
> +    DECLARE_ALIGNED(32, int, 
> subband_samples)[DCA_BLOCKS_MAX][DCA_SUBBANDS][8];

int32_t

>      /* Subband samples history (for ADPCM) */
> -    DECLARE_ALIGNED(16, float, subband_samples_hist)[DCA_SUBBANDS][4];
> +    DECLARE_ALIGNED(32, int32_t, subband_samples_hist)[DCA_SUBBANDS][4];
>      int hist_index;
>  
>      /* Half size is sufficient for core decoding, but for 96 kHz data
> diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
> index aca6ed3..c641142 100644
> --- a/libavcodec/dcadec.c
> +++ b/libavcodec/dcadec.c
> @@ -226,7 +226,7 @@ static inline void get_array(GetBitContext *gb, int *dst, 
> int len, int bits)
>  static int dca_parse_audio_coding_header(DCAContext *s, int base_channel)
>  {
>      int i, j;
> -    static const float adj_table[4] = { 1.0, 1.1250, 1.2500, 1.4375 };
> +    static const int adj_table[4] = { 16, 18, 20, 23 };

uint32_t or uint8_t

>      static const int bitlen[11] = { 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3 
>      };
>      static const int thr[11]    = { 0, 1, 3, 3, 3, 3, 7, 7, 7, 7, 7 };
>  
> @@ -265,7 +265,7 @@ static int dca_parse_audio_coding_header(DCAContext *s, 
> int base_channel)
>      /* Get scale factor adjustment */
>      for (j = 0; j < 11; j++)
>          for (i = base_channel; i < s->audio_header.prim_channels; i++)
> -            s->audio_header.scalefactor_adj[i][j] = 1;
> +            s->audio_header.scalefactor_adj[i][j] = 16;
>  
>      for (j = 1; j < 11; j++)
>          for (i = base_channel; i < s->audio_header.prim_channels; i++)
> @@ -790,10 +790,7 @@ static int dca_subsubframe(DCAContext *s, int 
> base_channel, int block_index)
>  {
>      int k, l;
>      int subsubframe = s->current_subsubframe;
> -
> -    const float *quant_step_table;
> -
> -    LOCAL_ALIGNED_16(int32_t, block, [SAMPLES_PER_SUBBAND * DCA_SUBBANDS]);
> +    const int *quant_step_table;

const uint32_t * as that's the type of the arrays from dcadata

>      /*
>       * Audio data
> @@ -801,13 +798,13 @@ static int dca_subsubframe(DCAContext *s, int 
> base_channel, int block_index)
>  
>      /* Select quantization step size table */
>      if (s->bit_rate_index == 0x1f)
> -        quant_step_table = ff_dca_lossless_quant_d;
> +        quant_step_table = ff_dca_lossless_quant;
>      else
> -        quant_step_table = ff_dca_lossy_quant_d;
> +        quant_step_table = ff_dca_lossy_quant;
>  
>      for (k = base_channel; k < s->audio_header.prim_channels; k++) {
> -        float (*subband_samples)[8] = 
> s->dca_chan[k].subband_samples[block_index];
> -        float rscale[DCA_SUBBANDS];
> +        int (*subband_samples)[8] = 
> s->dca_chan[k].subband_samples[block_index];
> +        int64_t rscale[DCA_SUBBANDS];

the array seems unnecessary since each subband gets scaled/dequantized 
separately

>          if (get_bits_left(&s->gb) < 0)
>              return AVERROR_INVALIDDATA;
> @@ -818,7 +815,7 @@ static int dca_subsubframe(DCAContext *s, int 
> base_channel, int block_index)
>              /* Select the mid-tread linear quantizer */
>              int abits = s->dca_chan[k].bitalloc[l];
>  
> -            float quant_step_size = quant_step_table[abits];
> +            int quant_step_size = quant_step_table[abits];
>  
>              /*
>               * Determine quantization index code book and its type
> @@ -832,13 +829,14 @@ static int dca_subsubframe(DCAContext *s, int 
> base_channel, int block_index)
>               */
>              if (!abits) {
>                  rscale[l] = 0;
> -                memset(block + SAMPLES_PER_SUBBAND * l, 0, 
> SAMPLES_PER_SUBBAND * sizeof(block[0]));
> +                memset(subband_samples[l], 0, SAMPLES_PER_SUBBAND *
> +                       sizeof(subband_samples[l][0]));

the memset can happen directly after determining abits, no need to set 
rscale

>              } else {

the parameters computed between 'int abits = s->dca_chan[k].bitalloc[l];' 
and 'if (!abits) {' are only used in this else clause

>                  /* Deal with transients */
>                  int sfi = s->dca_chan[k].transition_mode[l] &&
>                      subsubframe >= s->dca_chan[k].transition_mode[l];
> -                rscale[l] = quant_step_size * 
> s->dca_chan[k].scale_factor[l][sfi] *
> -                            s->audio_header.scalefactor_adj[k][sel];
> +                rscale[l] = (s->dca_chan[k].scale_factor[l][sfi] *
> +                            s->audio_header.scalefactor_adj[k][sel] + 8) >> 
> 4;
>  
>                  if (abits >= 11 || !dca_smpl_bitalloc[abits].vlc[sel].table) 
> {
>                      if (abits <= 7) {
> @@ -851,7 +849,7 @@ static int dca_subsubframe(DCAContext *s, int 
> base_channel, int block_index)
>                          block_code1 = get_bits(&s->gb, size);
>                          block_code2 = get_bits(&s->gb, size);
>                          err         = decode_blockcodes(block_code1, 
> block_code2,
> -                                                        levels, block + 
> SAMPLES_PER_SUBBAND * l);
> +                                                        levels, 
> subband_samples[l]);
>                          if (err) {
>                              av_log(s->avctx, AV_LOG_ERROR,
>                                     "ERROR: block code look-up failed\n");
> @@ -860,20 +858,18 @@ static int dca_subsubframe(DCAContext *s, int 
> base_channel, int block_index)
>                      } else {
>                          /* no coding */
>                          for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
> -                            block[SAMPLES_PER_SUBBAND * l + m] = 
> get_sbits(&s->gb, abits - 3);
> +                            subband_samples[l][m] = get_sbits(&s->gb, abits 
> - 3);
>                      }
>                  } else {
>                      /* Huffman coded */
>                      for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
> -                        block[SAMPLES_PER_SUBBAND * l + m] = 
> get_bitalloc(&s->gb,
> -                                                        
> &dca_smpl_bitalloc[abits], sel);
> +                        subband_samples[l][m] = get_bitalloc(&s->gb,
> +                                                             
> &dca_smpl_bitalloc[abits], sel);
>                  }
>              }
> +            s->dcadsp.dequantize(subband_samples[l], quant_step_size, 
> rscale[l]);

dequantize() doesn't change anything if the samples are all zero, so it 
can be moved into the else clause

>          }
>  
> -        s->fmt_conv.int32_to_float_fmul_array8(&s->fmt_conv, 
> subband_samples[0],
> -                                               block, rscale, 
> SAMPLES_PER_SUBBAND * s->audio_header.vq_start_subband[k]);
> -
>          for (l = 0; l < s->audio_header.vq_start_subband[k]; l++) {
>              int m;
>              /*
> @@ -883,25 +879,25 @@ static int dca_subsubframe(DCAContext *s, int 
> base_channel, int block_index)
>                  int n;
>                  if (s->predictor_history)
>                      subband_samples[l][0] += 
> (ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
> -                                                 
> s->dca_chan[k].subband_samples_hist[l][3] +
> -                                                 
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][1] *
> -                                                 
> s->dca_chan[k].subband_samples_hist[l][2] +
> -                                                 
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][2] *
> -                                                 
> s->dca_chan[k].subband_samples_hist[l][1] +
> -                                                 
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][3] *
> -                                                 
> s->dca_chan[k].subband_samples_hist[l][0]) *
> -                                                (1.0f / 8192);
> +                                              
> (int64_t)s->dca_chan[k].subband_samples_hist[l][3] +
> +                                              
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][1] *
> +                                              
> (int64_t)s->dca_chan[k].subband_samples_hist[l][2] +
> +                                              
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][2] *
> +                                              
> (int64_t)s->dca_chan[k].subband_samples_hist[l][1] +
> +                                              
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][3] *
> +                                              
> (int64_t)s->dca_chan[k].subband_samples_hist[l][0]) +
> +                                              (1 << 12) >> 13;
>                  for (m = 1; m < SAMPLES_PER_SUBBAND; m++) {
> -                    float sum = 
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
> -                                subband_samples[l][m - 1];
> +                    int64_t sum = 
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
> +                                  (int64_t)subband_samples[l][m - 1];
>                      for (n = 2; n <= 4; n++)
>                          if (m >= n)
>                              sum += 
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
> -                                   subband_samples[l][m - n];
> +                                   (int64_t)subband_samples[l][m - n];
>                          else if (s->predictor_history)
>                              sum += 
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
> -                                   s->dca_chan[k].subband_samples_hist[l][m 
> - n + 4];
> -                    subband_samples[l][m] += sum * 1.0f / 8192;
> +                                   
> (int64_t)s->dca_chan[k].subband_samples_hist[l][m - n + 4];
> +                    subband_samples[l][m] += (int)(sum + (1 << 12) >> 13);
>                  }
>              }
>  
> @@ -921,11 +917,12 @@ static int dca_subsubframe(DCAContext *s, int 
> base_channel, int block_index)
>                  s->debug_flag |= 0x01;
>              }
>  
> -            s->dcadsp.decode_hf(subband_samples, s->dca_chan[k].high_freq_vq,
> -                                ff_dca_high_freq_vq, subsubframe * 
> SAMPLES_PER_SUBBAND,
> -                                s->dca_chan[k].scale_factor,
> -                                s->audio_header.vq_start_subband[k],
> -                                s->audio_header.subband_activity[k]);
> +            s->dcadsp.decode_hf_int(subband_samples, 
> s->dca_chan[k].high_freq_vq,
> +                                    ff_dca_high_freq_vq, subsubframe * 
> SAMPLES_PER_SUBBAND,
> +                                    s->dca_chan[k].scale_factor,
> +                                    s->audio_header.vq_start_subband[k],
> +                                    s->audio_header.subband_activity[k]);
> +
>          }
>      }
>  
> @@ -945,6 +942,8 @@ static int dca_filter_channels(DCAContext *s, int 
> block_index, int upsample)
>      int k;
>  
>      if (upsample) {
> +        LOCAL_ALIGNED_16(float, samples, [64], [SAMPLES_PER_SUBBAND]);

should be aligned by 32 not 16 although it does not make a difference 
since there is no qmf64 asm

>          if (!s->qmf64_table) {
>              s->qmf64_table = qmf64_precompute();
>              if (!s->qmf64_table)
> @@ -953,21 +952,31 @@ static int dca_filter_channels(DCAContext *s, int 
> block_index, int upsample)
>  
>          /* 64 subbands QMF */
>          for (k = 0; k < s->audio_header.prim_channels; k++) {
> -            float (*subband_samples)[SAMPLES_PER_SUBBAND] = 
> s->dca_chan[k].subband_samples[block_index];
> +            int (*subband_samples)[SAMPLES_PER_SUBBAND] =
> +                s->dca_chan[k].subband_samples[block_index];
> +
> +            s->fmt_conv.int32_to_float(samples[0], subband_samples[0],
> +                                       64 * SAMPLES_PER_SUBBAND);
>  
>              if (s->channel_order_tab[k] >= 0)
> -                qmf_64_subbands(s, k, subband_samples,
> +                qmf_64_subbands(s, k, samples,
>                                  s->samples_chanptr[s->channel_order_tab[k]],
>                                  /* Upsampling needs a factor 2 here. */
>                                  M_SQRT2 / 32768.0);
>          }
>      } else {
>          /* 32 subbands QMF */
> +        LOCAL_ALIGNED_16(float, samples, [32], [SAMPLES_PER_SUBBAND]);

this needs 32 byte alignment since there is avx asm

> +
>          for (k = 0; k < s->audio_header.prim_channels; k++) {
> -            float (*subband_samples)[SAMPLES_PER_SUBBAND] = 
> s->dca_chan[k].subband_samples[block_index];
> +            int (*subband_samples)[SAMPLES_PER_SUBBAND] =
> +                s->dca_chan[k].subband_samples[block_index];
> +
> +            s->fmt_conv.int32_to_float(samples[0], subband_samples[0],
> +                                       32 * SAMPLES_PER_SUBBAND);
>  
>              if (s->channel_order_tab[k] >= 0)
> -                qmf_32_subbands(s, k, subband_samples,
> +                qmf_32_subbands(s, k, samples,
>                                  s->samples_chanptr[s->channel_order_tab[k]],
>                                  M_SQRT1_2 / 32768.0);
>          }
> diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
> index 34b5da2..c1d3076 100644
> --- a/libavcodec/dcadsp.c
> +++ b/libavcodec/dcadsp.c
> @@ -25,6 +25,7 @@
>  #include "libavutil/intreadwrite.h"
>  
>  #include "dcadsp.h"
> +#include "dcamath.h"
>  
>  static void decode_hf_c(float dst[DCA_SUBBANDS][8],
>                          const int32_t vq_num[DCA_SUBBANDS],
> @@ -44,6 +45,21 @@ static void decode_hf_c(float dst[DCA_SUBBANDS][8],
>      }
>  }
>  
> +static void decode_hf_int_c(int dst[DCA_SUBBANDS][8],

int32_t

> +                            const int32_t vq_num[DCA_SUBBANDS],
> +                            const int8_t hf_vq[1024][32], intptr_t vq_offset,
> +                            int32_t scale[DCA_SUBBANDS][2],
> +                            intptr_t start, intptr_t end)
> +{
> +    int i, j;
> +
> +    for (j = start; j < end; j++) {
> +        const int8_t *ptr = &hf_vq[vq_num[j]][vq_offset];
> +        for (i = 0; i < 8; i++)
> +            dst[j][i] = ptr[i] * scale[j][0] + 8 >> 4;
> +    }
> +}
> +
>  static inline void dca_lfe_fir(float *out, const float *in, const float 
> *coefs,
>                                 int decifactor)
>  {
> @@ -93,6 +109,22 @@ static void dca_qmf_32_subbands(float samples_in[32][8], 
> int sb_act,
>      }
>  }
>  
> +static void dequantize_c(int *samples, int step_size, int scale)

int32_t *samples, uint32_t step_size, uint32_t scale if I'm not 
mistaken. The distinction between signed and unsigned is helpful for 
writing asm.

> +{
> +    int64_t step = (int64_t)step_size * scale;
> +    int shift, i;
> +    int32_t step_scale;
> +
> +    if (step > (1 << 23))
> +        shift = av_log2(step >> 23) + 1;
> +    else
> +        shift = 0;
> +    step_scale = (int32_t)(step >> shift);
> +
> +    for (i = 0; i < 8; i++)
> +        samples[i] = dca_clip23(dca_norm((int64_t)samples[i] * step_scale, 
> 22 - shift));
> +}
> +
>  static void dca_lfe_fir0_c(float *out, const float *in, const float *coefs)
>  {
>      dca_lfe_fir(out, in, coefs, 32);
> @@ -109,6 +141,8 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
>      s->lfe_fir[1]      = dca_lfe_fir1_c;
>      s->qmf_32_subbands = dca_qmf_32_subbands;
>      s->decode_hf       = decode_hf_c;
> +    s->decode_hf_int   = decode_hf_int_c;
> +    s->dequantize      = dequantize_c;
>  
>      if (ARCH_ARM)
>          ff_dcadsp_init_arm(s);
> diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
> index 0fa75a5..f290633 100644
> --- a/libavcodec/dcadsp.h
> +++ b/libavcodec/dcadsp.h
> @@ -37,6 +37,12 @@ typedef struct DCADSPContext {
>                        const int8_t hf_vq[1024][32], intptr_t vq_offset,
>                        int32_t scale[DCA_SUBBANDS][2],
>                        intptr_t start, intptr_t end);
> +    void (*decode_hf_int)(int dst[DCA_SUBBANDS][8],
> +                          const int32_t vq_num[DCA_SUBBANDS],
> +                          const int8_t hf_vq[1024][32], intptr_t vq_offset,
> +                          int32_t scale[DCA_SUBBANDS][2],
> +                          intptr_t start, intptr_t end);
> +    void (*dequantize)(int *samples, int step_size, int scale);

see above re types

>  } DCADSPContext;
>  
>  void ff_dcadsp_init(DCADSPContext *s);
> diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
> index 2dff704..0416b40 100644
> --- a/libavcodec/fmtconvert.c
> +++ b/libavcodec/fmtconvert.c
> @@ -32,6 +32,14 @@ static void int32_to_float_fmul_scalar_c(float *dst, const 
> int32_t *src,
>          dst[i] = src[i] * mul;
>  }
>  
> +static void int32_to_float_c(float *dst, const int32_t *src, int len)

using intptr_t as type for len avoids a manual sign extend instruction 
in 64-bit asm

> +{
> +    int i;
> +
> +    for (i = 0; i < len; i++)
> +        dst[i] = (float)src[i];
> +}
> +
>  static void int32_to_float_fmul_array8_c(FmtConvertContext *c, float *dst,
>                                           const int32_t *src, const float 
> *mul,
>                                           int len)
> @@ -44,6 +52,7 @@ static void int32_to_float_fmul_array8_c(FmtConvertContext 
> *c, float *dst,
>  av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
>  {
>      c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
> +    c->int32_to_float             = int32_to_float_c;
>      c->int32_to_float_fmul_array8 = int32_to_float_fmul_array8_c;

*NIT*(mode DonDiego): alphabetic order, I see that the order is already 
reversed

>      if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
> diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
> index 7de890b..3de1817 100644
> --- a/libavcodec/fmtconvert.h
> +++ b/libavcodec/fmtconvert.h
> @@ -37,6 +37,13 @@ typedef struct FmtConvertContext {
>       */
>      void (*int32_to_float_fmul_scalar)(float *dst, const int32_t *src,
>                                         float mul, int len);
> +    /**
> +     * Convert an array of int32_t to float.
> +     * @param dst destination array of float.
> +     * @param src source array of int32_t.
> +     * @param len number of elements to convert.

please add annotation for the alignment of dst/src and the constraint for 
len. Look at the description for the other functions for reference but 
use 32-byte alignment instead of 16-byte

> +     */
> +    void (*int32_to_float)(float *dst, const int32_t *src, int len);
>  
>      /**
>       * Convert an array of int32_t to float and multiply by a float value 
> from another array,

Janne
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] dca: change the core to work with integer coefficients.

Reply via email to