On 2015-12-15 15:53:47 +0100, Alexandra Hájková wrote:
> The DCA core decoder converts integer coefficients read from the
> bitstream to floats just after reading them (along with dequantization).
> All the other steps of the audio reconstruction are done with floats
> which makes the output for the DTS lossless extension (XLL)
> actually lossy.
> This patch changes the DCA core to work with integer coefficients
> until QMF. At this point the integer coefficients are converted to floats.
> The coefficients for the LFE channel (lfe_data) are not touched.
> This is the first step for the really lossless XLL decoding.
> ---
> Applied comments from Janne and Diego:
> mainly: some functions moved to dcadsp.c
> int32_to_float function was added to fmtconvert.c.
> The patch was examined with perf record and the main slow downs
> are caused by dequantize() and int32_to_float().
I'm currently writing asm for those two functions.
> This patch breaks dca-xll but the waveforms of its sample was
> compared in audacity with "before patch" state and are the same,
> the samples also sounds the same for my ears.
The fate reference file for dca-xll needs to be updated then. Since
there is no versioning of fate samples we have to add the new reference
under a different filename. Please change the REF filename in
tests/fate/audio.mak and generate the new reference with "make
fate-dca-xll GEN=1" and then ask on IRC for instructions on how to upload
the new file.
> The output coefficients
> may be slightly different because the conversion to float happens in
> the different conditions. I suggest to consider the change acceptable.
>
> libavcodec/dca.h | 6 ++--
> libavcodec/dcadec.c | 95
> +++++++++++++++++++++++++++----------------------
> libavcodec/dcadsp.c | 34 ++++++++++++++++++
> libavcodec/dcadsp.h | 6 ++++
> libavcodec/fmtconvert.c | 9 +++++
> libavcodec/fmtconvert.h | 7 ++++
> 6 files changed, 111 insertions(+), 46 deletions(-)
>
> diff --git a/libavcodec/dca.h b/libavcodec/dca.h
> index 6548d75..d754287 100644
> --- a/libavcodec/dca.h
> +++ b/libavcodec/dca.h
> @@ -139,7 +139,7 @@ typedef struct DCAAudioHeader {
> int scalefactor_huffman[DCA_PRIM_CHANNELS_MAX]; ///< scale factor code
> book
> int bitalloc_huffman[DCA_PRIM_CHANNELS_MAX]; ///< bit allocation
> quantizer select
> int quant_index_huffman[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX]; ///<
> quantization index codebook select
> - float scalefactor_adj[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX]; ///<
> scale factor adjustment
> + int scalefactor_adj[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX]; ///<
> scale factor adjustment
this seems to hold only positive integers, please use uint32_t instead
of int
>
> int subframes; ///< number of subframes
> int total_channels; ///< number of channels including extensions
> @@ -147,10 +147,10 @@ typedef struct DCAAudioHeader {
> } DCAAudioHeader;
>
> typedef struct DCAChan {
> - DECLARE_ALIGNED(32, float,
> subband_samples)[DCA_BLOCKS_MAX][DCA_SUBBANDS][8];
> + DECLARE_ALIGNED(32, int,
> subband_samples)[DCA_BLOCKS_MAX][DCA_SUBBANDS][8];
int32_t
> /* Subband samples history (for ADPCM) */
> - DECLARE_ALIGNED(16, float, subband_samples_hist)[DCA_SUBBANDS][4];
> + DECLARE_ALIGNED(32, int32_t, subband_samples_hist)[DCA_SUBBANDS][4];
> int hist_index;
>
> /* Half size is sufficient for core decoding, but for 96 kHz data
> diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
> index aca6ed3..c641142 100644
> --- a/libavcodec/dcadec.c
> +++ b/libavcodec/dcadec.c
> @@ -226,7 +226,7 @@ static inline void get_array(GetBitContext *gb, int *dst,
> int len, int bits)
> static int dca_parse_audio_coding_header(DCAContext *s, int base_channel)
> {
> int i, j;
> - static const float adj_table[4] = { 1.0, 1.1250, 1.2500, 1.4375 };
> + static const int adj_table[4] = { 16, 18, 20, 23 };
uint32_t or uint8_t
> static const int bitlen[11] = { 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3
> };
> static const int thr[11] = { 0, 1, 3, 3, 3, 3, 7, 7, 7, 7, 7 };
>
> @@ -265,7 +265,7 @@ static int dca_parse_audio_coding_header(DCAContext *s,
> int base_channel)
> /* Get scale factor adjustment */
> for (j = 0; j < 11; j++)
> for (i = base_channel; i < s->audio_header.prim_channels; i++)
> - s->audio_header.scalefactor_adj[i][j] = 1;
> + s->audio_header.scalefactor_adj[i][j] = 16;
>
> for (j = 1; j < 11; j++)
> for (i = base_channel; i < s->audio_header.prim_channels; i++)
> @@ -790,10 +790,7 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> {
> int k, l;
> int subsubframe = s->current_subsubframe;
> -
> - const float *quant_step_table;
> -
> - LOCAL_ALIGNED_16(int32_t, block, [SAMPLES_PER_SUBBAND * DCA_SUBBANDS]);
> + const int *quant_step_table;
const uint32_t * as that's the type of the arrays from dcadata
> /*
> * Audio data
> @@ -801,13 +798,13 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
>
> /* Select quantization step size table */
> if (s->bit_rate_index == 0x1f)
> - quant_step_table = ff_dca_lossless_quant_d;
> + quant_step_table = ff_dca_lossless_quant;
> else
> - quant_step_table = ff_dca_lossy_quant_d;
> + quant_step_table = ff_dca_lossy_quant;
>
> for (k = base_channel; k < s->audio_header.prim_channels; k++) {
> - float (*subband_samples)[8] =
> s->dca_chan[k].subband_samples[block_index];
> - float rscale[DCA_SUBBANDS];
> + int (*subband_samples)[8] =
> s->dca_chan[k].subband_samples[block_index];
> + int64_t rscale[DCA_SUBBANDS];
the array seems unnecessary since each subband gets scaled/dequantized
separately
> if (get_bits_left(&s->gb) < 0)
> return AVERROR_INVALIDDATA;
> @@ -818,7 +815,7 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> /* Select the mid-tread linear quantizer */
> int abits = s->dca_chan[k].bitalloc[l];
>
> - float quant_step_size = quant_step_table[abits];
> + int quant_step_size = quant_step_table[abits];
>
> /*
> * Determine quantization index code book and its type
> @@ -832,13 +829,14 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> */
> if (!abits) {
> rscale[l] = 0;
> - memset(block + SAMPLES_PER_SUBBAND * l, 0,
> SAMPLES_PER_SUBBAND * sizeof(block[0]));
> + memset(subband_samples[l], 0, SAMPLES_PER_SUBBAND *
> + sizeof(subband_samples[l][0]));
the memset can happen directly after determining abits, no need to set
rscale
> } else {
the parameters computed between 'int abits = s->dca_chan[k].bitalloc[l];'
and 'if (!abits) {' are only used in this else clause
> /* Deal with transients */
> int sfi = s->dca_chan[k].transition_mode[l] &&
> subsubframe >= s->dca_chan[k].transition_mode[l];
> - rscale[l] = quant_step_size *
> s->dca_chan[k].scale_factor[l][sfi] *
> - s->audio_header.scalefactor_adj[k][sel];
> + rscale[l] = (s->dca_chan[k].scale_factor[l][sfi] *
> + s->audio_header.scalefactor_adj[k][sel] + 8) >>
> 4;
>
> if (abits >= 11 || !dca_smpl_bitalloc[abits].vlc[sel].table)
> {
> if (abits <= 7) {
> @@ -851,7 +849,7 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> block_code1 = get_bits(&s->gb, size);
> block_code2 = get_bits(&s->gb, size);
> err = decode_blockcodes(block_code1,
> block_code2,
> - levels, block +
> SAMPLES_PER_SUBBAND * l);
> + levels,
> subband_samples[l]);
> if (err) {
> av_log(s->avctx, AV_LOG_ERROR,
> "ERROR: block code look-up failed\n");
> @@ -860,20 +858,18 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> } else {
> /* no coding */
> for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
> - block[SAMPLES_PER_SUBBAND * l + m] =
> get_sbits(&s->gb, abits - 3);
> + subband_samples[l][m] = get_sbits(&s->gb, abits
> - 3);
> }
> } else {
> /* Huffman coded */
> for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
> - block[SAMPLES_PER_SUBBAND * l + m] =
> get_bitalloc(&s->gb,
> -
> &dca_smpl_bitalloc[abits], sel);
> + subband_samples[l][m] = get_bitalloc(&s->gb,
> +
> &dca_smpl_bitalloc[abits], sel);
> }
> }
> + s->dcadsp.dequantize(subband_samples[l], quant_step_size,
> rscale[l]);
dequantize() doesn't change anything if the samples are all zero, so it
can be moved into the else clause
> }
>
> - s->fmt_conv.int32_to_float_fmul_array8(&s->fmt_conv,
> subband_samples[0],
> - block, rscale,
> SAMPLES_PER_SUBBAND * s->audio_header.vq_start_subband[k]);
> -
> for (l = 0; l < s->audio_header.vq_start_subband[k]; l++) {
> int m;
> /*
> @@ -883,25 +879,25 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> int n;
> if (s->predictor_history)
> subband_samples[l][0] +=
> (ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
> -
> s->dca_chan[k].subband_samples_hist[l][3] +
> -
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][1] *
> -
> s->dca_chan[k].subband_samples_hist[l][2] +
> -
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][2] *
> -
> s->dca_chan[k].subband_samples_hist[l][1] +
> -
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][3] *
> -
> s->dca_chan[k].subband_samples_hist[l][0]) *
> - (1.0f / 8192);
> +
> (int64_t)s->dca_chan[k].subband_samples_hist[l][3] +
> +
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][1] *
> +
> (int64_t)s->dca_chan[k].subband_samples_hist[l][2] +
> +
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][2] *
> +
> (int64_t)s->dca_chan[k].subband_samples_hist[l][1] +
> +
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][3] *
> +
> (int64_t)s->dca_chan[k].subband_samples_hist[l][0]) +
> + (1 << 12) >> 13;
> for (m = 1; m < SAMPLES_PER_SUBBAND; m++) {
> - float sum =
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
> - subband_samples[l][m - 1];
> + int64_t sum =
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
> + (int64_t)subband_samples[l][m - 1];
> for (n = 2; n <= 4; n++)
> if (m >= n)
> sum +=
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
> - subband_samples[l][m - n];
> + (int64_t)subband_samples[l][m - n];
> else if (s->predictor_history)
> sum +=
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
> - s->dca_chan[k].subband_samples_hist[l][m
> - n + 4];
> - subband_samples[l][m] += sum * 1.0f / 8192;
> +
> (int64_t)s->dca_chan[k].subband_samples_hist[l][m - n + 4];
> + subband_samples[l][m] += (int)(sum + (1 << 12) >> 13);
> }
> }
>
> @@ -921,11 +917,12 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> s->debug_flag |= 0x01;
> }
>
> - s->dcadsp.decode_hf(subband_samples, s->dca_chan[k].high_freq_vq,
> - ff_dca_high_freq_vq, subsubframe *
> SAMPLES_PER_SUBBAND,
> - s->dca_chan[k].scale_factor,
> - s->audio_header.vq_start_subband[k],
> - s->audio_header.subband_activity[k]);
> + s->dcadsp.decode_hf_int(subband_samples,
> s->dca_chan[k].high_freq_vq,
> + ff_dca_high_freq_vq, subsubframe *
> SAMPLES_PER_SUBBAND,
> + s->dca_chan[k].scale_factor,
> + s->audio_header.vq_start_subband[k],
> + s->audio_header.subband_activity[k]);
> +
> }
> }
>
> @@ -945,6 +942,8 @@ static int dca_filter_channels(DCAContext *s, int
> block_index, int upsample)
> int k;
>
> if (upsample) {
> + LOCAL_ALIGNED_16(float, samples, [64], [SAMPLES_PER_SUBBAND]);
should be aligned by 32 not 16 although it does not make a difference
since there is no qmf64 asm
> if (!s->qmf64_table) {
> s->qmf64_table = qmf64_precompute();
> if (!s->qmf64_table)
> @@ -953,21 +952,31 @@ static int dca_filter_channels(DCAContext *s, int
> block_index, int upsample)
>
> /* 64 subbands QMF */
> for (k = 0; k < s->audio_header.prim_channels; k++) {
> - float (*subband_samples)[SAMPLES_PER_SUBBAND] =
> s->dca_chan[k].subband_samples[block_index];
> + int (*subband_samples)[SAMPLES_PER_SUBBAND] =
> + s->dca_chan[k].subband_samples[block_index];
> +
> + s->fmt_conv.int32_to_float(samples[0], subband_samples[0],
> + 64 * SAMPLES_PER_SUBBAND);
>
> if (s->channel_order_tab[k] >= 0)
> - qmf_64_subbands(s, k, subband_samples,
> + qmf_64_subbands(s, k, samples,
> s->samples_chanptr[s->channel_order_tab[k]],
> /* Upsampling needs a factor 2 here. */
> M_SQRT2 / 32768.0);
> }
> } else {
> /* 32 subbands QMF */
> + LOCAL_ALIGNED_16(float, samples, [32], [SAMPLES_PER_SUBBAND]);
this needs 32 byte alignment since there is avx asm
> +
> for (k = 0; k < s->audio_header.prim_channels; k++) {
> - float (*subband_samples)[SAMPLES_PER_SUBBAND] =
> s->dca_chan[k].subband_samples[block_index];
> + int (*subband_samples)[SAMPLES_PER_SUBBAND] =
> + s->dca_chan[k].subband_samples[block_index];
> +
> + s->fmt_conv.int32_to_float(samples[0], subband_samples[0],
> + 32 * SAMPLES_PER_SUBBAND);
>
> if (s->channel_order_tab[k] >= 0)
> - qmf_32_subbands(s, k, subband_samples,
> + qmf_32_subbands(s, k, samples,
> s->samples_chanptr[s->channel_order_tab[k]],
> M_SQRT1_2 / 32768.0);
> }
> diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
> index 34b5da2..c1d3076 100644
> --- a/libavcodec/dcadsp.c
> +++ b/libavcodec/dcadsp.c
> @@ -25,6 +25,7 @@
> #include "libavutil/intreadwrite.h"
>
> #include "dcadsp.h"
> +#include "dcamath.h"
>
> static void decode_hf_c(float dst[DCA_SUBBANDS][8],
> const int32_t vq_num[DCA_SUBBANDS],
> @@ -44,6 +45,21 @@ static void decode_hf_c(float dst[DCA_SUBBANDS][8],
> }
> }
>
> +static void decode_hf_int_c(int dst[DCA_SUBBANDS][8],
int32_t
> + const int32_t vq_num[DCA_SUBBANDS],
> + const int8_t hf_vq[1024][32], intptr_t vq_offset,
> + int32_t scale[DCA_SUBBANDS][2],
> + intptr_t start, intptr_t end)
> +{
> + int i, j;
> +
> + for (j = start; j < end; j++) {
> + const int8_t *ptr = &hf_vq[vq_num[j]][vq_offset];
> + for (i = 0; i < 8; i++)
> + dst[j][i] = ptr[i] * scale[j][0] + 8 >> 4;
> + }
> +}
> +
> static inline void dca_lfe_fir(float *out, const float *in, const float
> *coefs,
> int decifactor)
> {
> @@ -93,6 +109,22 @@ static void dca_qmf_32_subbands(float samples_in[32][8],
> int sb_act,
> }
> }
>
> +static void dequantize_c(int *samples, int step_size, int scale)
int32_t *samples, uint32_t step_size, uint32_t scale if I'm not
mistaken. The distinction between signed and unsigned is helpful for
writing asm.
> +{
> + int64_t step = (int64_t)step_size * scale;
> + int shift, i;
> + int32_t step_scale;
> +
> + if (step > (1 << 23))
> + shift = av_log2(step >> 23) + 1;
> + else
> + shift = 0;
> + step_scale = (int32_t)(step >> shift);
> +
> + for (i = 0; i < 8; i++)
> + samples[i] = dca_clip23(dca_norm((int64_t)samples[i] * step_scale,
> 22 - shift));
> +}
> +
> static void dca_lfe_fir0_c(float *out, const float *in, const float *coefs)
> {
> dca_lfe_fir(out, in, coefs, 32);
> @@ -109,6 +141,8 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
> s->lfe_fir[1] = dca_lfe_fir1_c;
> s->qmf_32_subbands = dca_qmf_32_subbands;
> s->decode_hf = decode_hf_c;
> + s->decode_hf_int = decode_hf_int_c;
> + s->dequantize = dequantize_c;
>
> if (ARCH_ARM)
> ff_dcadsp_init_arm(s);
> diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
> index 0fa75a5..f290633 100644
> --- a/libavcodec/dcadsp.h
> +++ b/libavcodec/dcadsp.h
> @@ -37,6 +37,12 @@ typedef struct DCADSPContext {
> const int8_t hf_vq[1024][32], intptr_t vq_offset,
> int32_t scale[DCA_SUBBANDS][2],
> intptr_t start, intptr_t end);
> + void (*decode_hf_int)(int dst[DCA_SUBBANDS][8],
> + const int32_t vq_num[DCA_SUBBANDS],
> + const int8_t hf_vq[1024][32], intptr_t vq_offset,
> + int32_t scale[DCA_SUBBANDS][2],
> + intptr_t start, intptr_t end);
> + void (*dequantize)(int *samples, int step_size, int scale);
see above re types
> } DCADSPContext;
>
> void ff_dcadsp_init(DCADSPContext *s);
> diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
> index 2dff704..0416b40 100644
> --- a/libavcodec/fmtconvert.c
> +++ b/libavcodec/fmtconvert.c
> @@ -32,6 +32,14 @@ static void int32_to_float_fmul_scalar_c(float *dst, const
> int32_t *src,
> dst[i] = src[i] * mul;
> }
>
> +static void int32_to_float_c(float *dst, const int32_t *src, int len)
using intptr_t as type for len avoids a manual sign extend instruction
in 64-bit asm
> +{
> + int i;
> +
> + for (i = 0; i < len; i++)
> + dst[i] = (float)src[i];
> +}
> +
> static void int32_to_float_fmul_array8_c(FmtConvertContext *c, float *dst,
> const int32_t *src, const float
> *mul,
> int len)
> @@ -44,6 +52,7 @@ static void int32_to_float_fmul_array8_c(FmtConvertContext
> *c, float *dst,
> av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
> {
> c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
> + c->int32_to_float = int32_to_float_c;
> c->int32_to_float_fmul_array8 = int32_to_float_fmul_array8_c;
*NIT*(mode DonDiego): alphabetic order, I see that the order is already
reversed
> if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
> diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
> index 7de890b..3de1817 100644
> --- a/libavcodec/fmtconvert.h
> +++ b/libavcodec/fmtconvert.h
> @@ -37,6 +37,13 @@ typedef struct FmtConvertContext {
> */
> void (*int32_to_float_fmul_scalar)(float *dst, const int32_t *src,
> float mul, int len);
> + /**
> + * Convert an array of int32_t to float.
> + * @param dst destination array of float.
> + * @param src source array of int32_t.
> + * @param len number of elements to convert.
Please add an annotation for the alignment of dst/src and the constraint for
len. Look at the description of the other functions for reference, but
use 32-byte alignment instead of 16-byte.
> + */
> + void (*int32_to_float)(float *dst, const int32_t *src, int len);
>
> /**
> * Convert an array of int32_t to float and multiply by a float value
> from another array,
Janne
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel