On 2015-12-03 13:45:09 +0100, Alexandra Hájková wrote:
> The dca core decoder converts integer coefficients read from the
> bitstream to floats just after reading them (along with dequantization).
> All the other steps of the audio reconstruction are done with floats
> which makes the output for the DTS lossless extension (XLL)
> actually lossy.
> This patch changes the dca core to work with integer coefficients
> till QMF. At this point the integer coefficients are transformed to floats.
> The coefficients for the LFE channel (lfe_data) are not touched.
> This is the first step for the really lossless XLL decoding.
> ---
>
> the output channels waveforms was compared in audacity with the waveforms of
> the
> "before this patch state" and were considered the same
>
> libavcodec/dca.h | 6 +--
> libavcodec/dcadec.c | 117
> ++++++++++++++++++++++++++++++++++------------------
> 2 files changed, 79 insertions(+), 44 deletions(-)
>
> diff --git a/libavcodec/dca.h b/libavcodec/dca.h
> index 6548d75..9947878 100644
> --- a/libavcodec/dca.h
> +++ b/libavcodec/dca.h
> @@ -139,7 +139,7 @@ typedef struct DCAAudioHeader {
> int scalefactor_huffman[DCA_PRIM_CHANNELS_MAX]; ///< scale factor code
> book
> int bitalloc_huffman[DCA_PRIM_CHANNELS_MAX]; ///< bit allocation
> quantizer select
> int quant_index_huffman[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX]; ///<
> quantization index codebook select
> - float scalefactor_adj[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX]; ///<
> scale factor adjustment
> + int scalefactor_adj[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX]; ///<
> scale factor adjustment
>
> int subframes; ///< number of subframes
> int total_channels; ///< number of channels including extensions
> @@ -147,10 +147,10 @@ typedef struct DCAAudioHeader {
> } DCAAudioHeader;
>
> typedef struct DCAChan {
> - DECLARE_ALIGNED(32, float,
> subband_samples)[DCA_BLOCKS_MAX][DCA_SUBBANDS][8];
> + DECLARE_ALIGNED(32, int,
> subband_samples)[DCA_BLOCKS_MAX][DCA_SUBBANDS][8];
>
> /* Subband samples history (for ADPCM) */
> - DECLARE_ALIGNED(16, float, subband_samples_hist)[DCA_SUBBANDS][4];
> + DECLARE_ALIGNED(16, int, subband_samples_hist)[DCA_SUBBANDS][4];
these two should probably be int32_t instead of int to match the code
> int hist_index;
>
> /* Half size is sufficient for core decoding, but for 96 kHz data
> diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
> index 7e94638..80da622 100644
> --- a/libavcodec/dcadec.c
> +++ b/libavcodec/dcadec.c
> @@ -44,6 +44,7 @@
> #include "dcadata.h"
> #include "dcadsp.h"
> #include "dcahuff.h"
> +#include "dcamath.h"
> #include "fft.h"
> #include "fmtconvert.h"
> #include "get_bits.h"
> @@ -225,7 +226,7 @@ static inline void get_array(GetBitContext *gb, int *dst,
> int len, int bits)
> static int dca_parse_audio_coding_header(DCAContext *s, int base_channel)
> {
> int i, j;
> - static const float adj_table[4] = { 1.0, 1.1250, 1.2500, 1.4375 };
> + static const int adj_table[4] = { 16, 18, 20, 23 };
> static const int bitlen[11] = { 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3 };
> static const int thr[11] = { 0, 1, 3, 3, 3, 3, 7, 7, 7, 7, 7 };
>
> @@ -785,14 +786,26 @@ static int decode_blockcodes(int code1, int code2, int
> levels, int32_t *values)
> static const uint8_t abits_sizes[7] = { 7, 10, 12, 13, 15, 17, 19 };
> static const uint8_t abits_levels[7] = { 3, 5, 7, 9, 13, 17, 25 };
>
This is probably also a candidate for SIMD optimizations and should go
in dcadsp.c
> +static void dequantize(int *samples, int step_size, int scale) {
> + int64_t step = (int64_t)step_size * scale;
> + int shift, i;
> + int32_t step_scale;
> +
> + if (step > (1 << 23))
> + shift = av_log2(step >> 23) + 1;
> + else
> + shift = 0;
> + step_scale = (int32_t)(step >> shift);
> +
> + for (i = 0; i < SAMPLES_PER_SUBBAND; i++)
> + samples[i] = dca_clip23(dca_norm((int64_t)samples[i] * step_scale,
> 22 - shift));
> +}
> +
> static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
> {
> int k, l;
> int subsubframe = s->current_subsubframe;
> -
> - const float *quant_step_table;
> -
> - LOCAL_ALIGNED_16(int32_t, block, [SAMPLES_PER_SUBBAND * DCA_SUBBANDS]);
> + const int *quant_step_table;
>
> /*
> * Audio data
> @@ -800,13 +813,13 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
>
> /* Select quantization step size table */
> if (s->bit_rate_index == 0x1f)
> - quant_step_table = ff_dca_lossless_quant_d;
> + quant_step_table = ff_dca_lossless_quant;
> else
> - quant_step_table = ff_dca_lossy_quant_d;
> + quant_step_table = ff_dca_lossy_quant;
>
> for (k = base_channel; k < s->audio_header.prim_channels; k++) {
> - float (*subband_samples)[8] =
> s->dca_chan[k].subband_samples[block_index];
> - float rscale[DCA_SUBBANDS];
> + int (*subband_samples)[8] =
> s->dca_chan[k].subband_samples[block_index];
> + int64_t rscale[DCA_SUBBANDS];
>
> if (get_bits_left(&s->gb) < 0)
> return AVERROR_INVALIDDATA;
> @@ -817,7 +830,7 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> /* Select the mid-tread linear quantizer */
> int abits = s->dca_chan[k].bitalloc[l];
>
> - float quant_step_size = quant_step_table[abits];
> + int quant_step_size = quant_step_table[abits];
>
> /*
> * Determine quantization index code book and its type
> @@ -831,12 +844,13 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> */
> if (!abits) {
> rscale[l] = 0;
> - memset(block + SAMPLES_PER_SUBBAND * l, 0,
> SAMPLES_PER_SUBBAND * sizeof(block[0]));
> + memset(subband_samples[l], 0, SAMPLES_PER_SUBBAND *
> + sizeof(subband_samples[l][0]));
> } else {
> /* Deal with transients */
> int sfi = s->dca_chan[k].transition_mode[l] &&
> subsubframe >= s->dca_chan[k].transition_mode[l];
> - rscale[l] = quant_step_size *
> s->dca_chan[k].scale_factor[l][sfi] *
> + rscale[l] = s->dca_chan[k].scale_factor[l][sfi] *
> s->audio_header.scalefactor_adj[k][sel];
>
> if (abits >= 11 || !dca_smpl_bitalloc[abits].vlc[sel].table)
> {
> @@ -850,7 +864,7 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> block_code1 = get_bits(&s->gb, size);
> block_code2 = get_bits(&s->gb, size);
> err = decode_blockcodes(block_code1,
> block_code2,
> - levels, block +
> SAMPLES_PER_SUBBAND * l);
> + levels,
> subband_samples[l]);
> if (err) {
> av_log(s->avctx, AV_LOG_ERROR,
> "ERROR: block code look-up failed\n");
> @@ -859,20 +873,18 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> } else {
> /* no coding */
> for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
> - block[SAMPLES_PER_SUBBAND * l + m] =
> get_sbits(&s->gb, abits - 3);
> + subband_samples[l][m] = get_sbits(&s->gb, abits
> - 3);
> }
> } else {
> /* Huffman coded */
> for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
> - block[SAMPLES_PER_SUBBAND * l + m] =
> get_bitalloc(&s->gb,
> -
> &dca_smpl_bitalloc[abits], sel);
> + subband_samples[l][m] = get_bitalloc(&s->gb,
> +
> &dca_smpl_bitalloc[abits], sel);
> }
> }
> + dequantize(subband_samples[l], quant_step_size, rscale[l]);
> }
>
> - s->fmt_conv.int32_to_float_fmul_array8(&s->fmt_conv,
> subband_samples[0],
> - block, rscale,
> SAMPLES_PER_SUBBAND * s->audio_header.vq_start_subband[k]);
> -
> for (l = 0; l < s->audio_header.vq_start_subband[k]; l++) {
> int m;
> /*
> @@ -882,25 +894,25 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> int n;
> if (s->predictor_history)
> subband_samples[l][0] +=
> (ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
> -
> s->dca_chan[k].subband_samples_hist[l][3] +
> -
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][1] *
> -
> s->dca_chan[k].subband_samples_hist[l][2] +
> -
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][2] *
> -
> s->dca_chan[k].subband_samples_hist[l][1] +
> -
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][3] *
> -
> s->dca_chan[k].subband_samples_hist[l][0]) *
> - (1.0f / 8192);
> +
> (int64_t)s->dca_chan[k].subband_samples_hist[l][3] +
> +
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][1] *
> +
> (int64_t)s->dca_chan[k].subband_samples_hist[l][2] +
> +
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][2] *
> +
> (int64_t)s->dca_chan[k].subband_samples_hist[l][1] +
> +
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][3] *
> +
> (int64_t)s->dca_chan[k].subband_samples_hist[l][0]) +
> + (1 << 12) >> 13;
> for (m = 1; m < SAMPLES_PER_SUBBAND; m++) {
> - float sum =
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
> - subband_samples[l][m - 1];
> + int64_t sum =
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
> + (int64_t)subband_samples[l][m - 1];
> for (n = 2; n <= 4; n++)
> if (m >= n)
> sum +=
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
> - subband_samples[l][m - n];
> + (int64_t)subband_samples[l][m - n];
> else if (s->predictor_history)
> sum +=
> ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
> - s->dca_chan[k].subband_samples_hist[l][m
> - n + 4];
> - subband_samples[l][m] += sum * 1.0f / 8192;
> +
> (int64_t)s->dca_chan[k].subband_samples_hist[l][m - n + 4];
> + subband_samples[l][m] += (int)(sum + (1 << 12) >> 13);
> }
> }
>
> @@ -914,17 +926,22 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> * Decode VQ encoded high frequencies
> */
> if (s->audio_header.subband_activity[k] >
> s->audio_header.vq_start_subband[k]) {
> + int i, j;
> +
> if (!s->debug_flag & 0x01) {
> av_log(s->avctx, AV_LOG_DEBUG,
> "Stream with high frequencies VQ coding\n");
> s->debug_flag |= 0x01;
> }
>
> - s->dcadsp.decode_hf(subband_samples, s->dca_chan[k].high_freq_vq,
> - ff_dca_high_freq_vq, subsubframe *
> SAMPLES_PER_SUBBAND,
> - s->dca_chan[k].scale_factor,
> - s->audio_header.vq_start_subband[k],
> - s->audio_header.subband_activity[k]);
> + // this should be SIMDified
please move it to dcadsp.c and call it through a function pointer then
> + for (j = s->audio_header.vq_start_subband[k]; j <
> s->audio_header.subband_activity[k]; j++) {
> + /* 1 vector -> 32 sampjes but we only need the 8 samples
> + * for this subsubframe. */
> + const int8_t *ptr =
> &ff_dca_high_freq_vq[s->dca_chan[k].high_freq_vq[j]][subsubframe *
> SAMPLES_PER_SUBBAND];
> + for (i = 0; i < 8; i++)
> + subband_samples[j][i] = ptr[i] *
> s->dca_chan[k].scale_factor[j][0] + 8 >> 4;
> + }
> }
> }
>
> @@ -942,8 +959,14 @@ static int dca_subsubframe(DCAContext *s, int
> base_channel, int block_index)
> static int dca_filter_channels(DCAContext *s, int block_index, int upsample)
> {
> int k;
> + float param[DCA_SUBBANDS];
> +
> + for (k = 0; k < DCA_SUBBANDS; k++)
> + param[k] = 1;
this is a little pointless; we don't seem to have SIMD code for int32_t
to float conversion, but that is not a good reason to multiply by 1.0
>
> if (upsample) {
> + LOCAL_ALIGNED_16(float, samples, [64], [SAMPLES_PER_SUBBAND]);
> +
> if (!s->qmf64_table) {
> s->qmf64_table = qmf64_precompute();
> if (!s->qmf64_table)
> @@ -952,21 +975,33 @@ static int dca_filter_channels(DCAContext *s, int
> block_index, int upsample)
>
> /* 64 subbands QMF */
> for (k = 0; k < s->audio_header.prim_channels; k++) {
> - float (*subband_samples)[SAMPLES_PER_SUBBAND] =
> s->dca_chan[k].subband_samples[block_index];
> + int (*subband_samples)[SAMPLES_PER_SUBBAND] =
> + s->dca_chan[k].subband_samples[block_index];
> +
> + s->fmt_conv.int32_to_float_fmul_array8(&s->fmt_conv, samples[0],
> + subband_samples[0], param,
> + 64 * SAMPLES_PER_SUBBAND);
int32_to_float_fmul_scalar can be used instead if the factors in param
are all identical; better yet, add int32_to_float to fmtconvert.c so that
we can avoid the pointless multiplication.
> if (s->channel_order_tab[k] >= 0)
> - qmf_64_subbands(s, k, subband_samples,
> + qmf_64_subbands(s, k, samples,
> s->samples_chanptr[s->channel_order_tab[k]],
> /* Upsampling needs a factor 2 here. */
> M_SQRT2 / 32768.0);
> }
> } else {
> /* 32 subbands QMF */
> + LOCAL_ALIGNED_16(float, samples, [32], [SAMPLES_PER_SUBBAND]);
> +
> for (k = 0; k < s->audio_header.prim_channels; k++) {
> - float (*subband_samples)[SAMPLES_PER_SUBBAND] =
> s->dca_chan[k].subband_samples[block_index];
> + int (*subband_samples)[SAMPLES_PER_SUBBAND] =
> + s->dca_chan[k].subband_samples[block_index];
> +
> + s->fmt_conv.int32_to_float_fmul_array8(&s->fmt_conv, samples[0],
> + subband_samples[0], param,
> + 32 * SAMPLES_PER_SUBBAND);
same here
>
> if (s->channel_order_tab[k] >= 0)
> - qmf_32_subbands(s, k, subband_samples,
> + qmf_32_subbands(s, k, samples,
> s->samples_chanptr[s->channel_order_tab[k]],
> M_SQRT1_2 / 32768.0);
> }
a 20% slowdown in decoding performance looks high for this change. Did
you run the tests on a 32-bit or 64-bit system?
Janne
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel