On Tue, Apr 05, 2011 at 01:05:23AM -0600, Nathan Caldwell wrote:
> There are still a few sections missing relating to TNS (not present)
> and mid/side (contains other bugs).
>
> Overall this improves quality, and vastly improves rate-control.
> ---
> libavcodec/aacenc.c | 4 +-
> libavcodec/aacpsy.c | 295
> ++++++++++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 294 insertions(+), 5 deletions(-)
>
> diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c
> index d4b6112..4ec76d0 100644
> --- a/libavcodec/aacenc.c
> +++ b/libavcodec/aacenc.c
> @@ -606,8 +606,10 @@ static int aac_encode_frame(AVCodecContext *avctx,
> }
>
> frame_bits = put_bits_count(&s->pb);
> - if (frame_bits <= 6144 * avctx->channels - 3)
> + if (frame_bits <= 6144 * avctx->channels - 3) {
> + s->psy.bitres.bits = frame_bits / avctx->channels;
> break;
> + }
>
> s->lambda *= avctx->bit_rate * 1024.0f / avctx->sample_rate /
> frame_bits;
>
> diff --git a/libavcodec/aacpsy.c b/libavcodec/aacpsy.c
> index 4250a5d..413d364 100644
> --- a/libavcodec/aacpsy.c
> +++ b/libavcodec/aacpsy.c
> @@ -41,10 +41,48 @@
> */
> #define PSY_3GPP_THR_SPREAD_HI 1.5f // spreading factor for low-to-hi
> threshold spreading (15 dB/Bark)
> #define PSY_3GPP_THR_SPREAD_LOW 3.0f // spreading factor for hi-to-low
> threshold spreading (30 dB/Bark)
> +/* spreading factor for low-to-hi energy spreading, long block, >
> 22kbps/channel (20dB/Bark) */
> +#define PSY_3GPP_EN_SPREAD_HI_L1 2.0f
> +/* spreading factor for low-to-hi energy spreading, long block, <=
> 22kbps/channel (15dB/Bark) */
> +#define PSY_3GPP_EN_SPREAD_HI_L2 1.5f
> +/* spreading factor for low-to-hi energy spreading, short block (15 dB/Bark)
> */
> +#define PSY_3GPP_EN_SPREAD_HI_S 1.5f
> +/* spreading factor for hi-to-low energy spreading, long block (30dB/Bark) */
> +#define PSY_3GPP_EN_SPREAD_LOW_L 3.0f
> +/* spreading factor for hi-to-low energy spreading, short block (20dB/Bark)
> */
> +#define PSY_3GPP_EN_SPREAD_LOW_S 2.0f
>
> #define PSY_3GPP_RPEMIN 0.01f
> #define PSY_3GPP_RPELEV 2.0f
>
> +#define PSY_3GPP_C1 3.0f /* log2(8) */
> +#define PSY_3GPP_C2 1.3219281f /* log2(2.5) */
> +#define PSY_3GPP_C3 0.55935729f /* 1 - C2 / C1 */
> +
> +#define PSY_3GPP_SAVE_SLOPE_L -0.46666667f
> +#define PSY_3GPP_SAVE_SLOPE_S -0.36363637f
> +#define PSY_3GPP_SAVE_ADD_L -0.84285712f
> +#define PSY_3GPP_SAVE_ADD_S -0.75f
> +#define PSY_3GPP_SPEND_SLOPE_L 0.66666669f
> +#define PSY_3GPP_SPEND_SLOPE_S 0.81818181f
> +#define PSY_3GPP_SPEND_ADD_L -0.35f
> +#define PSY_3GPP_SPEND_ADD_S -0.26111111f
> +#define PSY_3GPP_CLIP_LO_L 0.2f
> +#define PSY_3GPP_CLIP_LO_S 0.2f
> +#define PSY_3GPP_CLIP_HI_L 0.95f
> +#define PSY_3GPP_CLIP_HI_S 0.75f
> +
> +#define PSY_3GPP_AH_THR_LONG 0.5f
> +#define PSY_3GPP_AH_THR_SHORT 0.63f
> +
> +enum {
> + PSY_3GPP_AH_NONE,
> + PSY_3GPP_AH_INACTIVE,
> + PSY_3GPP_AH_ACTIVE
> +};
> +
> +#define PSY_3GPP_BITS_TO_PE(bits) ((bits) * 1.18f)
> +
> /* LAME psy model constants */
> #define PSY_LAME_FIR_LEN 21 ///< LAME psy model FIR order
> #define AAC_BLOCK_SIZE_LONG 1024 ///< long block size
> @@ -63,6 +101,12 @@ typedef struct AacPsyBand{
> float energy; ///< band energy
> float thr; ///< energy threshold
> float thr_quiet; ///< threshold in quiet
> + float nz_lines; ///< number of non-zero spectral lines
> + float active_lines; ///< number of active spectral lines
> + float pe; ///< perceptual entropy
> + float pe_const; ///< constant part of the PE calculation
> + float norm_fac; ///< normalization factor for linearization
> + int avoid_holes; ///< hole avoidance flag
> }AacPsyBand;
>
> /**
> @@ -97,6 +141,15 @@ typedef struct AacPsyCoeffs{
> * 3GPP TS26.403-inspired psychoacoustic model specific data
> */
> typedef struct AacPsyContext{
> + int chan_bitrate; ///< bitrate per channel
> + int frame_bits; ///< average bits per frame
> + int fill_level; ///< bit reservoir fill level
> + struct {
> + float min; ///< minimum allowed PE for bit factor calculation
> + float max; ///< maximum allowed PE for bit factor calculation
> + float previous; ///< allowed PE of the previous frame
> + float correction; ///< PE correction factor
> + } pe;
> AacPsyCoeffs psy_coef[2][64];
> AacPsyChannel *ch;
> }AacPsyContext;
> @@ -235,16 +288,33 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
> AacPsyContext *pctx;
> float bark;
> int i, j, g, start;
> - float prev, minscale, minath;
> + float prev, minscale, minath, minsnr, pe_min;
> + const int chan_bitrate = ctx->avctx->bit_rate / ctx->avctx->channels;
> + /* FIXME: num_bark should use cutoff */
> + const float num_bark = calc_bark(ctx->avctx->sample_rate / 2.0f) -
> calc_bark(0.0f);
>
> ctx->model_priv_data = av_mallocz(sizeof(AacPsyContext));
> pctx = (AacPsyContext*) ctx->model_priv_data;
>
> + pctx->chan_bitrate = chan_bitrate;
> + pctx->frame_bits = chan_bitrate * AAC_BLOCK_SIZE_LONG /
> ctx->avctx->sample_rate;
> + pctx->pe.min = 8192.0f; /* FIXME: 0.8 * 10 * FRAME_LENGTH_LONG *
> bandwidth / (sample_rate / 2) */
> + pctx->pe.max = 12288.0f; /* FIXME: 1.2 * 10 * FRAME_LENGTH_LONG *
> bandwidth / (sample_rate / 2) */
What prevents you from using the values given in those FIXME comments?
> + ctx->bitres.size = 6144 - pctx->frame_bits;
> + ctx->bitres.size -= ctx->bitres.size % 8;
> + pctx->fill_level = ctx->bitres.size;
> minath = ath(3410, ATH_ADD);
> for (j = 0; j < 2; j++) {
> AacPsyCoeffs *coeffs = pctx->psy_coef[j];
> const uint8_t *band_sizes = ctx->bands[j];
> float line_to_frequency = ctx->avctx->sample_rate / (j ? 256.f :
> 2048.0f);
> + float avg_chan_bits = chan_bitrate / ctx->avctx->sample_rate * (j ?
> 128.0f : 1024.0f);
> + /* reference encoder uses 2.4% here instead of 60% like the spec
> says */
> + float bark_pe = 0.024f * PSY_3GPP_BITS_TO_PE(avg_chan_bits) /
> num_bark;
> + float en_spread_low = j ? PSY_3GPP_EN_SPREAD_LOW_S :
> PSY_3GPP_EN_SPREAD_LOW_L;
> + /* High energy spreading for long blocks <= 22kbps/channel and short
> blocks are the same. */
> + float en_spread_hi = (j || (chan_bitrate <= 22.0f)) ?
> PSY_3GPP_EN_SPREAD_HI_S : PSY_3GPP_EN_SPREAD_HI_L1;
> +
> i = 0;
> prev = 0.0;
> for (g = 0; g < ctx->num_bands[j]; g++) {
> @@ -258,6 +328,12 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
> float bark_width = coeffs[g+1].barks - coeffs->barks;
> coeff->spread_low[0] = pow(10.0, -bark_width *
> PSY_3GPP_THR_SPREAD_LOW);
> coeff->spread_hi [0] = pow(10.0, -bark_width *
> PSY_3GPP_THR_SPREAD_HI);
> + coeff->spread_low[1] = pow(10.0, -bark_width * en_spread_low);
> + coeff->spread_hi [1] = pow(10.0, -bark_width * en_spread_hi);
> + pe_min = bark_pe * bark_width;
> + minsnr = pow(2.0f, pe_min / band_sizes[g]) - 1.5f;
> + /* -25dB -1dB */
> + coeff->min_snr = av_clipf(1.0f / minsnr, 3.1622776e-3f,
> 7.9432821e-1f);
This comment looks slightly misaligned; also, something like "min SNR should be
in the range -1..-25 dB" would read better.
> }
> start = 0;
> for (g = 0; g < ctx->num_bands[j]; g++) {
> @@ -385,6 +461,88 @@ static FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx,
> return wi;
> }
>
> +/* 5.6.1.2 "Calculation of Bit Demand" */
> +static int calc_bit_demand(AacPsyContext *ctx, float pe, int bits, int size,
> int short_window)
> +{
> + const float bitsave_slope = short_window ? PSY_3GPP_SAVE_SLOPE_S :
> PSY_3GPP_SAVE_SLOPE_L;
> + const float bitsave_add = short_window ? PSY_3GPP_SAVE_ADD_S :
> PSY_3GPP_SAVE_ADD_L;
> + const float bitspend_slope = short_window ? PSY_3GPP_SPEND_SLOPE_S :
> PSY_3GPP_SPEND_SLOPE_L;
> + const float bitspend_add = short_window ? PSY_3GPP_SPEND_ADD_S :
> PSY_3GPP_SPEND_ADD_L;
> + const float clip_low = short_window ? PSY_3GPP_CLIP_LO_S :
> PSY_3GPP_CLIP_LO_L;
> + const float clip_high = short_window ? PSY_3GPP_CLIP_HI_S :
> PSY_3GPP_CLIP_HI_L;
> + float clipped_pe, bit_save, bit_spend, bit_factor, fill_level;
> +
> + ctx->fill_level += ctx->frame_bits - bits;
> + ctx->fill_level = av_clip(ctx->fill_level, 0, size);
> + fill_level = av_clipf((float)ctx->fill_level / size, clip_low,
> clip_high);
> + clipped_pe = av_clipf(pe, ctx->pe.min, ctx->pe.max);
> + bit_save = (fill_level + bitsave_add) * bitsave_slope;
> + assert(bit_save <= 0.3f && bit_save >= -0.05000001f);
> + bit_spend = (fill_level + bitspend_add) * bitspend_slope;
> + assert(bit_spend <= 0.5f && bit_spend >= -0.1f);
I suspect those asserts may trigger quite often with the current encoder, don't
they? If so, something like a reset should be done instead.
[the rest looks ok]
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel