On Tue, Apr 05, 2011 at 01:05:23AM -0600, Nathan Caldwell wrote:
> There are still a few sections missing relating to TNS (not present)
> and mid/side (contains other bugs).
> 
> Overall this improves quality, and vastly improves rate-control.
> ---
>  libavcodec/aacenc.c |    4 +-
>  libavcodec/aacpsy.c |  295 
> ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 294 insertions(+), 5 deletions(-)
> 
> diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c
> index d4b6112..4ec76d0 100644
> --- a/libavcodec/aacenc.c
> +++ b/libavcodec/aacenc.c
> @@ -606,8 +606,10 @@ static int aac_encode_frame(AVCodecContext *avctx,
>          }
>  
>          frame_bits = put_bits_count(&s->pb);
> -        if (frame_bits <= 6144 * avctx->channels - 3)
> +        if (frame_bits <= 6144 * avctx->channels - 3) {
> +            s->psy.bitres.bits = frame_bits / avctx->channels;
>              break;
> +        }
>  
>          s->lambda *= avctx->bit_rate * 1024.0f / avctx->sample_rate / 
> frame_bits;
>  
> diff --git a/libavcodec/aacpsy.c b/libavcodec/aacpsy.c
> index 4250a5d..413d364 100644
> --- a/libavcodec/aacpsy.c
> +++ b/libavcodec/aacpsy.c
> @@ -41,10 +41,48 @@
>   */
>  #define PSY_3GPP_THR_SPREAD_HI   1.5f // spreading factor for low-to-hi 
> threshold spreading  (15 dB/Bark)
>  #define PSY_3GPP_THR_SPREAD_LOW  3.0f // spreading factor for hi-to-low 
> threshold spreading  (30 dB/Bark)
> +/* spreading factor for low-to-hi energy spreading, long block, > 
> 22kbps/channel (20dB/Bark) */
> +#define PSY_3GPP_EN_SPREAD_HI_L1 2.0f
> +/* spreading factor for low-to-hi energy spreading, long block, <= 
> 22kbps/channel (15dB/Bark) */
> +#define PSY_3GPP_EN_SPREAD_HI_L2 1.5f
> +/* spreading factor for low-to-hi energy spreading, short block (15 dB/Bark) 
> */
> +#define PSY_3GPP_EN_SPREAD_HI_S  1.5f
> +/* spreading factor for hi-to-low energy spreading, long block (30dB/Bark) */
> +#define PSY_3GPP_EN_SPREAD_LOW_L 3.0f
> +/* spreading factor for hi-to-low energy spreading, short block (20dB/Bark) 
> */
> +#define PSY_3GPP_EN_SPREAD_LOW_S 2.0f
>  
>  #define PSY_3GPP_RPEMIN      0.01f
>  #define PSY_3GPP_RPELEV      2.0f
>  
> +#define PSY_3GPP_C1          3.0f        /* log2(8) */
> +#define PSY_3GPP_C2          1.3219281f  /* log2(2.5) */
> +#define PSY_3GPP_C3          0.55935729f /* 1 - C2 / C1 */
> +
> +#define PSY_3GPP_SAVE_SLOPE_L  -0.46666667f
> +#define PSY_3GPP_SAVE_SLOPE_S  -0.36363637f
> +#define PSY_3GPP_SAVE_ADD_L    -0.84285712f
> +#define PSY_3GPP_SAVE_ADD_S    -0.75f
> +#define PSY_3GPP_SPEND_SLOPE_L  0.66666669f
> +#define PSY_3GPP_SPEND_SLOPE_S  0.81818181f
> +#define PSY_3GPP_SPEND_ADD_L   -0.35f
> +#define PSY_3GPP_SPEND_ADD_S   -0.26111111f
> +#define PSY_3GPP_CLIP_LO_L      0.2f
> +#define PSY_3GPP_CLIP_LO_S      0.2f
> +#define PSY_3GPP_CLIP_HI_L      0.95f
> +#define PSY_3GPP_CLIP_HI_S      0.75f
> +
> +#define PSY_3GPP_AH_THR_LONG    0.5f
> +#define PSY_3GPP_AH_THR_SHORT   0.63f
> +
> +enum {
> +    PSY_3GPP_AH_NONE,
> +    PSY_3GPP_AH_INACTIVE,
> +    PSY_3GPP_AH_ACTIVE
> +};
> +
> +#define PSY_3GPP_BITS_TO_PE(bits) ((bits) * 1.18f)
> +
>  /* LAME psy model constants */
>  #define PSY_LAME_FIR_LEN 21         ///< LAME psy model FIR order
>  #define AAC_BLOCK_SIZE_LONG 1024    ///< long block size
> @@ -63,6 +101,12 @@ typedef struct AacPsyBand{
>      float energy;    ///< band energy
>      float thr;       ///< energy threshold
>      float thr_quiet; ///< threshold in quiet
> +    float nz_lines;     ///< number of non-zero spectral lines
> +    float active_lines; ///< number of active spectral lines
> +    float pe;           ///< perceptual entropy
> +    float pe_const;     ///< constant part of the PE calculation
> +    float norm_fac;     ///< normalization factor for linearization
> +    int   avoid_holes;  ///< hole avoidance flag
>  }AacPsyBand;
>  
>  /**
> @@ -97,6 +141,15 @@ typedef struct AacPsyCoeffs{
>   * 3GPP TS26.403-inspired psychoacoustic model specific data
>   */
>  typedef struct AacPsyContext{
> +    int chan_bitrate;     ///< bitrate per channel
> +    int frame_bits;       ///< average bits per frame
> +    int fill_level;       ///< bit reservoir fill level
> +    struct {
> +        float min;        ///< minimum allowed PE for bit factor calculation
> +        float max;        ///< maximum allowed PE for bit factor calculation
> +        float previous;   ///< allowed PE of the previous frame
> +        float correction; ///< PE correction factor
> +    } pe;
>      AacPsyCoeffs psy_coef[2][64];
>      AacPsyChannel *ch;
>  }AacPsyContext;
> @@ -235,16 +288,33 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
>      AacPsyContext *pctx;
>      float bark;
>      int i, j, g, start;
> -    float prev, minscale, minath;
> +    float prev, minscale, minath, minsnr, pe_min;
> +    const int chan_bitrate = ctx->avctx->bit_rate / ctx->avctx->channels;
> +    /* FIXME: num_bark should use cutoff */
> +    const float num_bark = calc_bark(ctx->avctx->sample_rate / 2.0f) - 
> calc_bark(0.0f);
>  
>      ctx->model_priv_data = av_mallocz(sizeof(AacPsyContext));
>      pctx = (AacPsyContext*) ctx->model_priv_data;
>  
> +    pctx->chan_bitrate = chan_bitrate;
> +    pctx->frame_bits = chan_bitrate * AAC_BLOCK_SIZE_LONG / 
> ctx->avctx->sample_rate;
> +    pctx->pe.min =  8192.0f;   /* FIXME: 0.8 * 10 * FRAME_LENGTH_LONG * 
> bandwidth / (sample_rate / 2) */
> +    pctx->pe.max = 12288.0f;   /* FIXME: 1.2 * 10 * FRAME_LENGTH_LONG * 
> bandwidth / (sample_rate / 2) */

What prevents you from using the formulas given in the FIXME comments instead of the hard-coded numbers?

> +    ctx->bitres.size  = 6144 - pctx->frame_bits;
> +    ctx->bitres.size -= ctx->bitres.size % 8;
> +    pctx->fill_level  = ctx->bitres.size;
>      minath = ath(3410, ATH_ADD);
>      for (j = 0; j < 2; j++) {
>          AacPsyCoeffs *coeffs = pctx->psy_coef[j];
>          const uint8_t *band_sizes = ctx->bands[j];
>          float line_to_frequency = ctx->avctx->sample_rate / (j ? 256.f : 
> 2048.0f);
> +        float avg_chan_bits = chan_bitrate / ctx->avctx->sample_rate * (j ? 
> 128.0f : 1024.0f);
> +        /* reference encoder uses 2.4% here instead of 60% like the spec 
> says */
> +        float bark_pe = 0.024f * PSY_3GPP_BITS_TO_PE(avg_chan_bits) / 
> num_bark;
> +        float en_spread_low = j ? PSY_3GPP_EN_SPREAD_LOW_S : 
> PSY_3GPP_EN_SPREAD_LOW_L;
> +        /* High energy spreading for long blocks <= 22kbps/channel and short 
> blocks are the same. */
> +        float en_spread_hi  = (j || (chan_bitrate <= 22.0f)) ? 
> PSY_3GPP_EN_SPREAD_HI_S : PSY_3GPP_EN_SPREAD_HI_L1;
> +
>          i = 0;
>          prev = 0.0;
>          for (g = 0; g < ctx->num_bands[j]; g++) {
> @@ -258,6 +328,12 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
>              float bark_width = coeffs[g+1].barks - coeffs->barks;
>              coeff->spread_low[0] = pow(10.0, -bark_width * 
> PSY_3GPP_THR_SPREAD_LOW);
>              coeff->spread_hi [0] = pow(10.0, -bark_width * 
> PSY_3GPP_THR_SPREAD_HI);
> +            coeff->spread_low[1] = pow(10.0, -bark_width * en_spread_low);
> +            coeff->spread_hi [1] = pow(10.0, -bark_width * en_spread_hi);
> +            pe_min = bark_pe * bark_width;
> +            minsnr = pow(2.0f, pe_min / band_sizes[g]) - 1.5f;
> +                                                  /* -25dB          -1dB */
> +            coeff->min_snr = av_clipf(1.0f / minsnr, 3.1622776e-3f, 
> 7.9432821e-1f);

This comment looks slightly misaligned. Also, something like "min SNR should be
in range -1..-25 dB" would read better.

>          }
>          start = 0;
>          for (g = 0; g < ctx->num_bands[j]; g++) {
> @@ -385,6 +461,88 @@ static FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx,
>      return wi;
>  }
>  
> +/* 5.6.1.2 "Calculation of Bit Demand" */
> +static int calc_bit_demand(AacPsyContext *ctx, float pe, int bits, int size, 
> int short_window)
> +{
> +    const float bitsave_slope  = short_window ? PSY_3GPP_SAVE_SLOPE_S  : 
> PSY_3GPP_SAVE_SLOPE_L;
> +    const float bitsave_add    = short_window ? PSY_3GPP_SAVE_ADD_S    : 
> PSY_3GPP_SAVE_ADD_L;
> +    const float bitspend_slope = short_window ? PSY_3GPP_SPEND_SLOPE_S : 
> PSY_3GPP_SPEND_SLOPE_L;
> +    const float bitspend_add   = short_window ? PSY_3GPP_SPEND_ADD_S   : 
> PSY_3GPP_SPEND_ADD_L;
> +    const float clip_low       = short_window ? PSY_3GPP_CLIP_LO_S     : 
> PSY_3GPP_CLIP_LO_L;
> +    const float clip_high      = short_window ? PSY_3GPP_CLIP_HI_S     : 
> PSY_3GPP_CLIP_HI_L;
> +    float clipped_pe, bit_save, bit_spend, bit_factor, fill_level;
> +
> +    ctx->fill_level += ctx->frame_bits - bits;
> +    ctx->fill_level  = av_clip(ctx->fill_level, 0, size);
> +    fill_level = av_clipf((float)ctx->fill_level / size, clip_low, 
> clip_high);
> +    clipped_pe = av_clipf(pe, ctx->pe.min, ctx->pe.max);
> +    bit_save   = (fill_level + bitsave_add) * bitsave_slope;
> +    assert(bit_save <= 0.3f && bit_save >= -0.05000001f);
> +    bit_spend  = (fill_level + bitspend_add) * bitspend_slope;
> +    assert(bit_spend <= 0.5f && bit_spend >= -0.1f);

I suspect those asserts may trigger quite often with the current encoder, don't
they? If so, something like a reset should be done instead.

[the rest looks ok]
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to