This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit d1943afbb787955db996ae7f2254f2d9f22d35f9 Author: Lynne <[email protected]> AuthorDate: Wed Jun 10 14:46:00 2026 +0900 Commit: Lynne <[email protected]> CommitDate: Wed Jul 1 21:05:27 2026 +0900 aaccoder: add NMR-based coder --- libavcodec/aaccoder.c | 14 + libavcodec/aaccoder_nmr.h | 700 ++++++++++++++++++++++++++++++++++++++++++ libavcodec/aaccoder_twoloop.h | 29 +- libavcodec/aacenc.c | 356 +++++++++++++++++++-- libavcodec/aacenc.h | 26 ++ libavcodec/aacencdsp.c | 32 +- libavcodec/aacencdsp.h | 6 + libavcodec/aacpsy.c | 43 ++- libavcodec/psymodel.c | 4 +- libavcodec/psymodel.h | 3 +- 10 files changed, 1162 insertions(+), 51 deletions(-) diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c index 7f1c4cdcc1..18076bda78 100644 --- a/libavcodec/aaccoder.c +++ b/libavcodec/aaccoder.c @@ -59,6 +59,7 @@ #define NOISE_LAMBDA_REPLACE 1.948f #include "libavcodec/aaccoder_trellis.h" +#include "libavcodec/aaccoder_nmr.h" typedef float (*quantize_and_encode_band_func)(struct AACEncContext *s, PutBitContext *pb, const float *in, float *quant, const float *scaled, @@ -867,4 +868,17 @@ const AACCoefficientsEncoder ff_aac_coders[AAC_CODER_NB] = { search_for_ms, ff_aac_search_for_is, }, + [AAC_CODER_NMR] = { + search_for_quantizers_nmr, + codebook_trellis_rate, + quantize_and_encode_band, + ff_aac_encode_tns_info, + ff_aac_apply_tns, + set_special_band_scalefactors, + NULL, /* PNS decided in the trellis (search_for_quantizers_nmr) */ + mark_pns, + ff_aac_search_for_tns, + NULL, + NULL, + }, }; diff --git a/libavcodec/aaccoder_nmr.h b/libavcodec/aaccoder_nmr.h new file mode 100644 index 0000000000..7a01a57570 --- /dev/null +++ b/libavcodec/aaccoder_nmr.h @@ -0,0 +1,700 @@ +/* + * AAC encoder NMR (noise-to-mask ratio) scalefactor coder + * Copyright (c) 2026 Lynne <[email protected]> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * AAC encoder NMR scalefactor coder. + * + * Optimizes the same noise-to-mask objective as the two-loop coder, but with an + * optimal Viterbi search over scalefactors instead of a heuristic loop. For each + * coded band the per-scalefactor distortion/bits curve is precomputed, then a + * trellis over the (window-group, band) coding sequence minimizes + * sum over g of [ dist_g(sf_g)/threshold_g + + * lambda * (spectral_bits_g(sf_g) + scalefactor_differential_bits_g) ] + * with |sf_g - sf_{g-1}| <= SCALE_MAX_DIFF as a hard constraint, and lambda + * binary-searched so the coded size meets the per-frame bit budget. + * + * Perceptual noise substitution (PNS) is integrated into the same objective: once + * the trellis settles on its operating lambda, each noise-like band (flagged by + * mark_pns) is offered a terminal "code as noise" candidate whose cost is + * nmr_pns + lambda*NMR_PNS_BITS. Because NMR_PNS_BITS is far below a band's spectral bit + * count, this candidate only wins when lambda is large, i.e. when the encoder is + * struggling to hold the bitrate. The bits freed by the chosen PNS bands are + * then re-spent by a second trellis pass over the remaining bands.
+ */ + +#ifndef AVCODEC_AACCODER_NMR_H +#define AVCODEC_AACCODER_NMR_H + +#include <float.h> +#include <string.h> +#include "libavutil/mathematics.h" +#include "mathops.h" +#include "avcodec.h" +#include "put_bits.h" +#include "aac.h" +#include "aacenc.h" +#include "aactab.h" +#include "aacenctab.h" + +/* differential scalefactor coding cost, clamped to the legal delta range */ +#define NMR_SFBITS(d) ff_aac_scalefactor_bits[av_clip((d) + SCALE_DIFF_ZERO, 0, 2*SCALE_MAX_DIFF)] + +#define NMR_ITERS 14 /* lambda binary-search iters */ +#define NMR_IFINE 9 /* fine-pass lambda iters */ +#define NMR_CITERS 7 /* coarse-pass lambda iters */ +#define NMR_CWARM 5 /* coarse-pass iters when warm-started off the previous frame's + * lambda: the bracket spans 10 octaves instead of ~43, so fewer + * bisection steps reach the same resolution */ +#define NMR_COARSE 8 /* two-pass coarse->fine grid step, cuts the Viterbi ncand^2 with no + * quality loss, 0 disables it (single full-resolution pass) */ +#define NMR_STEP 1 /* fine-pass scalefactor candidate granularity */ + +#define NMR_PNS_BITS 9 /* approx cost in bits of signalling PNS */ + +/* Spectral-hole fill: noise-like bands the trellis left mostly empty are filled with + * energy-matched noise (PNS); an audible hole sounds worse than matched noise. */ +#define NMR_PNS_HOLE_FRAC 0.5f +#define NMR_PNS_HOLE_SPREAD 0.5f + +/* RC servo gain: scale the corridor centre by exp2(-K*fill/R) each frame to hold + * the long-run mean rate; without it a bad centre drifts for dozens of frames. */ +#define NMR_RC_K_CBR 0.5f + +#define NMR_RC_ITERS 8 /* lambda bisection iters when clamping an over-cap frame */ +/* Corridor: bisect within [lam_rc/NMR_RC_CORR, lam_rc*NMR_RC_CORR] so quality stays + * smooth while per-frame demand is tracked; 1.5 cuts lambda jitter ~25%. */ +#define NMR_RC_CORR 1.5f + +/* Leaky-bucket half-depth (bits/ch); 512 is the sweet spot — tighter rebounds as + * frames cannot hit the narrow window. 
Clamped to the 6144 bits/ch decoder buffer. */ +#define NMR_CBR_BUF 512 +#define NMR_RC_CITERS 3 /* corridor coarse-pass iters */ + +/* Transient bit-burst: an isolated onset (preceded by >= NMR_BURST_GAP long frames) + * is coded NMR_BURST_GAIN x finer, held uniform across the run, repaid from steady stretches. */ +#define NMR_BURST_GAP 10 +#define NMR_BURST_GAIN 8.0f +#define NMR_RC_FITERS 4 /* corridor fine-pass iters */ +#define NMR_RC_TRACK 0.1f /* per-frame pull of the corridor centre toward the realized lambda */ + +/* PNS noise-distortion gate: only bands coded well above the masking floor become noise. */ +#define NMR_PNS_NDGATE 4.0f + +/* Energy/threshold cap for PNS: loud bands (energy >> mask) yield clipping random peaks; + * only near-masked bands are safe substitution targets. */ +#define NMR_PNS_MAX_ET 8.0f + +/* Operating-lambda floor for PNS: below it the encoder is not struggling, so + * substituting real texture for 9 signalling bits is net-negative. */ +#define NMR_PNS_LAM 100.0f + +/** + * Viterbi over the coding sequence act[0..nact-1] (indices into the per-band + * curves nd/nb), with lambda binary-searched so the coded size ~ destbits. + * Fills chosen[band] for every band referenced by act. Returns the operating + * lambda. node cost = dist/threshold + lambda*spectral_bits; + * edge cost = lambda*sf_differential_bits; |delta sf| <= SCALE_MAX_DIFF hard. 
+ */ +static float nmr_solve(AACEncContext *s, + const float (*nd)[NMR_NCAND], const int (*nb)[NMR_NCAND], + const int *blo, const int *bnc, int step, + const int *act, int nact, int destbits, int *chosen, + float lo_l, float hi_l, int iters) +{ + float dp[NMR_NCAND], dpp[NMR_NCAND], node[NMR_NCAND]; + float lamsf[2*SCALE_MAX_DIFF + 1]; /* lambda * scalefactor-delta bit cost, rebuilt for every trial lambda */ + uint8_t bp[128][NMR_NCAND]; + float lam = 1.0f; + + if (nact <= 0) + return lam; + + for (int it = 0; it < iters; it++) { + lam = sqrtf(lo_l * hi_l); + for (int i = 0; i <= 2*SCALE_MAX_DIFF; i++) + lamsf[i] = lam * ff_aac_scalefactor_bits[i]; /* edge cost for this lambda */ + + int b0 = act[0]; + for (int o = 0; o < bnc[b0]; o++) + dp[o] = nd[b0][o] + lam * nb[b0][o]; /* first band of the sequence: node cost only, no incoming edge */ + + for (int k = 1; k < nact; k++) { + int b = act[k], pb = act[k-1]; + memcpy(dpp, dp, sizeof(dp)); + for (int o = 0; o < bnc[b]; o++) + node[o] = nd[b][o] + lam * nb[b][o]; + /* dp[o] = node[o] + min_op(dpp[op] + edge cost); bp[k][o] is read back below as the + * chosen predecessor candidate (presumably filled by the DSP hook -- confirm there) */ + s->aacdsp.nmr_trellis_step(dp, bp[k], dpp, node, lamsf, + bnc[b], bnc[pb], blo[b] - blo[pb], step, + SCALE_MAX_DIFF); + } + + /* backtrack: take the cheapest candidate of the last band, then follow bp[] back to act[0] */ + int beo = 0, b = act[nact-1]; + float bec = FLT_MAX; + for (int o = 0; o < bnc[b]; o++) + if (dp[o] < bec) { bec = dp[o]; beo = o; } + chosen[b] = beo; + for (int k = nact-1; k > 0; k--) + chosen[act[k-1]] = bp[k][chosen[act[k]]]; + + /* bit count of this solution: spectral bits of every active band plus the + * scalefactor-delta bits between consecutive active bands */ + int total = 0; + for (int k = 0; k < nact; k++) + total += nb[act[k]][chosen[act[k]]]; + for (int k = 1; k < nact; k++) + total += NMR_SFBITS((blo[act[k]]+chosen[act[k]]*step) - (blo[act[k-1]]+chosen[act[k-1]]*step)); + + if (it == iters - 1) + break; + + /* not the last pass: bisect the [lo_l, hi_l] bracket (geometric midpoint above). + * Over budget -> raise the lower bound so the next lambda is larger (coarser coding), + * otherwise lower the upper bound. The last pass leaves lam and chosen[] as computed. */ + if (total > destbits) + lo_l = lam; + else + hi_l = lam; + } + return lam; +} + +/* Build one coded band's (dist/threshold, bits) cost curve, candidates sf = lo + o*step + * for o in [0,maxn), stopping when the band would drop (cb <= 0). Returns the number of + * candidates written to nd_row/nb_row (0 when the band is not codeable even at sf = lo).
*/ +static int nmr_band_curve(AACEncContext *s, SingleChannelElement *sce, int w, int g, + int start, int lo, int step, int maxn, float invthr, + float maxval, float *nd_row, int *nb_row) +{ + int ncand = 0; + for (int o = 0; o < maxn && lo + o*step <= SCALE_MAX_POS; o++) { + int sf = lo + o*step, btot = 0, cb = find_min_book(maxval, sf); + float dist = 0.0f; + if (cb <= 0) + break; + for (int w2 = 0; w2 < sce->ics.group_len[w]; w2++) { + int bb; + dist += quantize_band_cost_cached(s, w + w2, g, sce->coeffs + start + w2*128, + s->scoefs + start + w2*128, sce->ics.swb_sizes[g], + sf, cb, 1.0f, INFINITY, &bb, NULL, 0); + btot += bb; + } + nd_row[ncand] = (dist - btot) * invthr; + nb_row[ncand] = btot; + ncand++; + } + return ncand; +} + +static void search_for_quantizers_nmr(AVCodecContext *avctx, + AACEncContext *s, + SingleChannelElement *sce, + const float lambda) +{ + int bch = ((avctx->flags & AV_CODEC_FLAG_QSCALE) ? 2.0f : avctx->ch_layout.nb_channels); + int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / bch * (lambda / 120.f); + int allz = 0, cutoff = 1024, nbnd = 0; + + float thr[128]; /* allocation-law effective threshold (drives the trellis) */ + float thr_real[128]; /* real masking threshold (perceptual gates: PNS) */ + float pener[128]; /* band energy (for PNS noise target) */ + float pspread[128]; /* band tonality spread (1 = noise) */ + int minsf[128]; + float maxvals[128]; + + /* coded-band trellis state (indexed 0..nbnd-1) */ + int bidx[128]; /* sce band index (w*16+g) */ + int bw[128], bg[128], bst[128]; /* window group, swb, coef start per coded band */ + int blo[128]; /* finest candidate scalefactor */ + int bnc[128]; /* number of candidates */ + int chosen[128]; + int act[128]; /* active (non-PNS) band coding order */ + uint8_t is_pns[128]; /* trellis band coded as noise */ + + float (*nd)[NMR_NCAND] = s->nmr->nd; /* dist / threshold per candidate (heap) */ + int (*nb)[NMR_NCAND] = s->nmr->nb; /* spectral bits per candidate (heap) */ 
+ + /* two-pass coarse->fine grid step (see NMR_COARSE), the lambda search runs on + * the cheap coarse grid, PASS 2 refines the winner at NMR_STEP granularity */ + const int cstep = NMR_COARSE > 0 ? NMR_COARSE : NMR_STEP; + + s->nmr->counted[s->cur_channel] = 0; + + /* Global-lambda RC: one solve per frame at a servoed centre lambda; the reservoir + * holds the long-run mean rate. Bypassed for VBR (-q:a) and the bootstrap frame. */ + int rc_eligible = !(avctx->flags & AV_CODEC_FLAG_QSCALE) && avctx->bit_rate > 0 && + avctx->bit_rate_tolerance != 0; + /* Leaky-bucket reservoir: rc_fill (signed +-rc_bmax); the spend-floor/cap below force + * lambda so no frame banks past +rc_bmax or borrows past -rc_bmax. */ + int rc_rate_frame = avctx->bit_rate * 1024.0 / avctx->sample_rate; + int rc_bmax = FFMIN(FFMAX(6144 * s->channels - rc_rate_frame, 256), NMR_CBR_BUF * s->channels); + if (rc_eligible && avctx->frame_num != s->nmr->rc_frame_num) { + if (s->nmr->rc_frame_num > 0 && s->nmr->lam_rc > 0.0f) + s->nmr->rc_fill = av_clip(s->nmr->rc_fill + rc_rate_frame - s->last_frame_pb_count, + -rc_bmax, rc_bmax); + s->nmr->rc_frame_num = avctx->frame_num; + + /* Transient burst run state: set at run start and held across the run so + * coding stays uniform; repaid from the reservoir's steady stretches. */ + int is_short = sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE; + if (is_short) { + if (!s->nmr->prev_was_short) /* run start */ + s->nmr->run_burst = s->nmr->frames_since_short >= NMR_BURST_GAP + ? NMR_BURST_GAIN : 1.0f; + s->nmr->frames_since_short = 0; + } else { + s->nmr->run_burst = 1.0f; + s->nmr->frames_since_short++; + } + s->nmr->prev_was_short = is_short; + } + int rc_global = rc_eligible && s->nmr->lam_rc > 0.0f; + + if (s->psy.bitres.alloc >= 0) + destbits = s->psy.bitres.alloc * + (lambda / (avctx->global_quality ? 
avctx->global_quality : 120)); + if (rc_global && s->psy.bitres.alloc >= 0) + /* uniform CBR target: nominal rate plus fast reservoir repayment */ + destbits = (avctx->bit_rate * 1024.0 / avctx->sample_rate + + s->nmr->rc_fill / 2.0) / s->channels; + destbits = FFMIN(destbits, 5800); + /* honest budget: subtract the measured non-trellis overhead (section data, ICS, + * sf/PNS signalling), which is rate-dependent hence adaptive. */ + if (s->nmr->side_inited) + destbits = av_clip(destbits - (int)(s->nmr->side_ema / s->channels), 64, 5800); + + /* Apply the held transient burst factor (set in the run-state machine above). */ + if (sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE && s->nmr->run_burst > 1.0f) + destbits = av_clip((int)(destbits * s->nmr->run_burst), 64, 6800); + + /* band cutoff index for this frame's window size; the bandwidth is fixed + * at init and shared with the psy model */ + cutoff = s->bandwidth * 2 * (1024 / sce->ics.num_windows) / avctx->sample_rate; + + /* Short-block transient noise shaping (pairs with short-block TNS): temporal + * premasking clamps each window's threshold toward the preceding windows' + * (Apple's preEchoReduction), and flat-residual flattens each window's thresholds + * to their per-window mean so TNS synthesis has a white floor to concentrate. 
*/ + if (sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE) { + const float pm_p1 = 0.1f, pm_p2 = 2.0f, pm_p3 = 4.0f; + for (int g = 0; g < sce->ics.num_swb; g++) { + float t1 = FLT_MAX, t2 = FLT_MAX; /* original thr of w-1, w-2 */ + for (int w = 0; w < sce->ics.num_windows; w++) { + FFPsyBand *b = &s->psy.ch[s->cur_channel].psy_bands[w*16+g]; + float t = b->threshold; + float c = FFMIN(t, FFMIN(t1*pm_p2, t2*pm_p3)); + b->threshold = FFMAX(c, t*pm_p1); + t2 = t1; t1 = t; + } + } + { + for (int w = 0; w < sce->ics.num_windows; w++) { + float sum = 0.0f; int n = 0; + for (int g = 0; g < sce->ics.num_swb; g++) { + FFPsyBand *b = &s->psy.ch[s->cur_channel].psy_bands[w*16+g]; + if (b->energy > b->threshold && b->threshold > 0.0f) { sum += b->threshold; n++; } + } + if (n > 0) { + float mean = sum / n; + for (int g = 0; g < sce->ics.num_swb; g++) { + FFPsyBand *b = &s->psy.ch[s->cur_channel].psy_bands[w*16+g]; + if (b->energy > b->threshold && b->threshold > 0.0f) + b->threshold = mean; + } + } + } + } + } + + /* Allocation curve to favour high frequencies */ + const float a_ae = 0.443f, a_at = 0.111f; + for (int w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) { + int start = 0; + for (int g = 0; g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) { + float uplim = 0.0f, ener = 0.0f, spread = 2.0f; + int nz = 0; + if (sce->band_type[w*16+g] == INTENSITY_BT || + sce->band_type[w*16+g] == INTENSITY_BT2) { + /* pre-decided intensity band (right channel): keep its + * signalling, it is not trellis-coded */ + for (int w2 = 0; w2 < sce->ics.group_len[w]; w2++) + sce->zeroes[(w+w2)*16+g] = 0; + continue; + } + for (int w2 = 0; w2 < sce->ics.group_len[w]; w2++) { + FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g]; + ener += band->energy; + spread = FFMIN(spread, band->spread); + if (start >= cutoff || band->energy <= band->threshold || band->threshold == 0.0f) { + sce->zeroes[(w+w2)*16+g] = 1; + continue; + } + uplim += band->threshold; + nz 
= 1; + } + sce->zeroes[w*16+g] = !nz; + thr_real[w*16+g] = uplim; /* real mask, before the allocation law (PNS gate) */ + if (nz && ener > 0.0f && uplim > 0.0f) + uplim = expf(a_ae * logf(ener) + a_at * logf(uplim)); + thr[w*16+g] = uplim; + pener[w*16+g] = ener; + pspread[w*16+g] = spread; + allz |= nz; + } + } + if (!allz) + goto bail; + + s->aacdsp.abs_pow34(s->scoefs, sce->coeffs, 1024); + ff_quantize_band_cost_cache_init(s); + + /* finest codeable scalefactor and max value per band */ + for (int w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) { + int start = w*128; + for (int g = 0; g < sce->ics.num_swb; g++) { + maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], s->scoefs + start); + minsf[w*16+g] = maxvals[w*16+g] > 0 ? coef2minsf(maxvals[w*16+g]) : 0; + start += sce->ics.swb_sizes[g]; + } + } + + /* PASS 1: + * precompute each coded band's cost curve at the coarse candidate step + * (the lambda search runs on this cheap grid, PASS 2 refines the winner) */ + { + for (int w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) { + int start = w*128; + for (int g = 0; g < sce->ics.num_swb; g++) { + if (!sce->zeroes[w*16+g] && maxvals[w*16+g] > 0 && nbnd < 128) { + int lo = av_clip(minsf[w*16+g], 0, SCALE_MAX_POS); + float invthr = 1.0f / FFMAX(thr[w*16+g], 1e-9f); + int ncand = nmr_band_curve(s, sce, w, g, start, lo, cstep, NMR_NCAND, + invthr, maxvals[w*16+g], nd[nbnd], nb[nbnd]); + if (ncand == 0) { + /* nothing codeable -> drop the whole group band. The + * subwindow flags must be cleared too: the encoder later + * re-derives the group flag by ANDing them, which would + * resurrect the band with a never-assigned scalefactor. 
*/ + for (int w2 = 0; w2 < sce->ics.group_len[w]; w2++) + sce->zeroes[(w+w2)*16+g] = 1; + } else { + bidx[nbnd] = w*16+g; + bw[nbnd] = w; + bg[nbnd] = g; + bst[nbnd] = start; + blo[nbnd] = lo; + bnc[nbnd] = ncand; + nbnd++; + } + } + start += sce->ics.swb_sizes[g]; + } + } + } + if (!nbnd) + goto bail; + + /* solve the trellis over all coded bands, then offer PNS at the operating + * lambda and re-solve over the survivors with the freed budget */ + { + int nact = nbnd, pns_count = 0; + float lam0 = s->nmr->lam[s->cur_channel]; + float lam; + + for (int b = 0; b < nbnd; b++) { + act[b] = b; + is_pns[b] = 0; + } + if (rc_global) { + /* bisect to this frame's bit demand within the corridor around the + * servoed lambda: per-frame psy demand is tracked, but lambda cannot + * jump, which keeps quality smooth across frames */ + float lo = s->nmr->lam_rc / NMR_RC_CORR; + /* Transient burst: widen the lower lambda bound so the bisection can actually + * pour the boosted destbits into an onset frame (finer coding kills the + * pre-echo); reservoir servo repays it from the steady frames. run_burst==1 on + * non-onset frames leaves the corridor unchanged. 
*/ + if (sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE && s->nmr->run_burst > 1.0f) + lo /= s->nmr->run_burst; + lam = nmr_solve(s, nd, nb, blo, bnc, cstep, act, nact, destbits, chosen, + lo, s->nmr->lam_rc * NMR_RC_CORR, + NMR_RC_CITERS); + + int tot = 0; + for (int k = 0; k < nact; k++) + tot += nb[act[k]][chosen[act[k]]]; + for (int k = 1; k < nact; k++) + tot += NMR_SFBITS((blo[act[k]]+chosen[act[k]]*cstep) - (blo[act[k-1]]+chosen[act[k-1]]*cstep)); + int hardcap = av_clip((int)(5800.f * FFMIN(1.f, lambda / 120.f)), 256, 5800); + /* leaky-bucket window: don't borrow past -rc_bmax (cap) or bank past +rc_bmax (floor) */ + int rc_cap = FFMIN(hardcap, (s->nmr->rc_fill + rc_rate_frame + rc_bmax) / s->channels); + int rc_floor = FFMAX(0, (s->nmr->rc_fill + rc_rate_frame - rc_bmax) / s->channels); + if (tot > rc_cap) + lam = nmr_solve(s, nd, nb, blo, bnc, cstep, act, nact, rc_cap, chosen, + lam, 1e4f, NMR_CITERS); + else if (tot < rc_floor) + lam = nmr_solve(s, nd, nb, blo, bnc, cstep, act, nact, rc_floor, chosen, + 1e-9f, lam, NMR_CITERS); + } else if (NMR_COARSE > 0 && lam0 > 0.0f) { + /* per-frame bisection; lambda is strongly frame-correlated, so when a + * previous frame's operating lambda exists, bisect a narrow bracket + * around it. A result near the bracket edge means the budget crossing + * lies outside (hard content transition) == redo the full search. */ + lam = nmr_solve(s, nd, nb, blo, bnc, cstep, act, nact, destbits, chosen, + lam0/32.0f, lam0*32.0f, NMR_CWARM); + if (lam < lam0/16.0f || lam > lam0*16.0f) + lam0 = 0.0f; + } + if (!rc_global && lam0 <= 0.0f) + lam = nmr_solve(s, nd, nb, blo, bnc, cstep, act, nact, destbits, chosen, + 1e-9f, 1e4f, NMR_COARSE > 0 ? NMR_CITERS : NMR_ITERS); + + /* PASS 2: + * refine each band at full granularity (NMR_STEP) in a +/-cstep window + * around the coarse pick, then re-solve. Recovers single-pass quality while the + * lambda search stayed cheap on the coarse grid. 
*/ + if (NMR_COARSE > 0) { + /* nmr_speed, 0 = slowest/best, higher = faster. It narrows the fine + * refine +/-window (scalefactors) below NMR_COARSE: at speed 0 the window + * spans the whole coarse-grid gap, so the two-pass result matches the + * exhaustive single-pass search. + * Each speed level shaves one sf off the window. + * At @64k mono (Zim / xRT): speed 0 -> 0.00095/15x, + * 2 -> 0.00096/18x, 3 -> 0.00100/20x, 4 -> 0.00103/22x */ + int win = NMR_COARSE - av_clip(s->options.nmr_speed, 0, 4); + for (int b = 0; b < nbnd; b++) { + int center = blo[b] + chosen[b]*cstep; + int flo = av_clip(center - win, av_clip(minsf[bidx[b]], 0, SCALE_MAX_POS), SCALE_MAX_POS); + int maxn = FFMIN(NMR_NCAND, 2*win/NMR_STEP + 1); + float invthr = 1.0f / FFMAX(thr[bidx[b]], 1e-9f); + int ncand = nmr_band_curve(s, sce, bw[b], bg[b], bst[b], flo, NMR_STEP, maxn, + invthr, maxvals[bidx[b]], nd[b], nb[b]); + blo[b] = flo; + bnc[b] = FFMAX(1, ncand); + } + /* fine pass: narrow corridor around the coarse solve */ + if (rc_global) + lam = nmr_solve(s, nd, nb, blo, bnc, NMR_STEP, act, nact, destbits, chosen, + lam/2.0f, lam*2.0f, NMR_RC_FITERS); + else + lam = nmr_solve(s, nd, nb, blo, bnc, NMR_STEP, act, nact, destbits, chosen, + lam/16.0f, lam*16.0f, NMR_IFINE); + } + + if (rc_global) { + /* leaky-bucket clamp: keep the frame within [rc_floor, rc_cap] so the reservoir + * stays in +-rc_bmax -- clamp lambda UP if it would borrow past the cap, DOWN if it + * would bank past the floor (spend-floor). The hard cap follows the encoder's outer + * lambda so the (rare) hard-overflow re-encode -- which shrinks that lambda -- always + * converges; on the first pass lambda is nominal and this is 5800. 
*/ + int hardcap = av_clip((int)(5800.f * FFMIN(1.f, lambda / 120.f)), 256, 5800); + int tot = 0; + for (int k = 0; k < nact; k++) + tot += nb[act[k]][chosen[act[k]]]; + for (int k = 1; k < nact; k++) + tot += NMR_SFBITS((blo[act[k]]+chosen[act[k]]*NMR_STEP) - (blo[act[k-1]]+chosen[act[k-1]]*NMR_STEP)); + int rc_cap = FFMIN(hardcap, (s->nmr->rc_fill + rc_rate_frame + rc_bmax) / s->channels); + int rc_floor = FFMAX(0, (s->nmr->rc_fill + rc_rate_frame - rc_bmax) / s->channels); + if (tot > rc_cap) + lam = nmr_solve(s, nd, nb, blo, bnc, NMR_STEP, act, nact, rc_cap, chosen, + lam, 1e4f, NMR_RC_ITERS); + else if (tot < rc_floor) + lam = nmr_solve(s, nd, nb, blo, bnc, NMR_STEP, act, nact, rc_floor, chosen, + 1e-9f, lam, NMR_RC_ITERS); + } + + s->nmr->lam[s->cur_channel] = lam; /* warm start for the next frame */ + if (rc_global) { + /* drag the corridor centre toward the realized lambda so it follows + * content drift faster than the reservoir term alone */ + float c = s->nmr->lam_rc * powf(lam / s->nmr->lam_rc, NMR_RC_TRACK); + /* then servo the centre off the reservoir error so the long-run rate + * returns to nominal. rc_fill>0 = bits banked (undershooting) -> lower + * lambda to spend them; <0 -> raise it. This is what holds the mean; + * the corridor tracking alone has no rate authority and a bad centre + * would otherwise drift for dozens of frames, starving each one. 
*/ + float R = avctx->bit_rate * 1024.0 / avctx->sample_rate; + c *= exp2f(-NMR_RC_K_CBR * s->nmr->rc_fill / R); + s->nmr->lam_rc = av_clipf(c, 1e-6f, 1e4f); + } else if (rc_eligible && nbnd >= 8) { + /* bootstrap the servo off the first substantive frame; near-silent + * lead-in frames have degenerate budgets that rail the bisection to + * a nonsense lambda and would poison the whole stream */ + s->nmr->lam_rc = av_clipf(lam, 1e-4f, 10.0f); + } + + { /* PNS */ + const float pns_lam = NMR_PNS_LAM; + /* band 0 (lowest freq) is kept as the global-gain / sf-chain anchor */ + for (int b = 1; b < nbnd; b++) { + int bi = bidx[b]; + float spread = pspread[bi]; + float nmr_pns, cost_keep, cost_pns, frac; + if (!sce->can_pns[bi]) + continue; + + /* Loud-band guard: never substitute a band whose energy is far above the + * masking threshold -- energy-matched noise on a dominant band clips/pops + * (and is audibly wrong). PNS is for near-masked noise only. */ + if (pener[bi] > NMR_PNS_MAX_ET * thr_real[bi]) + continue; + + /* Struggle gate: no PNS at all unless the encoder is genuinely under bit + * pressure (high operating lambda). 
*/ + if (lam <= pns_lam) + continue; + + /* Spectral-hole fill: a noise-like band the trellis left mostly empty */ + frac = nd[b][chosen[b]] * thr[bi] / FFMAX(pener[bi], 1e-9f); + if (spread > NMR_PNS_HOLE_SPREAD && frac > NMR_PNS_HOLE_FRAC) { + is_pns[b] = 1; + pns_count++; + continue; + } + + /* Only replace a band that is being coded audibly badly */ + if (nd[b][chosen[b]] * thr[bi] <= NMR_PNS_NDGATE * thr_real[bi]) + continue; + + /* perceptual cost of replacing the band with energy-matched noise: + * the non-noise-like fraction of its energy, in dist/threshold units */ + nmr_pns = FFMAX(0.0f, pener[bi] * (1.0f - spread*spread)) + / FFMAX(thr[bi], 1e-9f); + cost_keep = nd[b][chosen[b]] + lam * nb[b][chosen[b]]; + cost_pns = nmr_pns + lam * NMR_PNS_BITS; + if (cost_pns < cost_keep) { + is_pns[b] = 1; + pns_count++; + } + } + if (pns_count) { + int budget2 = destbits - pns_count * NMR_PNS_BITS; + nact = 0; + for (int b = 0; b < nbnd; b++) + if (!is_pns[b]) + act[nact++] = b; + /* re-solve over the survivors: at fixed lambda the allocation is + * the same except for the repaired sf-delta chain; in bisection + * mode re-spend the freed budget */ + if (rc_global) + nmr_solve(s, nd, nb, blo, bnc, NMR_STEP, act, nact, budget2, chosen, + lam, lam, 1); + else + nmr_solve(s, nd, nb, blo, bnc, NMR_STEP, act, nact, budget2, chosen, + 1e-9f, 1e4f, NMR_ITERS); + } + } + for (int b = 0; b < nbnd; b++) { + int bi = bidx[b]; + if (is_pns[b]) { + sce->band_type[bi] = NOISE_BT; + sce->zeroes[bi] = 0; + sce->pns_ener[bi] = pener[bi] * FFMIN(1.0f, pspread[bi]*pspread[bi]); + } else { + sce->sf_idx[bi] = av_clip(blo[b] + chosen[b]*NMR_STEP, 0, SCALE_MAX_POS); + } + } + + { /* record the bits this solve accounted for; the encoder compares them + * against the channel's real output to keep the budget honest */ + int tot = 0, prevb = -1; + for (int b = 0; b < nbnd; b++) { + if (is_pns[b]) + continue; + tot += nb[b][chosen[b]]; + if (prevb >= 0) + tot += 
NMR_SFBITS((blo[b]+chosen[b]*NMR_STEP) - (blo[prevb]+chosen[prevb]*NMR_STEP)); + prevb = b; + } + s->nmr->counted[s->cur_channel] = tot; + } + } + + /* SCALE_MAX_DIFF condition: + * re-clamp, codebook fixup, drop uncodeable, set global gain + * NOISE_BT bands keep their own scalefactor chain via set_special_band_scalefactors) */ + { + uint8_t nextband[128]; + int prev = -1; + ff_init_nextband_map(sce, nextband); + for (int w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) { + for (int g = 0; g < sce->ics.num_swb; g++) { + if (sce->band_type[w*16+g] == NOISE_BT || + sce->band_type[w*16+g] == INTENSITY_BT || + sce->band_type[w*16+g] == INTENSITY_BT2) + continue; + if (sce->zeroes[w*16+g]) { + sce->band_type[w*16+g] = 0; + continue; + } + + if (prev != -1) + sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], prev - SCALE_MAX_DIFF, prev + SCALE_MAX_DIFF); + sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]); + if (sce->band_type[w*16+g] <= 0) { + if (!ff_sfdelta_can_remove_band(sce, nextband, prev, w*16+g)) { + sce->band_type[w*16+g] = 1; + } else { + /* drop subwindow flags too, see the PASS 1 drop above */ + for (int w2 = 0; w2 < sce->ics.group_len[w]; w2++) + sce->zeroes[(w+w2)*16+g] = 1; + sce->band_type[w*16+g] = 0; + continue; + } + } + if (prev == -1) + sce->sf_idx[0] = sce->sf_idx[w*16+g]; /* global gain */ + prev = sce->sf_idx[w*16+g]; + } + } + + /* Every band, coded or not, must carry a chain-legal scalefactor: the + * codebook trellis (encode_window_bands_info) may later absorb a dropped + * band into a nonzero section, resurrecting it, and its sf then gets + * coded. Forward-fill with the previous coded sf (delta 0, cheapest); + * leading bands get the global gain. 
*/ + if (prev != -1) { + int last = sce->sf_idx[0]; + for (int w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) { + for (int g = 0; g < sce->ics.num_swb; g++) { + if (!sce->zeroes[w*16+g] && sce->band_type[w*16+g] != NOISE_BT && + sce->band_type[w*16+g] < RESERVED_BT) + last = sce->sf_idx[w*16+g]; + else if (sce->band_type[w*16+g] < RESERVED_BT && (w*16+g) > 0) + sce->sf_idx[w*16+g] = last; + } + } + } + } + return; + +bail: + /* Nothing codeable in this channel. Leave a fully consistent state: any + * stale nonzero band_type acts as a codebook lower bound in the encoder's + * section trellis (encode_window_bands_info), which would forbid the zero + * section and resurrect the band with a stale, chain-illegal scalefactor. + * Pre-decided intensity bands keep their signalling. */ + for (int i = 0; i < 128; i++) { + if (sce->band_type[i] == INTENSITY_BT || sce->band_type[i] == INTENSITY_BT2) + continue; + sce->zeroes[i] = 1; + sce->band_type[i] = 0; + } +} + +#endif /* AVCODEC_AACCODER_NMR_H */ diff --git a/libavcodec/aaccoder_twoloop.h b/libavcodec/aaccoder_twoloop.h index 6ac2af51cb..333d14d9de 100644 --- a/libavcodec/aaccoder_twoloop.h +++ b/libavcodec/aaccoder_twoloop.h @@ -71,7 +71,6 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx, int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / ((avctx->flags & AV_CODEC_FLAG_QSCALE) ? 2.0f : avctx->ch_layout.nb_channels) * (lambda / 120.f); - int refbits = destbits; int toomanybits, toofewbits; char nzs[128]; uint8_t nextband[128]; @@ -172,32 +171,8 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx, /** and zero out above cutoff frequency */ { int wlen = 1024 / sce->ics.num_windows; - int bandwidth; - - /** - * Scale, psy gives us constant quality, this LP only scales - * bitrate by lambda, so we save bits on subjectively unimportant HF - * rather than increase quantization noise. 
Adjust nominal bitrate - * to effective bitrate according to encoding parameters, - * AAC_CUTOFF_FROM_BITRATE is calibrated for effective bitrate. - */ - float rate_bandwidth_multiplier = 1.5f; - int frame_bit_rate = (avctx->flags & AV_CODEC_FLAG_QSCALE) - ? (refbits * rate_bandwidth_multiplier * avctx->sample_rate / 1024) - : (avctx->bit_rate / avctx->ch_layout.nb_channels); - - /** Compensate for extensions that increase efficiency */ - if (s->options.pns || s->options.intensity_stereo) - frame_bit_rate *= 1.15f; - - if (avctx->cutoff > 0) { - bandwidth = avctx->cutoff; - } else { - bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_bit_rate, 1, avctx->sample_rate)); - s->psy.cutoff = bandwidth; - } - - cutoff = bandwidth * 2 * wlen / avctx->sample_rate; + /* the bandwidth is fixed at init and shared with the psy model */ + cutoff = s->bandwidth * 2 * wlen / avctx->sample_rate; pns_start_pos = NOISE_LOW_LIMIT * 2 * wlen / avctx->sample_rate; } diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c index 94558ff23f..274e79955a 100644 --- a/libavcodec/aacenc.c +++ b/libavcodec/aacenc.c @@ -577,6 +577,222 @@ static void apply_intensity_stereo(ChannelElement *cpe) } } +/* Intensity stereo is only allowed when its irreducible image error stays within this gate + * (NOTE(review): the original sentence was cut off; presumably the gate is relative to the + * masking threshold -- confirm against nmr_is_image_masked) */ +#define NMR_IS_IMG_GATE 0.5f + +/* Frequency in Hz for the lower limit of intensity stereo */ +#define NMR_IS_LOW_LIMIT 6100 + +/* Rate ceiling (bits/sample/channel) above which intensity is skipped, ~145kbps */ +#define NMR_IS_MAXBPS 1.52f + +/* The rate ceiling is lifted on hard-to-code frames. The signal is the bit + * reservoir going into deficit: a negative fill means the trellis is spending + * more than the nominal rate to hold quality (operating lambda has climbed).
*/ +#define NMR_IS_FILLGAIN 0.27f +#define NMR_IS_FILLMAX 0.40f + +/* M/S thresholds: a band is recoded as mid+side when the side is negligible */ +#define NMR_MS_EQUIV 0.01f +#define NMR_MS_MASK 0.0f + +/* PNS-stereo decorrelation gate: a band may be noise-substituted in a CPE only if its + * side energy is at least this fraction of its mid energy, i.e. the image is genuinely + * wide (channels decorrelated). PNS renders uncorrelated noise per channel, so it only + * preserves the image on already-wide bands; a much stricter bar than I/S (which can + * collapse correlated bands). Lower = more PNS / more imaging risk. */ +#define NMR_PNS_STEREO_DECORR 0.6f + +/* Recode one band's window group as mid+side in place, updating the psy band + * energies/thresholds to the M/S spectra. The threshold is halved as a coarse guard + * against L/R unmasking of the independently-quantized M/S noise (M/S is a lossless + * rotation but lossy coding). Used for the M/S decision and the intensity fallback. 
*/ +static void nmr_apply_ms_band(AACEncContext *s, ChannelElement *cpe, + int w, int g, int start, int len, int gl) +{ + SingleChannelElement *sce0 = &cpe->ch[0]; + SingleChannelElement *sce1 = &cpe->ch[1]; + cpe->ms_mask[w*16+g] = 1; + for (int w2 = 0; w2 < gl; w2++) { + FFPsyBand *b0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g]; + FFPsyBand *b1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g]; + float *L = sce0->coeffs + start + (w+w2)*128; + float *R = sce1->coeffs + start + (w+w2)*128; + float em = 0.0f, es = 0.0f; + for (int i = 0; i < len; i++) { + float m = (L[i] + R[i]) * 0.5f; + R[i] = m - R[i]; L[i] = m; + em += L[i]*L[i]; es += R[i]*R[i]; + } + b0->threshold = b1->threshold = FFMIN(b0->threshold, b1->threshold) * 0.5f; + b0->energy = em; b1->energy = es; + } +} + +/* Intensity-stereo perceptual test for one band's window group: collapse the pair + * to a single carrier (L + p*R)*scale that the decoder rescales per channel, and + * check that the irreducible image error, which no bit budget can reduce, is + * masked in both channels. On success returns 1 and fills the carrier scale, the + * decoder's R/carrier ratio sr_, and the phase p. The caller restricts this to HF + * bands with energy in both channels. */ +static int nmr_is_image_masked(AACEncContext *s, ChannelElement *cpe, + int w, int g, int start, int len, int gl, + float ener0, float ener1, float dot, + float minthr0, float minthr1, + float *scale_out, float *sr_out, int *p_out) +{ + int p = dot >= 0.0f ? 
1 : -1; + float ener01 = ener0 + ener1 + 2*p*dot; /* energy of L + p*R */ + if (ener01 <= FLT_MIN) + return 0; + float scale = sqrtf(ener0 / ener01); /* carrier = (L + p*R)*scale */ + float sr_ = sqrtf(ener1 / ener0); /* decoder: R = p*sr_*carrier */ + float img0 = 0.0f, img1 = 0.0f; + for (int w2 = 0; w2 < gl; w2++) { + const float *L = cpe->ch[0].coeffs + start + (w+w2)*128; + const float *R = cpe->ch[1].coeffs + start + (w+w2)*128; + for (int i = 0; i < len; i++) { + float c = (L[i] + p*R[i]) * scale; + float dl = L[i] - c, dr = R[i] - p*sr_*c; + img0 += dl*dl; img1 += dr*dr; + } + } + if (img0 >= NMR_IS_IMG_GATE * minthr0 * gl || + img1 >= NMR_IS_IMG_GATE * minthr1 * gl) + return 0; + *scale_out = scale; *sr_out = sr_; *p_out = p; + return 1; +} + +/* Recode one band's window group as intensity stereo in place: replace L with the + * carrier, zero R, signal the phase via the side channel's band type, and fold the + * pair's masking into the surviving (carrier) channel. */ +static void nmr_apply_is_band(AACEncContext *s, ChannelElement *cpe, + int w, int g, int start, int len, int gl, + float scale, float sr_, int p, + float ener0, float ener1) +{ + cpe->is_mask[w*16+g] = 1; + cpe->ch[0].is_ener[w*16+g] = scale; + cpe->ch[1].is_ener[w*16+g] = ener0 / ener1; + cpe->ch[1].band_type[w*16+g] = p > 0 ? 
INTENSITY_BT : INTENSITY_BT2; + for (int w2 = 0; w2 < gl; w2++) { + FFPsyBand *b0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g]; + FFPsyBand *b1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g]; + float *L = cpe->ch[0].coeffs + start + (w+w2)*128; + float *R = cpe->ch[1].coeffs + start + (w+w2)*128; + float ec = 0.0f; + for (int i = 0; i < len; i++) { + L[i] = (L[i] + p*R[i]) * scale; + R[i] = 0.0f; + ec += L[i]*L[i]; + } + b0->threshold = FFMIN(b0->threshold, b1->threshold / FFMAX(sr_*sr_, 1e-9f)); + b0->energy = ec; b1->energy = 0.0f; + } +} + +/* + * Per-band stereo-mode decision (L/R vs M/S vs intensity) for the NMR coder, + * made before quantization from the psychoacoustic model alone, so the + * quantizer search allocates natively on the spectra that are actually coded. + */ +static void nmr_decide_stereo(AACEncContext *s, ChannelElement *cpe) +{ + SingleChannelElement *sce0 = &cpe->ch[0]; + SingleChannelElement *sce1 = &cpe->ch[1]; + IndividualChannelStream *ics = &sce0->ics; + const AVCodecContext *avctx = s->psy.avctx; + const float freq_mult = avctx->sample_rate / (1024.0f / ics->num_windows) / 2.0f; + const float bps = avctx->bit_rate > 0 ? + (float)avctx->bit_rate / avctx->sample_rate / avctx->ch_layout.nb_channels : 0.0f; + int is_count = 0; + + /* Stereo decision, with no bitrate dependence. Start from full L/R and depart from + * it only where the change is inaudible. M/S and I/S differ in what they trade: + * M/S recodes the pair as mid+side -- an invertible rotation, but the M and S + * are quantized independently, so it is lossy coding whose noise un-mixes + * back to L/R. Used where it barely changes the result (the side is + * negligible vs the mid, so it is ~equivalent to L/R at the same rate) -- + * OR where the doubled side energy is masked. + * I/S drops the side phase and keeps its energy, where the residual image error + * is masked. Used for the decorrelated HF that M/S cannot help. 
+ * Both tests are content/perceptual and frame-stable, so the image holds. */ + + /* I/S rate gate: eligible at/below ~128 kbps, with the ceiling lifted on hard + * frames (bit reservoir in deficit) so a starved high-rate passage can still + * call on intensity. Where an I/S candidate is found but IS is not eligible, fall + * back to M/S: not free, but ~equivalent to L/R there and it lets the energy + * compact into the mid. */ + const float rate_frame = avctx->bit_rate * 1024.0f / FFMAX(avctx->sample_rate, 1); + const float deficit = (s->nmr && rate_frame > 0.0f) + ? FFMAX(0.0f, -(float)s->nmr->rc_fill / rate_frame) : 0.0f; + const float is_bonus = FFMIN(NMR_IS_FILLMAX, NMR_IS_FILLGAIN * deficit); + const int allow_is = s->options.intensity_stereo && bps < NMR_IS_MAXBPS + is_bonus; + + for (int w = 0; w < ics->num_windows; w += ics->group_len[w]) { + int start = 0; + for (int g = 0; g < ics->num_swb; start += ics->swb_sizes[g++]) { + int len = ics->swb_sizes[g], gl = ics->group_len[w]; + float ener0 = 0.0f, ener1 = 0.0f, dot = 0.0f, es_tot = 0.0f, em_tot = 0.0f; + float minthr0 = FLT_MAX, minthr1 = FLT_MAX; + + cpe->is_mask[w*16+g] = 0; + cpe->ms_mask[w*16+g] = 0; + + for (int w2 = 0; w2 < gl; w2++) { + FFPsyBand *b0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g]; + FFPsyBand *b1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g]; + const float *L = sce0->coeffs + start + (w+w2)*128; + const float *R = sce1->coeffs + start + (w+w2)*128; + float el = 0.0f, er = 0.0f, em = 0.0f, es = 0.0f, d = 0.0f; + for (int i = 0; i < len; i++) { + float m = (L[i] + R[i]) * 0.5f; + float sv = m - R[i]; + el += L[i]*L[i]; er += R[i]*R[i]; + em += m*m; es += sv*sv; d += L[i]*R[i]; + } + ener0 += el; ener1 += er; dot += d; es_tot += es; em_tot += em; + minthr0 = FFMIN(minthr0, b0->threshold); + minthr1 = FFMIN(minthr1, b1->threshold); + } + float thr_g = FFMIN(minthr0, minthr1) * gl; /* group masking budget */ + + /* PNS-stereo reservation. 
Reserve a band for noise substitution only if it + * is noise-like in both channels (intersected can_pns) and clearly + * decorrelated (wide image). */ + if (cpe->ch[0].can_pns[w*16+g] && cpe->ch[1].can_pns[w*16+g] && + es_tot > NMR_PNS_STEREO_DECORR * em_tot) + continue; + cpe->ch[0].can_pns[w*16+g] = cpe->ch[1].can_pns[w*16+g] = 0; + + int ms_ok = s->options.mid_side && + (s->options.mid_side == 1 || + es_tot < NMR_MS_EQUIV * em_tot || + es_tot < NMR_MS_MASK * thr_g); + float scale, sr_; int p; + int is_ok = !ms_ok && + start * freq_mult > NMR_IS_LOW_LIMIT && + ener0 > FLT_MIN && ener1 > FLT_MIN && + nmr_is_image_masked(s, cpe, w, g, start, len, gl, + ener0, ener1, dot, minthr0, minthr1, + &scale, &sr_, &p); + + if (ms_ok) { + nmr_apply_ms_band(s, cpe, w, g, start, len, gl); + } else if (is_ok && allow_is) { + nmr_apply_is_band(s, cpe, w, g, start, len, gl, + scale, sr_, p, ener0, ener1); + is_count++; + } else if (is_ok && s->options.mid_side) { + nmr_apply_ms_band(s, cpe, w, g, start, len, gl); + } + /* else: keep full L/R stereo */ + } + } + cpe->is_mode = !!is_count; +} + static void apply_mid_side_stereo(ChannelElement *cpe) { int w, w2, g, i; @@ -950,12 +1166,6 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, s->psy.bitres.alloc /= chans; } s->cur_type = tag; - for (ch = 0; ch < chans; ch++) { - s->cur_channel = start_ch + ch; - if (s->options.pns && s->coder->mark_pns) - s->coder->mark_pns(s, avctx, &cpe->ch[ch]); - s->coder->search_for_quantizers(avctx, s, &cpe->ch[ch], s->lambda); - } if (chans > 1 && wi[0].window_type[0] == wi[1].window_type[0] && wi[0].window_shape == wi[1].window_shape) { @@ -968,26 +1178,75 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, } } } - for (ch = 0; ch < chans; ch++) { /* TNS and PNS */ + + const int use_tns = s->options.tns && s->coder->search_for_tns && + s->coder->apply_tns_filt; + + /* The NMR coder rate-controls itself and never re-quantizes, so TNS must run + * before the 
quantizer */ + const int tns_first = s->options.coder == AAC_CODER_NMR; + if (tns_first && use_tns) { + for (ch = 0; ch < chans; ch++) { + sce = &cpe->ch[ch]; + s->cur_channel = start_ch + ch; + /* mono: mark_pns before TNS so the region cap sees PNS bands. Stereo + * PNS is marked in its own block (below) after the stereo decision. */ + if (chans == 1 && s->options.pns && s->coder->mark_pns) + s->coder->mark_pns(s, avctx, sce); + s->coder->search_for_tns(s, sce); + s->coder->apply_tns_filt(s, sce); + if (sce->tns.present) + tns_mode = 1; + } + } + + /* NMR stereo PNS (imaging-safe). Mark each channel's noise-like bands on the + * original L/R psy, then keep PNS only where BOTH channels are noise-like. */ + if (chans == 2 && cpe->common_window && tns_first && + s->options.pns && s->coder->mark_pns) { + s->cur_channel = start_ch; s->coder->mark_pns(s, avctx, &cpe->ch[0]); + s->cur_channel = start_ch + 1; s->coder->mark_pns(s, avctx, &cpe->ch[1]); + for (int b = 0; b < 128; b++) + if (!cpe->ch[0].can_pns[b] || !cpe->ch[1].can_pns[b]) + cpe->ch[0].can_pns[b] = cpe->ch[1].can_pns[b] = 0; + } + + /* The NMR coder decides I/S and M/S BEFORE quantization, from the psy model, + * and the trellis then allocates natively on the coeffs actually coded. 
*/ + if (chans == 2 && cpe->common_window && s->options.coder == AAC_CODER_NMR && + (s->options.mid_side || s->options.intensity_stereo)) { + s->cur_channel = start_ch; + nmr_decide_stereo(s, cpe); + } + for (ch = 0; ch < chans; ch++) { + s->cur_channel = start_ch + ch; + /* non-NMR coders mark PNS here; NMR marks earlier (mono with TNS, or per CPE before the stereo decision) */ + if (s->options.pns && s->coder->mark_pns && !tns_first) + s->coder->mark_pns(s, avctx, &cpe->ch[ch]); + s->coder->search_for_quantizers(avctx, s, &cpe->ch[ch], s->lambda); + } + for (ch = 0; ch < chans; ch++) { /* TNS (non-NMR) and PNS */ sce = &cpe->ch[ch]; s->cur_channel = start_ch + ch; - if (s->options.tns && s->coder->search_for_tns) + if (!tns_first && use_tns) { s->coder->search_for_tns(s, sce); - if (s->options.tns && s->coder->apply_tns_filt) s->coder->apply_tns_filt(s, sce); - if (sce->tns.present) - tns_mode = 1; + if (sce->tns.present) + tns_mode = 1; + } if (s->options.pns && s->coder->search_for_pns) s->coder->search_for_pns(s, avctx, sce); } s->cur_channel = start_ch; if (s->options.intensity_stereo) { /* Intensity Stereo */ - if (s->coder->search_for_is) - s->coder->search_for_is(s, avctx, cpe); + if (s->options.coder != AAC_CODER_NMR) { /* NMR: decided pre-search */ + if (s->coder->search_for_is) + s->coder->search_for_is(s, avctx, cpe); + apply_intensity_stereo(cpe); + } if (cpe->is_mode) is_mode = 1; - apply_intensity_stereo(cpe); } - if (s->options.mid_side) { /* Mid/Side stereo */ + if (s->options.mid_side && s->options.coder != AAC_CODER_NMR) { /* Mid/Side stereo */ if (s->options.mid_side == -1 && s->coder->search_for_ms) s->coder->search_for_ms(s, cpe); else if (cpe->common_window) @@ -1015,11 +1274,19 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, break; } + frame_bits = put_bits_count(&s->pb); + + /* The NMR coder rate-controls itself (global-lambda reservoir servo): + * per-frame bits intentionally float around the nominal rate, so skip + * the lambda rate loop and only intervene on a hard overflow.
*/ + if (s->options.coder == AAC_CODER_NMR && avctx->bit_rate_tolerance != 0 && + frame_bits < 6144 * s->channels - 3) + break; + /* rate control stuff * allow between the nominal bitrate, and what psy's bit reservoir says to target * but drift towards the nominal bitrate always */ - frame_bits = put_bits_count(&s->pb); rate_bits = avctx->bit_rate * 1024 / avctx->sample_rate; rate_bits = FFMIN(rate_bits, 6144 * s->channels - 3); too_many_bits = FFMAX(target_bits, rate_bits); @@ -1087,9 +1354,26 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, flush_put_bits(&s->pb); s->last_frame_pb_count = put_bits_count(&s->pb); + + /* NMR rate accounting: how many bits the frame really took beyond what the + * trellis counted; feeds the next frame's budget correction */ + if (s->nmr) { + int counted = 0; + for (i = 0; i < s->channels; i++) + counted += s->nmr->counted[i]; + if (counted > 0) { + float side = (float)s->last_frame_pb_count - counted; + if (s->nmr->side_inited) { + s->nmr->side_ema += 0.125f * (side - s->nmr->side_ema); + } else { + s->nmr->side_ema = side; + s->nmr->side_inited = 1; + } + } + } avpkt->size = put_bytes_output(&s->pb); - s->lambda_sum += s->lambda; + s->lambda_sum += (s->nmr && s->nmr->lam_rc > 0.0f) ? 
s->nmr->lam_rc : s->lambda; s->lambda_count++; ff_af_queue_remove(&s->afq, avctx->frame_size, &avpkt->pts, @@ -1114,6 +1398,7 @@ static av_cold int aac_encode_end(AVCodecContext *avctx) av_freep(&s->buffer.samples); av_freep(&s->cpe); av_freep(&s->fdsp); + av_freep(&s->nmr); ff_af_queue_close(&s->afq); return 0; } @@ -1147,6 +1432,12 @@ static av_cold int alloc_buffers(AVCodecContext *avctx, AACEncContext *s) for(ch = 0; ch < s->channels; ch++) s->planar_samples[ch] = s->buffer.samples + 3 * 1024 * ch; + if (s->options.coder == AAC_CODER_NMR) { + s->nmr = av_mallocz(sizeof(*s->nmr)); + if (!s->nmr) + return AVERROR(ENOMEM); + } + return 0; } @@ -1243,6 +1534,33 @@ static av_cold int aac_encode_init(AVCodecContext *avctx) if (s->channels > 3) s->options.mid_side = 0; + /* Coding bandwidth, fixed at init time */ + if (avctx->cutoff > 0) { + s->bandwidth = avctx->cutoff; + } else { + int frame_br = (avctx->flags & AV_CODEC_FLAG_QSCALE) ? + (avctx->bit_rate / 2.0f * (s->lambda / 120.f) * 1.5f) : + (avctx->bit_rate / avctx->ch_layout.nb_channels); + + /* For NMR, the rate to bandwidth conversion was tuned to maximize metrics + * over a variable cutoff x bitrate combo */ + if (s->options.coder == AAC_CODER_NMR && frame_br >= 32000) { + static const int rates[] = { 32000, 48000, 64000, 96000 }; + static const int bws[] = { 14000, 15000, 18000, 20000 }; + const int seg = (frame_br > rates[1]) + (frame_br > rates[2]); /* interpolation segment 0..2; a for-scoped counter is dead after its loop */ + s->bandwidth = bws[seg] + (int)((int64_t)(bws[seg + 1] - bws[seg]) * + (frame_br - rates[seg]) / (rates[seg + 1] - rates[seg])); + s->bandwidth = FFMIN3(s->bandwidth, 22000, avctx->sample_rate / 2); + } else { + if (s->options.pns || s->options.intensity_stereo) + frame_br *= 1.15f; + s->bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_br, 1, + avctx->sample_rate)); + } + + s->bandwidth = FFMIN(FFMAX(s->bandwidth, 8000), avctx->sample_rate / 2); + } + // Initialize static tables ff_aac_float_common_init(); @@ -1262,7 +1580,7 @@ static av_cold int
aac_encode_init(AVCodecContext *avctx) for (i = 0; i < s->chan_map[0]; i++) grouping[i] = s->chan_map[i + 1] == TYPE_CPE; if ((ret = ff_psy_init(&s->psy, avctx, 2, sizes, lengths, - s->chan_map[0], grouping)) < 0) + s->chan_map[0], grouping, s->bandwidth)) < 0) return ret; ff_lpc_init(&s->lpc, 2*avctx->frame_size, TNS_MAX_ORDER, FF_LPC_TYPE_LEVINSON); s->random_state = 0x1f2e3d4c; @@ -1279,11 +1597,13 @@ static const AVOption aacenc_options[] = { {"aac_coder", "Coding algorithm", offsetof(AACEncContext, options.coder), AV_OPT_TYPE_INT, {.i64 = AAC_CODER_TWOLOOP}, 0, AAC_CODER_NB-1, AACENC_FLAGS, .unit = "coder"}, {"twoloop", "Two loop searching method", 0, AV_OPT_TYPE_CONST, {.i64 = AAC_CODER_TWOLOOP}, INT_MIN, INT_MAX, AACENC_FLAGS, .unit = "coder"}, {"fast", "Fast search", 0, AV_OPT_TYPE_CONST, {.i64 = AAC_CODER_FAST}, INT_MIN, INT_MAX, AACENC_FLAGS, .unit = "coder"}, + {"nmr", "Noise-to-mask ratio scalefactor trellis", 0, AV_OPT_TYPE_CONST, {.i64 = AAC_CODER_NMR}, INT_MIN, INT_MAX, AACENC_FLAGS, .unit = "coder"}, {"aac_ms", "Force M/S stereo coding", offsetof(AACEncContext, options.mid_side), AV_OPT_TYPE_BOOL, {.i64 = -1}, -1, 1, AACENC_FLAGS}, {"aac_is", "Intensity stereo coding", offsetof(AACEncContext, options.intensity_stereo), AV_OPT_TYPE_BOOL, {.i64 = 1}, -1, 1, AACENC_FLAGS}, {"aac_pns", "Perceptual noise substitution", offsetof(AACEncContext, options.pns), AV_OPT_TYPE_BOOL, {.i64 = 1}, -1, 1, AACENC_FLAGS}, {"aac_tns", "Temporal noise shaping", offsetof(AACEncContext, options.tns), AV_OPT_TYPE_BOOL, {.i64 = 1}, -1, 1, AACENC_FLAGS}, {"aac_pce", "Forces the use of PCEs", offsetof(AACEncContext, options.pce), AV_OPT_TYPE_BOOL, {.i64 = 0}, -1, 1, AACENC_FLAGS}, + {"aac_nmr_speed", "NMR coder speed level: 0 = slowest/best, higher trades quality for speed", offsetof(AACEncContext, options.nmr_speed), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 4, AACENC_FLAGS}, FF_AAC_PROFILE_OPTS {NULL} }; diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h index 
61a9e6102b..1e394595a0 100644 --- a/libavcodec/aacenc.h +++ b/libavcodec/aacenc.h @@ -44,6 +44,7 @@ typedef enum AACCoder { AAC_CODER_TWOLOOP, AAC_CODER_FAST, + AAC_CODER_NMR, AAC_CODER_NB, }AACCoder; @@ -69,6 +70,7 @@ typedef struct AACEncOptions { int pce; int mid_side; int intensity_stereo; + int nmr_speed; ///< NMR coder speed level: 0 = slowest/best, higher is faster } AACEncOptions; /** @@ -165,6 +167,28 @@ typedef struct AACQuantizeBandCostCacheEntry { uint16_t generation; } AACQuantizeBandCostCacheEntry; +/** per-band scalefactor candidates above the finest codeable sf (NMR coder) */ +#define NMR_NCAND 96 + +/** + * NMR coder per-band candidate cost curves (~96 KiB) and rate-control carry-over + */ +typedef struct AACNMRCurves { + float nd[128][NMR_NCAND]; ///< dist / threshold per candidate + int nb[128][NMR_NCAND]; ///< spectral bits per candidate + float lam[16]; ///< per-channel operating lambda of the previous frame, 0 = none yet + int counted[16]; ///< per-channel bits the trellis accounted for in the last solve + float side_ema; ///< running estimate of real-minus-counted bits per frame + int side_inited; ///< side_ema holds a measurement + + int64_t rc_frame_num; ///< frame the reservoir was last advanced for + float lam_rc; ///< global-lambda rate control: operating lambda, 0 until bootstrapped + int rc_fill; ///< virtual bit reservoir fill, + = bits saved vs nominal + int frames_since_short; ///< long-block frames since the last short run (the "gap"): large = isolated transient + int prev_was_short; ///< previous frame was a short block (for run-start detection) + float run_burst; ///< transient bit-burst factor, set at run start and held across the short run +} AACNMRCurves; + typedef struct AACPCEInfo { AVChannelLayout layout; uint8_t num_ele[4]; ///< front, side, back, lfe @@ -194,6 +218,7 @@ typedef struct AACEncContext { LPCContext lpc; ///< used by TNS int samplerate_index; ///< MPEG-4 samplerate index int channels; ///< channel count + int 
bandwidth; ///< coding bandwidth in Hz, fixed at init; the psy model and the coders' band cutoff agree on it const uint8_t *reorder_map; ///< lavc to aac reorder map const uint8_t *chan_map; ///< channel configuration map @@ -216,6 +241,7 @@ typedef struct AACEncContext { AACQuantizeBandCostCacheEntry quantize_band_cost_cache[256][128]; ///< memoization area for quantize_band_cost AACEncDSPContext aacdsp; + AACNMRCurves *nmr; ///< NMR coder scratch (NULL unless coder == nmr) struct { float *samples; diff --git a/libavcodec/aacencdsp.c b/libavcodec/aacencdsp.c index fb809405f7..5ccc7e8fc8 100644 --- a/libavcodec/aacencdsp.c +++ b/libavcodec/aacencdsp.c @@ -16,6 +16,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include <float.h> #include <math.h> #include "config.h" @@ -45,10 +46,37 @@ static void quantize_bands(int *out, const float *in, const float *scaled, } } +/* One NMR scalefactor-trellis Viterbi step: for each current-band candidate o, find the + * previous-band candidate op minimising dpp[op] + lamsf[d + mdiff] (|d| <= mdiff), then set + * dp[o] = node[o] + that cost and record the back-pointer bp[o] */ +static void nmr_trellis_step_c(float *dp, uint8_t *bp, const float *dpp, + const float *node, const float *lamsf, + int n_cur, int n_prev, int base, int step, int mdiff) +{ + for (int o = 0; o < n_cur; o++) { + int best = -1; + float bestc = FLT_MAX; + for (int op = 0; op < n_prev; op++) { + int d = base + (o - op) * step; + float c; + if (d < -mdiff || d > mdiff) + continue; + c = dpp[op] + lamsf[d + mdiff]; + if (c < bestc) { + bestc = c; + best = op; + } + } + bp[o] = best < 0 ? 0 : best; + dp[o] = best < 0 ?
FLT_MAX : node[o] + bestc; + } +} + void ff_aacenc_dsp_init(AACEncDSPContext *s) { - s->abs_pow34 = abs_pow34_v; - s->quant_bands = quantize_bands; + s->abs_pow34 = abs_pow34_v; + s->quant_bands = quantize_bands; + s->nmr_trellis_step = nmr_trellis_step_c; #if ARCH_RISCV ff_aacenc_dsp_init_riscv(s); diff --git a/libavcodec/aacencdsp.h b/libavcodec/aacencdsp.h index 6d9ae221d1..4ead54669d 100644 --- a/libavcodec/aacencdsp.h +++ b/libavcodec/aacencdsp.h @@ -19,11 +19,17 @@ #ifndef AVCODEC_AACENCDSP_H #define AVCODEC_AACENCDSP_H +#include <stdint.h> + typedef struct AACEncDSPContext { void (*abs_pow34)(float *out, const float *in, const int size); void (*quant_bands)(int *out, const float *in, const float *scaled, int size, int is_signed, int maxval, const float Q34, const float rounding); + + void (*nmr_trellis_step)(float *dp, uint8_t *bp, const float *dpp, + const float *node, const float *lamsf, + int n_cur, int n_prev, int base, int step, int mdiff); } AACEncDSPContext; void ff_aacenc_dsp_init(AACEncDSPContext *s); diff --git a/libavcodec/aacpsy.c b/libavcodec/aacpsy.c index 299a0a9f0a..78c217f8f4 100644 --- a/libavcodec/aacpsy.c +++ b/libavcodec/aacpsy.c @@ -134,6 +134,10 @@ typedef struct AacPsyChannel{ float prev_energy_subshort[AAC_NUM_BLOCKS_SHORT * PSY_LAME_NUM_SUBBLOCKS]; int prev_attack; ///< attack value for the last short block in the previous sequence int next_attack0_zero; ///< whether attack[0] of the next frame is zero + + /* rate-loop re-analysis rewind state, see psy_3gpp_analyze() */ + int64_t rc_frame_num; ///< frame this channel last saved rewind state for + AacPsyBand rc_prev_band[128]; ///< prev_band as it was entering the frame }AacPsyChannel; /** @@ -163,6 +167,12 @@ typedef struct AacPsyContext{ AacPsyCoeffs psy_coef[2][64]; AacPsyChannel *ch; float global_quality; ///< normalized global quality taken from avctx + + /* rate-loop re-analysis rewind state, see psy_3gpp_analyze() */ + int64_t rc_frame_num; ///< frame the rewind state was 
saved for + int rc_first_ch; ///< first channel analyzed in that frame + int rc_fill_level; + float rc_pe_min, rc_pe_max, rc_pe_previous; }AacPsyContext; /** @@ -374,6 +384,10 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) { return AVERROR(ENOMEM); } + pctx->rc_frame_num = -1; + for (i = 0; i < ctx->avctx->ch_layout.nb_channels; i++) + pctx->ch[i].rc_frame_num = -1; + lame_window_init(pctx, ctx->avctx); return 0; @@ -844,9 +858,36 @@ static void psy_3gpp_analyze(FFPsyContext *ctx, int channel, { int ch; FFPsyChannelGroup *group = ff_psy_find_group(ctx, channel); + AacPsyContext *pctx = ctx->model_priv_data; + + /* The encoder's rate-control loop may re-run the analysis for the same + * frame; carried state (bit reservoir, PE history, previous-frame + * thresholds) must advance exactly once per frame, so save it on the + * frame's first run and rewind on re-runs. */ + if (ctx->avctx->frame_num != pctx->rc_frame_num) { + pctx->rc_frame_num = ctx->avctx->frame_num; + pctx->rc_first_ch = channel; + pctx->rc_fill_level = pctx->fill_level; + pctx->rc_pe_min = pctx->pe.min; + pctx->rc_pe_max = pctx->pe.max; + pctx->rc_pe_previous = pctx->pe.previous; + } else if (channel == pctx->rc_first_ch) { + pctx->fill_level = pctx->rc_fill_level; + pctx->pe.min = pctx->rc_pe_min; + pctx->pe.max = pctx->rc_pe_max; + pctx->pe.previous = pctx->rc_pe_previous; + } - for (ch = 0; ch < group->num_ch; ch++) + for (ch = 0; ch < group->num_ch; ch++) { + AacPsyChannel *pch = &pctx->ch[channel + ch]; + if (ctx->avctx->frame_num != pch->rc_frame_num) { + pch->rc_frame_num = ctx->avctx->frame_num; + memcpy(pch->rc_prev_band, pch->prev_band, sizeof(pch->prev_band)); + } else { + memcpy(pch->prev_band, pch->rc_prev_band, sizeof(pch->prev_band)); + } psy_3gpp_analyze_channel(ctx, channel + ch, coeffs[ch], &wi[ch]); + } } static av_cold void psy_3gpp_end(FFPsyContext *apc) diff --git a/libavcodec/psymodel.c b/libavcodec/psymodel.c index 87f7b216cd..1c2b7908e3 100644 --- 
a/libavcodec/psymodel.c +++ b/libavcodec/psymodel.c @@ -27,7 +27,7 @@ extern const FFPsyModel ff_aac_psy_model; av_cold int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx, int num_lens, const uint8_t **bands, const int* num_bands, - int num_groups, const uint8_t *group_map) + int num_groups, const uint8_t *group_map, int cutoff) { int i, j, k = 0; @@ -36,7 +36,7 @@ av_cold int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx, int num_lens, ctx->group = av_calloc(num_groups, sizeof(ctx->group[0])); ctx->bands = av_memdup(bands, num_lens * sizeof(ctx->bands[0])); ctx->num_bands = av_memdup(num_bands, num_lens * sizeof(ctx->num_bands[0])); - ctx->cutoff = avctx->cutoff; + ctx->cutoff = cutoff ? cutoff : avctx->cutoff; if (!ctx->ch || !ctx->group || !ctx->bands || !ctx->num_bands) { ff_psy_end(ctx); diff --git a/libavcodec/psymodel.h b/libavcodec/psymodel.h index 22899a38d9..9dbce89780 100644 --- a/libavcodec/psymodel.h +++ b/libavcodec/psymodel.h @@ -151,12 +151,13 @@ typedef struct FFPsyModel { * @param num_bands number of scalefactor bands for all frame lengths * @param num_groups number of channel groups * @param group_map array with # of channels in group - 1, for each group + * @param cutoff analysis bandwidth in Hz, 0 to derive it from avctx * * @return zero if successful, a negative value if not */ int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx, int num_lens, const uint8_t **bands, const int *num_bands, - int num_groups, const uint8_t *group_map); + int num_groups, const uint8_t *group_map, int cutoff); /** * Determine what group a channel belongs to. _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
