--- Decode up to four symbols per step with larger lookup tables. This is highly finicky because a lot of internal state has to be tracked and it therefore doesn't fit at all into the existing multisymbol VLC structure.
On a big core (Alder Lake P core) this makes the whole decoder 30-90% faster (largest gains at high bitrate). On a smaller core (Alder Lake E core) the gain is less, but is still there. There may be some better tuning for the smaller core here (make tables smaller by skipping lookup in some cases?) but I have not pursued it. Thanks, - Mark libavcodec/apv_decode.c | 5 +- libavcodec/apv_decode.h | 59 ++++-- libavcodec/apv_entropy.c | 442 ++++++++++++++++++++++++++++----------- 3 files changed, 363 insertions(+), 143 deletions(-) diff --git a/libavcodec/apv_decode.c b/libavcodec/apv_decode.c index e15c125b58..eb47298e2e 100644 --- a/libavcodec/apv_decode.c +++ b/libavcodec/apv_decode.c @@ -160,6 +160,7 @@ static int apv_decode_block(AVCodecContext *avctx, int err; LOCAL_ALIGNED_32(int16_t, coeff, [64]); + memset(coeff, 0, 64 * sizeof(int16_t)); err = ff_apv_entropy_decode_block(coeff, gbc, entropy_state); if (err < 0) @@ -216,8 +217,8 @@ static int apv_decode_tile_component(AVCodecContext *avctx, void *data, .log_ctx = avctx, .decode_lut = &decode_lut, .prev_dc = 0, - .prev_dc_diff = 20, - .prev_1st_ac_level = 0, + .prev_k_dc = 5, + .prev_k_level = 0, }; int err; diff --git a/libavcodec/apv_decode.h b/libavcodec/apv_decode.h index 34c6176ea0..d94802e208 100644 --- a/libavcodec/apv_decode.h +++ b/libavcodec/apv_decode.h @@ -33,14 +33,39 @@ #define APV_VLC_LUT_BITS 9 #define APV_VLC_LUT_SIZE (1 << APV_VLC_LUT_BITS) -typedef struct APVVLCLUTEntry { +typedef struct APVSingleVLCLUTEntry { uint16_t result; // Return value if not reading more. uint8_t consume; // Number of bits to consume. uint8_t more; // Whether to read additional bits. -} APVVLCLUTEntry; +} APVSingleVLCLUTEntry; + +typedef struct APVMultiVLCLUTEntry { + // Number of symbols this bit stream resolves to. + uint8_t count; + // k_run after decoding all symbols. + uint8_t k_run : 2; + // k_level after decoding the first level symbol. + uint8_t k_level_0 : 3; + // k_level after decoding the all symbols. 
+ uint8_t k_level_1 : 3; + // Run output values. + uint8_t run[2]; + // Level output values. + int16_t level[2]; + // Bit index of the end of each code. + uint8_t offset[4]; +} APVMultiVLCLUTEntry; typedef struct APVVLCLUT { - APVVLCLUTEntry lut[6][APV_VLC_LUT_SIZE]; + // Single-symbol LUT for VLCs. + // Applies to all coefficients, but used only for DC coefficients + // in the decoder. + APVSingleVLCLUTEntry single_lut[6][APV_VLC_LUT_SIZE]; + // Multi-symbol LUT for run/level combinations, decoding up to four + // symbols per step. Comes in two versions, which to use depends on + // whether the next symbol is a run or a level. + APVMultiVLCLUTEntry run_first_lut[3][5][APV_VLC_LUT_SIZE]; + APVMultiVLCLUTEntry level_first_lut[3][5][APV_VLC_LUT_SIZE]; } APVVLCLUT; typedef struct APVEntropyState { @@ -48,33 +73,29 @@ typedef struct APVEntropyState { const APVVLCLUT *decode_lut; + // Previous DC level value. int16_t prev_dc; - int16_t prev_dc_diff; - int16_t prev_1st_ac_level; + // k parameter implied by the previous DC level value. + uint8_t prev_k_dc; + // k parameter implied by the previous first AC level value. + uint8_t prev_k_level; } APVEntropyState; /** - * Build the decoder VLC look-up table. + * Build the decoder VLC look-up tables. */ void ff_apv_entropy_build_decode_lut(APVVLCLUT *decode_lut); /** * Entropy decode a single 8x8 block to coefficients. * - * Outputs in block order (dezigzag already applied). + * Outputs nonzero coefficients only to the block row-major order + * (dezigzag is applied within the function). The output block + * must have been filled with zeroes before calling this function. */ -int ff_apv_entropy_decode_block(int16_t *coeff, - GetBitContext *gbc, - APVEntropyState *state); - -/** - * Read a single APV VLC code. - * - * This entrypoint is exposed for testing. 
- */ -unsigned int ff_apv_read_vlc(GetBitContext *gbc, int k_param, - const APVVLCLUT *lut); - +int ff_apv_entropy_decode_block(int16_t *restrict coeff, + GetBitContext *restrict gbc, + APVEntropyState *restrict state); #endif /* AVCODEC_APV_DECODE_H */ diff --git a/libavcodec/apv_entropy.c b/libavcodec/apv_entropy.c index a5648c09b4..49b568c094 100644 --- a/libavcodec/apv_entropy.c +++ b/libavcodec/apv_entropy.c @@ -19,15 +19,55 @@ #include "apv.h" #include "apv_decode.h" +#include "put_bits.h" + + +av_always_inline +static unsigned int apv_read_vlc(GetBitContext *restrict gbc, int k_param, + const APVVLCLUT *restrict lut) +{ + unsigned int next_bits; + const APVSingleVLCLUTEntry *ent; + + next_bits = show_bits(gbc, APV_VLC_LUT_BITS); + ent = &lut->single_lut[k_param][next_bits]; + + if (ent->more) { + unsigned int leading_zeroes; + + skip_bits(gbc, ent->consume); + + next_bits = show_bits(gbc, 16); + leading_zeroes = 15 - av_log2(next_bits); + + if (leading_zeroes == 0) { + // This can't happen mid-stream because the lookup would + // have resolved a leading one into a shorter code, but it + // can happen if we are hitting the end of the buffer. + // Return an invalid code to propagate as an error. + return APV_MAX_TRANS_COEFF + 1; + } + + skip_bits(gbc, leading_zeroes + 1); + + return (2 << k_param) + + ((1 << leading_zeroes) - 1) * (1 << k_param) + + get_bits(gbc, leading_zeroes + k_param); + } else { + skip_bits(gbc, ent->consume); + return ent->result; + } +} void ff_apv_entropy_build_decode_lut(APVVLCLUT *decode_lut) { const int code_len = APV_VLC_LUT_BITS; const int lut_size = APV_VLC_LUT_SIZE; + // Build the single-symbol VLC table. 
for (int k = 0; k <= 5; k++) { for (unsigned int code = 0; code < lut_size; code++) { - APVVLCLUTEntry *ent = &decode_lut->lut[k][code]; + APVSingleVLCLUTEntry *ent = &decode_lut->single_lut[k][code]; unsigned int first_bit = code & (1 << code_len - 1); unsigned int remaining_bits = code ^ first_bit; @@ -64,152 +104,310 @@ void ff_apv_entropy_build_decode_lut(APVVLCLUT *decode_lut) } } } -} - -av_always_inline -static unsigned int apv_read_vlc(GetBitContext *gbc, int k_param, - const APVVLCLUT *lut) -{ - unsigned int next_bits; - const APVVLCLUTEntry *ent; - next_bits = show_bits(gbc, APV_VLC_LUT_BITS); - ent = &lut->lut[k_param][next_bits]; - - if (ent->more) { - unsigned int leading_zeroes; - - skip_bits(gbc, ent->consume); - - next_bits = show_bits(gbc, 16); - leading_zeroes = 15 - av_log2(next_bits); - - if (leading_zeroes == 0) { - // This can't happen mid-stream because the lookup would - // have resolved a leading one into a shorter code, but it - // can happen if we are hitting the end of the buffer. - // Return an invalid code to propagate as an error. - return APV_MAX_TRANS_COEFF + 1; + // Build the multi-symbol VLC table. 
+ for (int start_run = 0; start_run <= 2; start_run++) { + for (int start_level = 0; start_level <= 4; start_level++) { + for (unsigned int code = 0; code < lut_size; code++) { + APVMultiVLCLUTEntry *ent; + int k_run, k_level; + GetBitContext gbc; + PutBitContext pbc; + uint8_t buffer[16]; + uint8_t run_first_buffer[16]; + uint8_t level_first_buffer[16]; + + memset(buffer, 0, sizeof(buffer)); + init_put_bits(&pbc, buffer, sizeof(buffer)); + put_bits(&pbc, APV_VLC_LUT_BITS, code); + flush_put_bits(&pbc); + + memcpy(run_first_buffer, buffer, sizeof(buffer)); + memcpy(level_first_buffer, buffer, sizeof(buffer)); + + k_run = start_run; + k_level = start_level; + + ent = &decode_lut->run_first_lut[k_run][k_level][code]; + memset(ent, 0, sizeof(*ent)); + init_get_bits8(&gbc, run_first_buffer, sizeof(run_first_buffer)); + + ent->count = 0; + for (int i = 0; i <= 1; i++) { + int value, sign, pos; + + value = apv_read_vlc(&gbc, k_run, decode_lut); + pos = get_bits_count(&gbc); + if (pos > APV_VLC_LUT_BITS) + break; + ent->run[i] = value; + ent->offset[ent->count] = pos; + ++ent->count; + k_run = FFMIN(value >> 2, 2); + + value = apv_read_vlc(&gbc, k_level, decode_lut); + sign = get_bits1(&gbc); + pos = get_bits_count(&gbc); + if (pos > APV_VLC_LUT_BITS) + break; + ++value; + ent->level[i] = sign ? 
-value : value; + ent->offset[ent->count] = pos; + ++ent->count; + k_level = FFMIN(value >> 2, 4); + if (i == 0) + ent->k_level_0 = k_level; + } + if (ent->count > 0 && ent->count < 4) + ent->offset[3] = ent->offset[ent->count - 1]; + ent->k_run = k_run; + ent->k_level_1 = k_level; + + k_run = start_run; + k_level = start_level; + + ent = &decode_lut->level_first_lut[k_run][k_level][code]; + memset(ent, 0, sizeof(*ent)); + init_get_bits8(&gbc, level_first_buffer, sizeof(level_first_buffer)); + + ent->count = 0; + for (int i = 0; i <= 1; i++) { + int value, sign, pos; + + value = apv_read_vlc(&gbc, k_level, decode_lut); + sign = get_bits1(&gbc); + pos = get_bits_count(&gbc); + if (pos > APV_VLC_LUT_BITS) + break; + ++value; + ent->level[i] = sign ? -value : value; + ent->offset[ent->count] = pos; + ++ent->count; + k_level = FFMIN(value >> 2, 4); + if (i == 0) + ent->k_level_0 = k_level; + + value = apv_read_vlc(&gbc, k_run, decode_lut); + pos = get_bits_count(&gbc); + if (pos > APV_VLC_LUT_BITS) + break; + ent->run[i] = value; + ent->offset[ent->count] = pos; + ++ent->count; + k_run = FFMIN(value >> 2, 2); + } + if (ent->count > 0 && ent->count < 4) + ent->offset[3] = ent->offset[ent->count - 1]; + ent->k_run = k_run; + ent->k_level_1 = k_level; + } } - - skip_bits(gbc, leading_zeroes + 1); - - return (2 << k_param) + - ((1 << leading_zeroes) - 1) * (1 << k_param) + - get_bits(gbc, leading_zeroes + k_param); - } else { - skip_bits(gbc, ent->consume); - return ent->result; } } -unsigned int ff_apv_read_vlc(GetBitContext *gbc, int k_param, - const APVVLCLUT *lut) -{ - return apv_read_vlc(gbc, k_param, lut); -} - -int ff_apv_entropy_decode_block(int16_t *coeff, - GetBitContext *gbc, - APVEntropyState *state) +int ff_apv_entropy_decode_block(int16_t *restrict coeff, + GetBitContext *restrict gbc, + APVEntropyState *restrict state) { const APVVLCLUT *lut = state->decode_lut; - int k_param; + int scan_pos, next_is_run; + int k_dc = state->prev_k_dc; + int k_run = 0; + int 
k_level = state->prev_k_level; + int first_ac = 1; - // DC coefficient. + // Read one DC coefficient. { - int abs_dc_coeff_diff; - int sign_dc_coeff_diff; - int dc_coeff; - - k_param = av_clip(state->prev_dc_diff >> 1, 0, 5); - abs_dc_coeff_diff = apv_read_vlc(gbc, k_param, lut); - - if (abs_dc_coeff_diff > 0) - sign_dc_coeff_diff = get_bits1(gbc); - else - sign_dc_coeff_diff = 0; - - if (sign_dc_coeff_diff) - dc_coeff = state->prev_dc - abs_dc_coeff_diff; - else - dc_coeff = state->prev_dc + abs_dc_coeff_diff; - - if (dc_coeff < APV_MIN_TRANS_COEFF || - dc_coeff > APV_MAX_TRANS_COEFF) { - av_log(state->log_ctx, AV_LOG_ERROR, - "Out-of-range DC coefficient value: %d " - "(from prev_dc %d abs_dc_coeff_diff %d sign_dc_coeff_diff %d)\n", - dc_coeff, state->prev_dc, abs_dc_coeff_diff, sign_dc_coeff_diff); - return AVERROR_INVALIDDATA; + int dc, abs_diff, sign; + + abs_diff = apv_read_vlc(gbc, k_dc, lut); + + if (abs_diff) { + sign = get_bits1(gbc); + if (sign) + dc = state->prev_dc - abs_diff; + else + dc = state->prev_dc + abs_diff; + } else { + dc = state->prev_dc; } - coeff[0] = dc_coeff; + coeff[0] = dc; - state->prev_dc = dc_coeff; - state->prev_dc_diff = abs_dc_coeff_diff; + state->prev_dc = dc; + state->prev_k_dc = FFMIN(abs_diff >> 1, 5); } - // AC coefficients. - { - int scan_pos = 1; - int first_ac = 1; - int prev_level = state->prev_1st_ac_level; - int prev_run = 0; - - do { - int coeff_zero_run; - - k_param = av_clip(prev_run >> 2, 0, 2); - coeff_zero_run = apv_read_vlc(gbc, k_param, lut); - - if (coeff_zero_run > APV_BLK_COEFFS - scan_pos) { - av_log(state->log_ctx, AV_LOG_ERROR, - "Out-of-range zero-run value: %d (at scan pos %d)\n", - coeff_zero_run, scan_pos); - return AVERROR_INVALIDDATA; - } + // Alternate reading run and level until reaching the end of + // the block. 
+ scan_pos = 1; + next_is_run = 1; + while (1) { + uint32_t next_bits, lut_bits; + const APVMultiVLCLUTEntry *ent; + + // Read 18 bits and look it up the first part in either the + // run-first or the level-first table. If the next code is too + // long the 18 bits will allow resolving a run code (up to 63) + // without reading any more bits, and will allow the exact + // length of a level code to be determined. (Note that the + // single-symbol LUT is never useful here as the multisymbol + // lookup has already determined that the code is too long.) + + next_bits = show_bits(gbc, 18); + lut_bits = next_bits >> (18 - APV_VLC_LUT_BITS); + + if (next_is_run) { + + ent = &lut->run_first_lut[k_run][k_level][lut_bits]; + + if (ent->count == 0) { + // One long code. + uint32_t bits, low_bits; + unsigned int leading_zeroes, low_bit_count, low_bit_shift; + int run; + + // Remove the prefix bits. + bits = next_bits & 0xffff; + // Determine code length. + leading_zeroes = 15 - av_log2(bits); + // Extract the low bits. + low_bit_count = leading_zeroes + k_run; + low_bit_shift = 16 - (1 + 2 * leading_zeroes + k_run); + low_bits = (bits >> low_bit_shift) & ((1 << low_bit_count) - 1); + // Construct run code. + run = (2 << k_run) + + ((1 << leading_zeroes) - 1) * (1 << k_run) + + low_bits; + // Skip over the bits just used. + skip_bits(gbc, 2 + leading_zeroes + 1 + low_bit_count); + + scan_pos += run; + if (scan_pos >= 64) + break; + k_run = FFMIN(run >> 2, 2); + next_is_run = 0; - for (int i = 0; i < coeff_zero_run; i++) { - coeff[ff_zigzag_direct[scan_pos]] = 0; - ++scan_pos; - } - prev_run = coeff_zero_run; - - if (scan_pos < APV_BLK_COEFFS) { - int abs_ac_coeff_minus1; - int sign_ac_coeff; - int level; - - k_param = av_clip(prev_level >> 2, 0, 4); - abs_ac_coeff_minus1 = apv_read_vlc(gbc, k_param, lut); - sign_ac_coeff = get_bits(gbc, 1); + } else { + // One or more short codes. 
- if (sign_ac_coeff) - level = -abs_ac_coeff_minus1 - 1; - else - level = abs_ac_coeff_minus1 + 1; - - if (level < APV_MIN_TRANS_COEFF || - level > APV_MAX_TRANS_COEFF) { - av_log(state->log_ctx, AV_LOG_ERROR, - "Out-of-range AC coefficient value: %d " - "(from prev_level %d abs_ac_coeff_minus1 %d sign_ac_coeff %d)\n", - level, prev_level, abs_ac_coeff_minus1, sign_ac_coeff); + scan_pos += ent->run[0]; + if (scan_pos >= 64) { + skip_bits(gbc, ent->offset[0]); + break; } + if (ent->count > 1) { + coeff[ff_zigzag_direct[scan_pos]] = ent->level[0]; + ++scan_pos; + if (first_ac) { + state->prev_k_level = ent->k_level_0; + first_ac = 0; + } + if (scan_pos >= 64) { + skip_bits(gbc, ent->offset[1]); + break; + } + } + if (ent->count > 2) { + scan_pos += ent->run[1]; + if (scan_pos >= 64) { + skip_bits(gbc, ent->offset[2]); + break; + } + } + if (ent->count > 3) { + coeff[ff_zigzag_direct[scan_pos]] = ent->level[1]; + ++scan_pos; + if (scan_pos >= 64) { + skip_bits(gbc, ent->offset[3]); + break; + } + } + skip_bits(gbc, ent->offset[3]); + k_run = ent->k_run; + k_level = ent->k_level_1; + next_is_run = !(ent->count & 1); + } + } else { + + ent = &lut->level_first_lut[k_run][k_level][lut_bits]; + + if (ent->count == 0) { + // One long code. + uint32_t bits; + unsigned int leading_zeroes; + int level, abs_level, sign; + + // Remove the prefix bits. + bits = next_bits & 0xffff; + // Determine code length. + leading_zeroes = 15 - av_log2(bits); + // Skip the prefix and length bits. + skip_bits(gbc, 2 + leading_zeroes + 1); + // Read the rest of the code and construct the level. + // Include the + 1 offset for nonzero value here. 
+ abs_level = (2 << k_level) + + ((1 << leading_zeroes) - 1) * (1 << k_level) + + get_bits(gbc, leading_zeroes + k_level) + 1; + + sign = get_bits(gbc, 1); + if (sign) + level = -abs_level; + else + level = abs_level; coeff[ff_zigzag_direct[scan_pos]] = level; - - prev_level = abs_ac_coeff_minus1 + 1; + ++scan_pos; + k_level = FFMIN(abs_level >> 2, 4); if (first_ac) { - state->prev_1st_ac_level = prev_level; + state->prev_k_level = k_level; first_ac = 0; } + if (scan_pos >= 64) + break; + next_is_run = 1; + + } else { + // One or more short codes. + coeff[ff_zigzag_direct[scan_pos]] = ent->level[0]; ++scan_pos; + if (first_ac) { + state->prev_k_level = ent->k_level_0; + first_ac = 0; + } + if (scan_pos >= 64) { + skip_bits(gbc, ent->offset[0]); + break; + } + if (ent->count > 1) { + scan_pos += ent->run[0]; + if (scan_pos >= 64) { + skip_bits(gbc, ent->offset[1]); + break; + } + } + if (ent->count > 2) { + coeff[ff_zigzag_direct[scan_pos]] = ent->level[1]; + ++scan_pos; + if (scan_pos >= 64) { + skip_bits(gbc, ent->offset[2]); + break; + } + } + if (ent->count > 3) { + scan_pos += ent->run[1]; + if (scan_pos >= 64) { + skip_bits(gbc, ent->offset[3]); + break; + } + } + skip_bits(gbc, ent->offset[3]); + k_run = ent->k_run; + k_level = ent->k_level_1; + next_is_run = ent->count & 1; } - - } while (scan_pos < APV_BLK_COEFFS); + } } return 0; -- 2.47.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".