---
Decode up to four symbols per step with larger lookup tables.  This is highly 
finicky because a lot of internal state has to be tracked and it therefore 
doesn't fit at all into the existing multisymbol VLC structure.

On a big core (Alder Lake P core) this makes the whole decoder 30-90% faster 
(largest gains at high bitrate).  On a smaller core (Alder Lake E core) the 
gain is less, but is still there.  There may be some better tuning for the 
smaller core here (make tables smaller by skipping lookup in some cases?) but I 
have not pursued it.

Thanks,

- Mark

 libavcodec/apv_decode.c  |   5 +-
 libavcodec/apv_decode.h  |  59 ++++--
 libavcodec/apv_entropy.c | 442 ++++++++++++++++++++++++++++-----------
 3 files changed, 363 insertions(+), 143 deletions(-)

diff --git a/libavcodec/apv_decode.c b/libavcodec/apv_decode.c
index e15c125b58..eb47298e2e 100644
--- a/libavcodec/apv_decode.c
+++ b/libavcodec/apv_decode.c
@@ -160,6 +160,7 @@ static int apv_decode_block(AVCodecContext *avctx,
     int err;
 
     LOCAL_ALIGNED_32(int16_t, coeff, [64]);
+    memset(coeff, 0, 64 * sizeof(int16_t));
 
     err = ff_apv_entropy_decode_block(coeff, gbc, entropy_state);
     if (err < 0)
@@ -216,8 +217,8 @@ static int apv_decode_tile_component(AVCodecContext *avctx, void *data,
         .log_ctx           = avctx,
         .decode_lut        = &decode_lut,
         .prev_dc           = 0,
-        .prev_dc_diff      = 20,
-        .prev_1st_ac_level = 0,
+        .prev_k_dc         = 5,
+        .prev_k_level      = 0,
     };
 
     int err;
diff --git a/libavcodec/apv_decode.h b/libavcodec/apv_decode.h
index 34c6176ea0..d94802e208 100644
--- a/libavcodec/apv_decode.h
+++ b/libavcodec/apv_decode.h
@@ -33,14 +33,39 @@
 #define APV_VLC_LUT_BITS 9
 #define APV_VLC_LUT_SIZE (1 << APV_VLC_LUT_BITS)
 
-typedef struct APVVLCLUTEntry {
+typedef struct APVSingleVLCLUTEntry {
     uint16_t result;  // Return value if not reading more.
     uint8_t  consume; // Number of bits to consume.
     uint8_t  more;    // Whether to read additional bits.
-} APVVLCLUTEntry;
+} APVSingleVLCLUTEntry;
+
+typedef struct APVMultiVLCLUTEntry {
+    // Number of symbols this bit stream resolves to.
+    uint8_t count;
+    // k_run after decoding all symbols.
+    uint8_t k_run     : 2;
+    // k_level after decoding the first level symbol.
+    uint8_t k_level_0 : 3;
+    // k_level after decoding all the symbols.
+    uint8_t k_level_1 : 3;
+    // Run output values.
+    uint8_t run[2];
+    // Level output values.
+    int16_t level[2];
+    // Bit index of the end of each code.
+    uint8_t offset[4];
+} APVMultiVLCLUTEntry;
 
 typedef struct APVVLCLUT {
-    APVVLCLUTEntry lut[6][APV_VLC_LUT_SIZE];
+    // Single-symbol LUT for VLCs.
+    // Applies to all coefficients, but used only for DC coefficients
+    // in the decoder.
+    APVSingleVLCLUTEntry single_lut[6][APV_VLC_LUT_SIZE];
+    // Multi-symbol LUT for run/level combinations, decoding up to four
+    // symbols per step.  Comes in two versions, which to use depends on
+    // whether the next symbol is a run or a level.
+    APVMultiVLCLUTEntry run_first_lut[3][5][APV_VLC_LUT_SIZE];
+    APVMultiVLCLUTEntry level_first_lut[3][5][APV_VLC_LUT_SIZE];
 } APVVLCLUT;
 
 typedef struct APVEntropyState {
@@ -48,33 +73,29 @@ typedef struct APVEntropyState {
 
     const APVVLCLUT *decode_lut;
 
+    // Previous DC level value.
     int16_t prev_dc;
-    int16_t prev_dc_diff;
-    int16_t prev_1st_ac_level;
+    // k parameter implied by the previous DC level value.
+    uint8_t prev_k_dc;
+    // k parameter implied by the previous first AC level value.
+    uint8_t prev_k_level;
 } APVEntropyState;
 
 
 /**
- * Build the decoder VLC look-up table.
+ * Build the decoder VLC look-up tables.
  */
 void ff_apv_entropy_build_decode_lut(APVVLCLUT *decode_lut);
 
 /**
  * Entropy decode a single 8x8 block to coefficients.
  *
- * Outputs in block order (dezigzag already applied).
+ * Writes only the nonzero coefficients, placing them in row-major
+ * block order (dezigzag is applied within the function).  The output
+ * block must have been filled with zeroes before calling this function.
  */
-int ff_apv_entropy_decode_block(int16_t *coeff,
-                                GetBitContext *gbc,
-                                APVEntropyState *state);
-
-/**
- * Read a single APV VLC code.
- *
- * This entrypoint is exposed for testing.
- */
-unsigned int ff_apv_read_vlc(GetBitContext *gbc, int k_param,
-                             const APVVLCLUT *lut);
-
+int ff_apv_entropy_decode_block(int16_t *restrict coeff,
+                                GetBitContext *restrict gbc,
+                                APVEntropyState *restrict state);
 
 #endif /* AVCODEC_APV_DECODE_H */
diff --git a/libavcodec/apv_entropy.c b/libavcodec/apv_entropy.c
index a5648c09b4..49b568c094 100644
--- a/libavcodec/apv_entropy.c
+++ b/libavcodec/apv_entropy.c
@@ -19,15 +19,55 @@
 #include "apv.h"
 #include "apv_decode.h"
 
+#include "put_bits.h"
+
+
+av_always_inline
+static unsigned int apv_read_vlc(GetBitContext *restrict gbc, int k_param,
+                                 const APVVLCLUT *restrict lut)
+{
+    unsigned int next_bits;
+    const APVSingleVLCLUTEntry *ent;
+
+    next_bits = show_bits(gbc, APV_VLC_LUT_BITS);
+    ent = &lut->single_lut[k_param][next_bits];
+
+    if (ent->more) {
+        unsigned int leading_zeroes;
+
+        skip_bits(gbc, ent->consume);
+
+        next_bits = show_bits(gbc, 16);
+        leading_zeroes = 15 - av_log2(next_bits);
+
+        if (leading_zeroes == 0) {
+            // This can't happen mid-stream because the lookup would
+            // have resolved a leading one into a shorter code, but it
+            // can happen if we are hitting the end of the buffer.
+            // Return an invalid code to propagate as an error.
+            return APV_MAX_TRANS_COEFF + 1;
+        }
+
+        skip_bits(gbc, leading_zeroes + 1);
+
+        return (2 << k_param) +
+            ((1 << leading_zeroes) - 1) * (1 << k_param) +
+            get_bits(gbc, leading_zeroes + k_param);
+    } else {
+        skip_bits(gbc, ent->consume);
+        return ent->result;
+    }
+}
 
 void ff_apv_entropy_build_decode_lut(APVVLCLUT *decode_lut)
 {
     const int code_len = APV_VLC_LUT_BITS;
     const int lut_size = APV_VLC_LUT_SIZE;
 
+    // Build the single-symbol VLC table.
     for (int k = 0; k <= 5; k++) {
         for (unsigned int code = 0; code < lut_size; code++) {
-            APVVLCLUTEntry *ent = &decode_lut->lut[k][code];
+            APVSingleVLCLUTEntry   *ent = &decode_lut->single_lut[k][code];
             unsigned int first_bit      = code & (1 << code_len - 1);
             unsigned int remaining_bits = code ^ first_bit;
 
@@ -64,152 +104,310 @@ void ff_apv_entropy_build_decode_lut(APVVLCLUT *decode_lut)
             }
         }
     }
-}
-
-av_always_inline
-static unsigned int apv_read_vlc(GetBitContext *gbc, int k_param,
-                                 const APVVLCLUT *lut)
-{
-    unsigned int next_bits;
-    const APVVLCLUTEntry *ent;
 
-    next_bits = show_bits(gbc, APV_VLC_LUT_BITS);
-    ent = &lut->lut[k_param][next_bits];
-
-    if (ent->more) {
-        unsigned int leading_zeroes;
-
-        skip_bits(gbc, ent->consume);
-
-        next_bits = show_bits(gbc, 16);
-        leading_zeroes = 15 - av_log2(next_bits);
-
-        if (leading_zeroes == 0) {
-            // This can't happen mid-stream because the lookup would
-            // have resolved a leading one into a shorter code, but it
-            // can happen if we are hitting the end of the buffer.
-            // Return an invalid code to propagate as an error.
-            return APV_MAX_TRANS_COEFF + 1;
+    // Build the multi-symbol VLC table.
+    for (int start_run = 0; start_run <= 2; start_run++) {
+        for (int start_level = 0; start_level <= 4; start_level++) {
+            for (unsigned int code = 0; code < lut_size; code++) {
+                APVMultiVLCLUTEntry *ent;
+                int k_run, k_level;
+                GetBitContext gbc;
+                PutBitContext pbc;
+                uint8_t buffer[16];
+                uint8_t run_first_buffer[16];
+                uint8_t level_first_buffer[16];
+
+                memset(buffer, 0, sizeof(buffer));
+                init_put_bits(&pbc, buffer, sizeof(buffer));
+                put_bits(&pbc, APV_VLC_LUT_BITS, code);
+                flush_put_bits(&pbc);
+
+                memcpy(run_first_buffer,   buffer, sizeof(buffer));
+                memcpy(level_first_buffer, buffer, sizeof(buffer));
+
+                k_run   = start_run;
+                k_level = start_level;
+
+                ent = &decode_lut->run_first_lut[k_run][k_level][code];
+                memset(ent, 0, sizeof(*ent));
+                init_get_bits8(&gbc, run_first_buffer, sizeof(run_first_buffer));
+
+                ent->count = 0;
+                for (int i = 0; i <= 1; i++) {
+                    int value, sign, pos;
+
+                    value = apv_read_vlc(&gbc, k_run, decode_lut);
+                    pos = get_bits_count(&gbc);
+                    if (pos > APV_VLC_LUT_BITS)
+                        break;
+                    ent->run[i] = value;
+                    ent->offset[ent->count] = pos;
+                    ++ent->count;
+                    k_run = FFMIN(value >> 2, 2);
+
+                    value = apv_read_vlc(&gbc, k_level, decode_lut);
+                    sign = get_bits1(&gbc);
+                    pos = get_bits_count(&gbc);
+                    if (pos > APV_VLC_LUT_BITS)
+                        break;
+                    ++value;
+                    ent->level[i] = sign ? -value : value;
+                    ent->offset[ent->count] = pos;
+                    ++ent->count;
+                    k_level = FFMIN(value >> 2, 4);
+                    if (i == 0)
+                        ent->k_level_0 = k_level;
+                }
+                if (ent->count > 0 && ent->count < 4)
+                    ent->offset[3] = ent->offset[ent->count - 1];
+                ent->k_run     = k_run;
+                ent->k_level_1 = k_level;
+
+                k_run   = start_run;
+                k_level = start_level;
+
+                ent = &decode_lut->level_first_lut[k_run][k_level][code];
+                memset(ent, 0, sizeof(*ent));
+                init_get_bits8(&gbc, level_first_buffer, sizeof(level_first_buffer));
+
+                ent->count = 0;
+                for (int i = 0; i <= 1; i++) {
+                    int value, sign, pos;
+
+                    value = apv_read_vlc(&gbc, k_level, decode_lut);
+                    sign = get_bits1(&gbc);
+                    pos = get_bits_count(&gbc);
+                    if (pos > APV_VLC_LUT_BITS)
+                        break;
+                    ++value;
+                    ent->level[i] = sign ? -value : value;
+                    ent->offset[ent->count] = pos;
+                    ++ent->count;
+                    k_level = FFMIN(value >> 2, 4);
+                    if (i == 0)
+                        ent->k_level_0 = k_level;
+
+                    value = apv_read_vlc(&gbc, k_run, decode_lut);
+                    pos = get_bits_count(&gbc);
+                    if (pos > APV_VLC_LUT_BITS)
+                        break;
+                    ent->run[i] = value;
+                    ent->offset[ent->count] = pos;
+                    ++ent->count;
+                    k_run = FFMIN(value >> 2, 2);
+                }
+                if (ent->count > 0 && ent->count < 4)
+                    ent->offset[3] = ent->offset[ent->count - 1];
+                ent->k_run     = k_run;
+                ent->k_level_1 = k_level;
+            }
         }
-
-        skip_bits(gbc, leading_zeroes + 1);
-
-        return (2 << k_param) +
-            ((1 << leading_zeroes) - 1) * (1 << k_param) +
-            get_bits(gbc, leading_zeroes + k_param);
-    } else {
-        skip_bits(gbc, ent->consume);
-        return ent->result;
     }
 }
 
-unsigned int ff_apv_read_vlc(GetBitContext *gbc, int k_param,
-                             const APVVLCLUT *lut)
-{
-    return apv_read_vlc(gbc, k_param, lut);
-}
-
-int ff_apv_entropy_decode_block(int16_t *coeff,
-                                GetBitContext *gbc,
-                                APVEntropyState *state)
+int ff_apv_entropy_decode_block(int16_t *restrict coeff,
+                                GetBitContext *restrict gbc,
+                                APVEntropyState *restrict state)
 {
     const APVVLCLUT *lut = state->decode_lut;
-    int k_param;
+    int scan_pos, next_is_run;
+    int k_dc = state->prev_k_dc;
+    int k_run = 0;
+    int k_level = state->prev_k_level;
+    int first_ac = 1;
 
-    // DC coefficient.
+    // Read one DC coefficient.
     {
-        int abs_dc_coeff_diff;
-        int sign_dc_coeff_diff;
-        int dc_coeff;
-
-        k_param = av_clip(state->prev_dc_diff >> 1, 0, 5);
-        abs_dc_coeff_diff = apv_read_vlc(gbc, k_param, lut);
-
-        if (abs_dc_coeff_diff > 0)
-            sign_dc_coeff_diff = get_bits1(gbc);
-        else
-            sign_dc_coeff_diff = 0;
-
-        if (sign_dc_coeff_diff)
-            dc_coeff = state->prev_dc - abs_dc_coeff_diff;
-        else
-            dc_coeff = state->prev_dc + abs_dc_coeff_diff;
-
-        if (dc_coeff < APV_MIN_TRANS_COEFF ||
-            dc_coeff > APV_MAX_TRANS_COEFF) {
-            av_log(state->log_ctx, AV_LOG_ERROR,
-                   "Out-of-range DC coefficient value: %d "
-                   "(from prev_dc %d abs_dc_coeff_diff %d sign_dc_coeff_diff %d)\n",
-                   dc_coeff, state->prev_dc, abs_dc_coeff_diff, sign_dc_coeff_diff);
-            return AVERROR_INVALIDDATA;
+        int dc, abs_diff, sign;
+
+        abs_diff = apv_read_vlc(gbc, k_dc, lut);
+
+        if (abs_diff) {
+            sign = get_bits1(gbc);
+            if (sign)
+                dc = state->prev_dc - abs_diff;
+            else
+                dc = state->prev_dc + abs_diff;
+        } else {
+            dc = state->prev_dc;
         }
 
-        coeff[0] = dc_coeff;
+        coeff[0] = dc;
 
-        state->prev_dc      = dc_coeff;
-        state->prev_dc_diff = abs_dc_coeff_diff;
+        state->prev_dc   = dc;
+        state->prev_k_dc = FFMIN(abs_diff >> 1, 5);
     }
 
-    // AC coefficients.
-    {
-        int scan_pos   = 1;
-        int first_ac   = 1;
-        int prev_level = state->prev_1st_ac_level;
-        int prev_run   = 0;
-
-        do {
-            int coeff_zero_run;
-
-            k_param = av_clip(prev_run >> 2, 0, 2);
-            coeff_zero_run = apv_read_vlc(gbc, k_param, lut);
-
-            if (coeff_zero_run > APV_BLK_COEFFS - scan_pos) {
-                av_log(state->log_ctx, AV_LOG_ERROR,
-                       "Out-of-range zero-run value: %d (at scan pos %d)\n",
-                       coeff_zero_run, scan_pos);
-                return AVERROR_INVALIDDATA;
-            }
+    // Alternate reading run and level until reaching the end of
+    // the block.
+    scan_pos    = 1;
+    next_is_run = 1;
+    while (1) {
+        uint32_t next_bits, lut_bits;
+        const APVMultiVLCLUTEntry *ent;
+
+        // Read 18 bits and look up the first part in either the
+        // run-first or the level-first table.  If the next code is too
+        // long the 18 bits will allow resolving a run code (up to 63)
+        // without reading any more bits, and will allow the exact
+        // length of a level code to be determined.  (Note that the
+        // single-symbol LUT is never useful here as the multisymbol
+        // lookup has already determined that the code is too long.)
+
+        next_bits = show_bits(gbc, 18);
+        lut_bits = next_bits >> (18 - APV_VLC_LUT_BITS);
+
+        if (next_is_run) {
+
+            ent = &lut->run_first_lut[k_run][k_level][lut_bits];
+
+            if (ent->count == 0) {
+                // One long code.
+                uint32_t bits, low_bits;
+                unsigned int leading_zeroes, low_bit_count, low_bit_shift;
+                int run;
+
+                // Remove the prefix bits.
+                bits = next_bits & 0xffff;
+                // Determine code length.
+                leading_zeroes = 15 - av_log2(bits);
+                // Extract the low bits.
+                low_bit_count = leading_zeroes + k_run;
+                low_bit_shift = 16 - (1 + 2 * leading_zeroes + k_run);
+                low_bits = (bits >> low_bit_shift) & ((1 << low_bit_count) - 1);
+                // Construct run code.
+                run = (2 << k_run) +
+                    ((1 << leading_zeroes) - 1) * (1 << k_run) +
+                    low_bits;
+                // Skip over the bits just used.
+                skip_bits(gbc, 2 + leading_zeroes + 1 + low_bit_count);
+
+                scan_pos += run;
+                if (scan_pos >= 64)
+                    break;
+                k_run = FFMIN(run >> 2, 2);
+                next_is_run = 0;
 
-            for (int i = 0; i < coeff_zero_run; i++) {
-                coeff[ff_zigzag_direct[scan_pos]] = 0;
-                ++scan_pos;
-            }
-            prev_run = coeff_zero_run;
-
-            if (scan_pos < APV_BLK_COEFFS) {
-                int abs_ac_coeff_minus1;
-                int sign_ac_coeff;
-                int level;
-
-                k_param = av_clip(prev_level >> 2, 0, 4);
-                abs_ac_coeff_minus1 = apv_read_vlc(gbc, k_param, lut);
-                sign_ac_coeff = get_bits(gbc, 1);
+            } else {
+                // One or more short codes.
 
-                if (sign_ac_coeff)
-                    level = -abs_ac_coeff_minus1 - 1;
-                else
-                    level = abs_ac_coeff_minus1 + 1;
-
-                if (level < APV_MIN_TRANS_COEFF ||
-                    level > APV_MAX_TRANS_COEFF) {
-                    av_log(state->log_ctx, AV_LOG_ERROR,
-                           "Out-of-range AC coefficient value: %d "
-                           "(from prev_level %d abs_ac_coeff_minus1 %d sign_ac_coeff %d)\n",
-                           level, prev_level, abs_ac_coeff_minus1, sign_ac_coeff);
+                scan_pos += ent->run[0];
+                if (scan_pos >= 64) {
+                    skip_bits(gbc, ent->offset[0]);
+                    break;
                 }
+                if (ent->count > 1) {
+                    coeff[ff_zigzag_direct[scan_pos]] = ent->level[0];
+                    ++scan_pos;
+                    if (first_ac) {
+                        state->prev_k_level = ent->k_level_0;
+                        first_ac = 0;
+                    }
+                    if (scan_pos >= 64) {
+                        skip_bits(gbc, ent->offset[1]);
+                        break;
+                    }
+                }
+                if (ent->count > 2) {
+                    scan_pos += ent->run[1];
+                    if (scan_pos >= 64) {
+                        skip_bits(gbc, ent->offset[2]);
+                        break;
+                    }
+                }
+                if (ent->count > 3) {
+                    coeff[ff_zigzag_direct[scan_pos]] = ent->level[1];
+                    ++scan_pos;
+                    if (scan_pos >= 64) {
+                        skip_bits(gbc, ent->offset[3]);
+                        break;
+                    }
+                }
+                skip_bits(gbc, ent->offset[3]);
+                k_run   = ent->k_run;
+                k_level = ent->k_level_1;
+                next_is_run = !(ent->count & 1);
+            }
 
+        } else {
+
+            ent = &lut->level_first_lut[k_run][k_level][lut_bits];
+
+            if (ent->count == 0) {
+                // One long code.
+                uint32_t bits;
+                unsigned int leading_zeroes;
+                int level, abs_level, sign;
+
+                // Remove the prefix bits.
+                bits = next_bits & 0xffff;
+                // Determine code length.
+                leading_zeroes = 15 - av_log2(bits);
+                // Skip the prefix and length bits.
+                skip_bits(gbc, 2 + leading_zeroes + 1);
+                // Read the rest of the code and construct the level.
+                // Include the + 1 offset for nonzero value here.
+                abs_level = (2 << k_level) +
+                    ((1 << leading_zeroes) - 1) * (1 << k_level) +
+                    get_bits(gbc, leading_zeroes + k_level) + 1;
+
+                sign = get_bits(gbc, 1);
+                if (sign)
+                    level = -abs_level;
+                else
+                    level = abs_level;
                 coeff[ff_zigzag_direct[scan_pos]] = level;
-
-                prev_level = abs_ac_coeff_minus1 + 1;
+                ++scan_pos;
+                k_level = FFMIN(abs_level >> 2, 4);
                 if (first_ac) {
-                    state->prev_1st_ac_level = prev_level;
+                    state->prev_k_level = k_level;
                     first_ac = 0;
                 }
+                if (scan_pos >= 64)
+                    break;
+                next_is_run = 1;
+
+            } else {
+                // One or more short codes.
 
+                coeff[ff_zigzag_direct[scan_pos]] = ent->level[0];
                 ++scan_pos;
+                if (first_ac) {
+                    state->prev_k_level = ent->k_level_0;
+                    first_ac = 0;
+                }
+                if (scan_pos >= 64) {
+                    skip_bits(gbc, ent->offset[0]);
+                    break;
+                }
+                if (ent->count > 1) {
+                    scan_pos += ent->run[0];
+                    if (scan_pos >= 64) {
+                        skip_bits(gbc, ent->offset[1]);
+                        break;
+                    }
+                }
+                if (ent->count > 2) {
+                    coeff[ff_zigzag_direct[scan_pos]] = ent->level[1];
+                    ++scan_pos;
+                    if (scan_pos >= 64) {
+                        skip_bits(gbc, ent->offset[2]);
+                        break;
+                    }
+                }
+                if (ent->count > 3) {
+                    scan_pos += ent->run[1];
+                    if (scan_pos >= 64) {
+                        skip_bits(gbc, ent->offset[3]);
+                        break;
+                    }
+                }
+                skip_bits(gbc, ent->offset[3]);
+                k_run   = ent->k_run;
+                k_level = ent->k_level_1;
+                next_is_run = ent->count & 1;
             }
-
-        } while (scan_pos < APV_BLK_COEFFS);
+        }
     }
 
     return 0;
-- 
2.47.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to