[f265 dev team] Algorithmic patch 2

Laurent Birtz Mon, 14 Jul 2014 15:55:26 -0700

Update quality=10, quality=15 presets.
Add thresholds for mode analysis.
Add transform tree early termination.
Transform block encoding function tweaks.
Fix finline macro for clang.

Note: I had to invert the control flow of the TT (transform tree) exploration function; you can skip that part in the review.


Thanks,
Laurent

diff --git a/f265/analyze.c b/f265/analyze.c
index 266b020..210e602 100644
--- a/f265/analyze.c
+++ b/f265/analyze.c
@@ -33,7 +33,29 @@
 //   Bit 8 enables the second RDO pass.
 //
 // Bit 10 enables fake luma interpolation.
+// Bit 11 enables the use of thresholds for the mode analysis.
 // Bit 12 forces the reconstruction of blocks in RDM.
+// Bit 14 early terminates the transform tree exploration.
+
+// Description of some failed experiments.
+//
+// Deferred RDO transform tree analysis. For each CB mode, analyze only the
+// largest transform block in RDO. Refine the full transform tree of the best CB
+// mode found. It turns out that this lowers both quality and performance. The
+// transform tree seems to matter a lot for choosing the best CB mode. Aggressive
+// filtering using the RDM costs yields only one or a few candidates per CB, so
+// the overhead of processing the largest transform block twice cancels the
+// performance benefits.
+//
+// Skipping the analysis of 32x32 intra if the children do not choose intra
+// 16x16. This causes a large quality penalty. 32x32 intra is chosen regularly
+// even when inter, intra 8x8 and intra 4x4 are present in the smaller CBs. The
+// correlation between the intra 32x32 RDM cost and the 16x16 intra RDM costs is
+// very poor.
+
+
+// FIXME. Used to estimate the quality loss if we skip the chroma RDO process.
+//#define NO_CHROMA_RDO
 
 
 // FIXME:
@@ -48,12 +70,17 @@
 ///////////////////////////////////////////////////////////////////////////////
 // Temporary debugging code.
 
-// Print the analysis costs. Obsolete.
+// Print the analysis costs.
 //#define VAN_TRACE_ANALYSIS
 
 // Verify the CB inter cost in RDM mode. No effect in RDO mode.
 //#define VAN_VERIFY_CB_INTER_COST
 
+#ifdef VAN_TRACE_ANALYSIS
+// Set to 0 to disable tracing.
+int fenc_trace_analysis_flag = 0;
+#endif
+
 void fenc_an_cb_loc(f265_enc_thread *t, f265_cb *cb, int comp)
 {
     int csf = !!comp;
@@ -750,45 +777,14 @@ static int64_t fenc_analyze_intra_luma_tt(f265_enc_thread *t, f265_cb *cb, int m
 
     // Get the split flag costs.
     uint16_t split_flag_costs[2] = { 0, 0 };
-    #if 0
-    if (split_present_flag)
-    #else
     if (t->an.rdm_flag && split_present_flag)
-    #endif
     {
         int off = (5-lg_bs)<<1;
         for (int i = 0; i < 2; i++) split_flag_costs[i] = t->an.se_costs[F265_SE_SPLIT_TRANSFORM + off + i];
     }
 
-    // FIXME: consider extracting this code in a function.
-    if (split_flag)
-    {
-        // Add a split transform node.
-        *t->tt.tn++ = 15;
-
-        int64_t split_flag_cost = 0;
-        if (t->an.rdm_flag) split_flag_cost = split_flag_costs[1];
-        else if (split_present_flag)
-        {
-            F265_RDO_COST(split_flag_cost,
-                          fenc_encode_context_bin(&t->cbs, F265_CO_SPLIT_TRANSFORM+5-lg_bs, 1));
-        }
-
-        fenc_stash_push(t);
-        int64_t subtree_cost = 0;
-        for (int i = 0, sbs = 1<<(lg_bs-1); i < 4; i++)
-            subtree_cost += fenc_analyze_intra_luma_tt(t, cb, mode, depth_range, depth+1, lg_bs-1,
-                                                       cb_ox + (i&1)*sbs, cb_oy + (i>>1)*sbs);
-
-        fenc_stash_pop(t);
-
-        split_cost = split_flag_cost + subtree_cost;
-    }
-
     if (unsplit_flag)
     {
-        if (stash_flag) fenc_stash_save_reset(t);
-
         // Add an unsplit transform node.
         *t->tt.tn++ = 7;
 
@@ -820,16 +816,46 @@ static int64_t fenc_analyze_intra_luma_tt(f265_enc_thread *t, f265_cb *cb, int m
         }
 
         unsplit_cost = unsplit_flag_cost + cbf_cost + fudge + tb_cost;
+
+        // Early terminate the exploration.
+        if (t->enc->gd.algo&(1<<14) && !nz_flag) split_flag = stash_flag = 0;
     }
 
+    if (split_flag)
+    {
+        if (stash_flag) fenc_stash_save_reset(t);
+
+        // Add a split transform node.
+        *t->tt.tn++ = 15;
+
+        int64_t split_flag_cost = 0;
+        if (t->an.rdm_flag) split_flag_cost = split_flag_costs[1];
+        else if (split_present_flag)
+        {
+            F265_RDO_COST(split_flag_cost,
+                          fenc_encode_context_bin(&t->cbs, F265_CO_SPLIT_TRANSFORM+5-lg_bs, 1));
+        }
+
+        fenc_stash_push(t);
+        int64_t subtree_cost = 0;
+        for (int i = 0, sbs = 1<<(lg_bs-1); i < 4; i++)
+            subtree_cost += fenc_analyze_intra_luma_tt(t, cb, mode, depth_range, depth+1, lg_bs-1,
+                                                       cb_ox + (i&1)*sbs, cb_oy + (i>>1)*sbs);
+
+        fenc_stash_pop(t);
+
+        split_cost = split_flag_cost + subtree_cost;
+    }
+
+
     if (split_cost < unsplit_cost)
     {
-        if (stash_flag) fenc_stash_restore(t);
         best_cost = split_cost;
     }
 
     else
     {
+        if (stash_flag) fenc_stash_restore(t);
         best_cost = unsplit_cost;
     }
 
@@ -920,6 +946,10 @@ static int64_t fenc_analyze_inter_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
 
     int64_t cost = coeff_cost + dist;
 
+    #ifdef NO_CHROMA_RDO
+    if (comp) cost = 0;
+    #endif
+
     return cost;
 }
 
@@ -967,6 +997,10 @@ static int64_t fenc_analyze_inter_tb_full(f265_enc_thread *t, f265_cb *cb, int *
 
     fenc_stash_pop(t);
 
+    #ifdef NO_CHROMA_RDO
+    if (comp) ret_cost = 0;
+    #endif
+
     return ret_cost;
 }
 
@@ -984,8 +1018,58 @@ static int64_t fenc_analyze_inter_sub_tt(f265_enc_thread *t, f265_cb *cb, int *y
 
     if (stash_flag) fenc_stash_init_cb_off(t, cb, cb_ox, cb_oy, 1<<lg_bs, 1<<lg_bs);
 
+    if (unsplit_flag)
+    {
+        // Add an unsplit transform node.
+        uint8_t *tn = t->tt.tn++;
+
+        int64_t unsplit_flag_cost = 0;
+        if (split_present_flag)
+        {
+            F265_RDO_COST(unsplit_flag_cost,
+                          fenc_encode_context_bin(&t->cbs, F265_CO_SPLIT_TRANSFORM+5-lg_bs, 0));
+        }
+
+        // Encode the luma transform block.
+        int nz_flag;
+        int64_t tb_cost = fenc_analyze_inter_tb_full(t, cb, &nz_flag, 0, depth, lg_bs, cb_ox, cb_oy);
+        unsplit_yuv_flags |= nz_flag;
+
+        // Encode the chroma transform blocks.
+        // FIXME: we should consider the tentative chroma flags for the
+        // split/unsplit comparison (without passing up the tentative costs).
+        if (lg_bs > 2 || block_idx == 3)
+        {
+            int chroma_lg_bs = F265_MAX(lg_bs-1, 2);
+            int chroma_off[2] = { (cb_ox>>1)&~3, (cb_oy>>1)&~3 };
+
+            for (int comp = 1; comp < 3; comp++)
+            {
+                tb_cost += fenc_analyze_inter_tb_full(t, cb, &nz_flag, comp, depth, chroma_lg_bs,
+                                                      chroma_off[0], chroma_off[1]);
+                unsplit_yuv_flags |= nz_flag<<comp;
+            }
+        }
+
+        // Encode the luma CBF unless it is inferred by root_cbf.
+        int64_t luma_cbf_cost = 0;
+        if (depth || unsplit_yuv_flags&6)
+        {
+            F265_RDO_COST(luma_cbf_cost,
+                          fenc_encode_context_bin(&t->cbs, F265_CO_CBF_LUMA+!depth, unsplit_yuv_flags&1));
+        }
+
+        unsplit_cost = unsplit_flag_cost + luma_cbf_cost + tb_cost;
+        *tn = unsplit_yuv_flags;
+
+        // Early terminate the exploration.
+        if (t->enc->gd.algo&(1<<14) && !unsplit_yuv_flags) split_flag = stash_flag = 0;
+    }
+
     if (split_flag)
     {
+        if (stash_flag) fenc_stash_save_reset(t);
+
         // Add a split transform node.
         uint8_t *tn = t->tt.tn++;
 
@@ -1041,6 +1125,9 @@ static int64_t fenc_analyze_inter_sub_tt(f265_enc_thread *t, f265_cb *cb, int *y
                         fenc_encode_context_bin(&t->cbs, F265_CO_CBF_CHROMA+depth+1, cbf);
                     }
                 chroma_cbf_cost = fenc_rdo_bit_cost(t, t->cbs.rdo.bits);
+                #ifdef NO_CHROMA_RDO
+                chroma_cbf_cost = 0;
+                #endif
             }
         }
 
@@ -1055,62 +1142,15 @@ static int64_t fenc_analyze_inter_sub_tt(f265_enc_thread *t, f265_cb *cb, int *y
         *tn = 8|split_yuv_flags;
     }
 
-    if (unsplit_flag)
-    {
-        if (stash_flag) fenc_stash_save_reset(t);
-
-        // Add an unsplit transform node.
-        uint8_t *tn = t->tt.tn++;
-
-        int64_t unsplit_flag_cost = 0;
-        if (split_present_flag)
-        {
-            F265_RDO_COST(unsplit_flag_cost,
-                          fenc_encode_context_bin(&t->cbs, F265_CO_SPLIT_TRANSFORM+5-lg_bs, 0));
-        }
-
-        // Encode the luma transform block.
-        int nz_flag;
-        int64_t tb_cost = fenc_analyze_inter_tb_full(t, cb, &nz_flag, 0, depth, lg_bs, cb_ox, cb_oy);
-        unsplit_yuv_flags |= nz_flag;
-
-        // Encode the chroma transform blocks.
-        // FIXME: we should consider the tentative chroma flags for the
-        // split/unsplit comparison (without passing up the tentative costs).
-        if (lg_bs > 2 || block_idx == 3)
-        {
-            int chroma_lg_bs = F265_MAX(lg_bs-1, 2);
-            int chroma_off[2] = { (cb_ox>>1)&~3, (cb_oy>>1)&~3 };
-
-            for (int comp = 1; comp < 3; comp++)
-            {
-                tb_cost += fenc_analyze_inter_tb_full(t, cb, &nz_flag, comp, depth, chroma_lg_bs,
-                                                      chroma_off[0], chroma_off[1]);
-                unsplit_yuv_flags |= nz_flag<<comp;
-            }
-        }
-
-        // Encode the luma CBF unless it is inferred by root_cbf.
-        int64_t luma_cbf_cost = 0;
-        if (depth || unsplit_yuv_flags&6)
-        {
-            F265_RDO_COST(luma_cbf_cost,
-                          fenc_encode_context_bin(&t->cbs, F265_CO_CBF_LUMA+!depth, unsplit_yuv_flags&1));
-        }
-
-        unsplit_cost = unsplit_flag_cost + luma_cbf_cost + tb_cost;
-        *tn = unsplit_yuv_flags;
-    }
-
     if (split_cost < unsplit_cost)
     {
-        if (stash_flag) fenc_stash_restore(t);
         best_cost = split_cost;
         *yuv_flags = split_yuv_flags;
     }
 
     else
     {
+        if (stash_flag) fenc_stash_restore(t);
         best_cost = unsplit_cost;
         *yuv_flags = unsplit_yuv_flags;
     }
@@ -1163,6 +1203,9 @@ static int64_t fenc_analyze_inter_tt_residual(f265_enc_thread *t, f265_cb *cb, i
     for (int i = 0; i < 2; i++)
         fenc_encode_context_bin(&t->cbs, F265_CO_CBF_CHROMA+0, (nz_flags>>(1+i))&1);
     int64_t chroma_cbf_cost = fenc_rdo_bit_cost(t, t->cbs.rdo.bits);
+    #ifdef NO_CHROMA_RDO
+    chroma_cbf_cost = 0;
+    #endif
 
     int64_t residual_cost = root_cbf_cost + chroma_cbf_cost + subtree_cost;
 
@@ -1762,6 +1805,7 @@ static int64_t fenc_analyze_intra_part_luma(f265_enc_thread *t, f265_cb *cb, int
     int64_t best_cost;
 
     // Parse the internal algorithm fields.
+    if (!an->bypass_intra_algo_flag)
     {
         int algo = t->enc->gd.algo, tmp;
 
@@ -1950,144 +1994,164 @@ static int64_t fenc_analyze_intra_part_luma(f265_enc_thread *t, f265_cb *cb, int
     return best_cost;
 }
 
-// Return the cost of encoding the CB with intra prediction.
-// FIXME, split that function.
-static void fenc_analyze_intra_cb(f265_enc_thread *t, f265_cb *cb)
+// Analyze the intra HV mode.
+static void fenc_analyze_intra_hv(f265_enc_thread *t, f265_cb *cb)
 {
     f265_analysis *an = &t->an;
     f265_cb_analysis *cba = t->cba;
     int lg_bs = cb->lg_bs;
-    int split_flag = lg_bs == t->enc->gd.cb_range[0];
     int depth_range[2];
     int save_stash_flags = F265_STASH_ALL;
     int luma_stash_flags = F265_STASH_CABAC|F265_STASH_TN|F265_STASH_Y;
     int chroma_stash_flags = F265_STASH_CABAC|F265_STASH_U|F265_STASH_V;
 
-    // No cache right now.
-    cba->intra_mode_cache_flags[0] = cba->intra_mode_cache_flags[1] = 0;
-
     // Remember the initial transform tree position.
     uint8_t *init_tn = t->tt.tn;
 
     // Set the CB mode to intra.
     F265_SET_FLAG(cb->flags, F265_CB_INTRA, 1);
 
-    // Test HV. Consider putting HV last, there are less modes to save/restore
-    // if worse than UN.
-    if (split_flag)
-    {
-        // Add a split transform node.
-        if (!an->rdm_flag) *t->tt.tn++ = 15;
+    // Set the partitioning mode to HV and update the prediction map.
+    // This needs to be optimized. FIXME.
+    cb->intra_luma_mode[1] = 0;
+    fenc_update_pmap_unsplit_cb(t, cb);
 
-        // Set the partitioning mode to HV and update the prediction map.
-        // This needs to be optimized. FIXME.
-        cb->intra_luma_mode[1] = 0;
-        fenc_update_pmap_unsplit_cb(t, cb);
+    // Add a split transform node.
+    if (!an->rdm_flag) *t->tt.tn++ = 15;
 
-        // Set the analyzed depth range.
-        fenc_get_intra_part_depth_range(t, lg_bs-1, depth_range);
+    // Set the analyzed depth range.
+    fenc_get_intra_part_depth_range(t, lg_bs-1, depth_range);
 
-        // Get the base split cost.
-        int64_t cost = an->rdm_flag ? cba->rdm_se_costs[0] : cba->rdo_se_costs[1];
+    // Get the base split cost.
+    int64_t cost = an->rdm_flag ? cba->rdm_se_costs[0] : cba->rdo_se_costs[1];
 
-        fenc_stash_push(t);
+    fenc_stash_push(t);
 
-        // Analyze luma.
+    // Analyze luma.
 
-        // Fake the luma reconstruction with the source pixels.
-        if (an->rdm_flag && !an->rdm_rec_flag) fenc_analyze_fake_cb_rec(t, cb, 0);
+    // Fake the luma reconstruction with the source pixels.
+    if (an->rdm_flag && !an->rdm_rec_flag) fenc_analyze_fake_cb_rec(t, cb, 0);
 
-        t->stash.flags = luma_stash_flags;
-        for (int i = 0, sbs = 1<<(lg_bs-1); i < 4; i++)
-        {
-            cost += fenc_analyze_intra_part_luma(t, cb, depth_range, i, lg_bs-1, (i&1)*sbs, (i>>1)*sbs);
+    t->stash.flags = luma_stash_flags;
+    for (int i = 0, sbs = 1<<(lg_bs-1); i < 4; i++)
+    {
+        cost += fenc_analyze_intra_part_luma(t, cb, depth_range, i, lg_bs-1, (i&1)*sbs, (i>>1)*sbs);
 
-            // Optionally reconstruct the partition in RDM mode.
-            if (an->rdm_rec_flag)
-            {
-                // Make a static transform tree with the largest intra block.
-                fenc_make_static_tt(&t->tt.tn, depth_range[0], 0);
+        // Optionally reconstruct the partition in RDM mode.
+        if (an->rdm_rec_flag)
+        {
+            // Make a static transform tree with the largest intra block.
+            fenc_make_static_tt(&t->tt.tn, depth_range[0], 0);
 
-                // Reconstruct. FIXME, writing TBs here.
-                t->tt.tn = init_tn;
-                fenc_set_tmp_tb(t);
-                fenc_rec_intra_part_tt(t, 0, lg_bs-1, cb->intra_luma_mode[i],
-                                       cb->cb_off[0] + (i&1)*sbs, cb->cb_off[1] + (i>>1)*sbs, 0);
-                t->tt.tn = init_tn;
-            }
+            // Reconstruct. FIXME, writing TBs here.
+            t->tt.tn = init_tn;
+            fenc_set_tmp_tb(t);
+            fenc_rec_intra_part_tt(t, 0, lg_bs-1, cb->intra_luma_mode[i],
+                                   cb->cb_off[0] + (i&1)*sbs, cb->cb_off[1] + (i>>1)*sbs, 0);
+            t->tt.tn = init_tn;
         }
+    }
 
-        // Analyze chroma.
-        t->stash.flags = chroma_stash_flags;
-        if (!an->rdm_flag) cost += fenc_analyze_intra_cb_chroma(t, cb, init_tn);
+    // Analyze chroma.
+    t->stash.flags = chroma_stash_flags;
+    #ifndef NO_CHROMA_RDO
+    if (!an->rdm_flag) cost += fenc_analyze_intra_cb_chroma(t, cb, init_tn);
+    #else
+    if (!an->rdm_flag) fenc_analyze_intra_cb_chroma(t, cb, init_tn);
+    #endif
 
-        fenc_stash_pop(t);
+    fenc_stash_pop(t);
 
-        // Export the intra data.
-        for (int i = 0; i < 4; i++) cba->intra_hv_modes[i] = cb->intra_luma_mode[i];
-        cba->intra_chroma_modes[1] = cb->intra_chroma_mode;
+    // Export the intra data.
+    for (int i = 0; i < 4; i++) cba->intra_hv_modes[i] = cb->intra_luma_mode[i];
+    cba->intra_chroma_modes[1] = cb->intra_chroma_mode;
 
-        // Update the best intra mode and the best CB mode.
+    // Update the best intra mode and the best CB mode.
+    if (an->rdm_flag)
+    {
+        cba->rdm_mode_costs[2] = cost;
         F265_COPY2_IF_LT(cba->rdm_intra_cost, cost, cba->rdm_intra_mode, 2, int, int);
         F265_COPY2_IF_LT(cba->rdm_best_cost, cost, cba->rdm_best_mode, 2, int, int);
+    }
 
-        if (!an->rdm_flag)
+    if (!an->rdm_flag)
+    {
+        // Update the best intra mode and the best CB mode.
+        if (cost < cba->rdo_best_cost)
         {
-            // Update the best intra mode and the best CB mode.
-            if (cost < cba->rdo_best_cost)
-            {
-                t->stash.flags = save_stash_flags;
-                cba->rdo_best_cost = cost;
-                cba->rdo_best_mode = 2;
-                fenc_stash_save(t);
-            }
-
-            fenc_stash_reset(t);
+            t->stash.flags = save_stash_flags;
+            cba->rdo_best_cost = cost;
+            cba->rdo_best_mode = 2;
+            fenc_stash_save(t);
         }
+
+        fenc_stash_reset(t);
     }
+}
 
-    // Test UN.
-    {
-        t->stash.flags = save_stash_flags;
-        cb->intra_luma_mode[1] = -1;
-        fenc_get_intra_part_depth_range(t, lg_bs, depth_range);
+// Analyze the intra UN mode.
+static void fenc_analyze_intra_un(f265_enc_thread *t, f265_cb *cb)
+{
+    f265_analysis *an = &t->an;
+    f265_cb_analysis *cba = t->cba;
+    int lg_bs = cb->lg_bs;
+    int depth_range[2];
+    int save_stash_flags = F265_STASH_ALL;
+    int luma_stash_flags = F265_STASH_CABAC|F265_STASH_TN|F265_STASH_Y;
+    int chroma_stash_flags = F265_STASH_CABAC|F265_STASH_U|F265_STASH_V;
 
-        // Get the base cost.
-        int64_t cost = an->rdm_flag ? cba->rdm_se_costs[0] : cba->rdo_se_costs[0];
+    // Remember the initial transform tree position.
+    uint8_t *init_tn = t->tt.tn;
 
-        fenc_stash_push(t);
+    // Set the CB mode to intra UN.
+    F265_SET_FLAG(cb->flags, F265_CB_INTRA, 1);
+    t->stash.flags = save_stash_flags;
+    cb->intra_luma_mode[1] = -1;
+    fenc_get_intra_part_depth_range(t, lg_bs, depth_range);
 
-        // Analyze luma.
-        t->stash.flags = luma_stash_flags;
-        cost += fenc_analyze_intra_part_luma(t, cb, depth_range, 0, lg_bs, 0, 0);
+    // Get the base cost.
+    int64_t cost = an->rdm_flag ? cba->rdm_se_costs[0] : cba->rdo_se_costs[0];
 
-        // Analyze chroma.
-        t->stash.flags = chroma_stash_flags;
-        if (!an->rdm_flag) cost += fenc_analyze_intra_cb_chroma(t, cb, init_tn);
+    fenc_stash_push(t);
 
-        fenc_stash_pop(t);
+    // Analyze luma.
+    t->stash.flags = luma_stash_flags;
+    cost += fenc_analyze_intra_part_luma(t, cb, depth_range, 0, lg_bs, 0, 0);
 
-        // Export the intra data.
-        cba->intra_un_mode = cb->intra_luma_mode[0];
-        cba->intra_chroma_modes[0] = cb->intra_chroma_mode;
+    // Analyze chroma.
+    t->stash.flags = chroma_stash_flags;
+    #ifndef NO_CHROMA_RDO
+    if (!an->rdm_flag) cost += fenc_analyze_intra_cb_chroma(t, cb, init_tn);
+    #else
+    if (!an->rdm_flag) fenc_analyze_intra_cb_chroma(t, cb, init_tn);
+    #endif
 
-        // Update the best intra mode and the best CB mode.
+    fenc_stash_pop(t);
+
+    // Export the intra data.
+    cba->intra_un_mode = cb->intra_luma_mode[0];
+    cba->intra_chroma_modes[0] = cb->intra_chroma_mode;
+
+    // Update the best intra mode and the best CB mode.
+    if (an->rdm_flag)
+    {
+        cba->rdm_mode_costs[1] = cost;
         F265_COPY2_IF_LT(cba->rdm_intra_cost, cost, cba->rdm_intra_mode, 1, int, int);
         F265_COPY2_IF_LT(cba->rdm_best_cost, cost, cba->rdm_best_mode, 1, int, int);
+    }
 
-        if (!an->rdm_flag)
+    if (!an->rdm_flag)
+    {
+        // Update the best intra mode and the best CB mode.
+        if (cost < cba->rdo_best_cost)
         {
-            // Update the best intra mode and the best CB mode.
-            if (cost < cba->rdo_best_cost)
-            {
-                cba->rdo_best_cost = cost;
-                cba->rdo_best_mode = 1;
-                t->stash.flags = save_stash_flags;
-                fenc_stash_save(t);
-            }
-
-            fenc_stash_reset(t);
+            cba->rdo_best_cost = cost;
+            cba->rdo_best_mode = 1;
+            t->stash.flags = save_stash_flags;
+            fenc_stash_save(t);
         }
+
+        fenc_stash_reset(t);
     }
 }
 
@@ -2172,13 +2236,13 @@ static void fenc_analyze_un_merge_rdm(f265_enc_thread *t, f265_cb *cb)
 
         // Get the total cost and update the best cost.
         int cost = base_cost + t->an.se_costs[F265_SE_MERGE_IDX+merge_idx] + fenc_me_merge_cand_dist(t, *cand);
+        cba->rdm_mode_costs[12 + merge_idx] = cost;
         F265_COPY2_IF_LT(best_cost, cost, best_idx, merge_idx, int, int);
     }
 
-    // Update the best merge candidate index, the skip flag, the best inter mode
-    // and the best CB mode.
+    // Update the best merge candidate index, the best inter mode and the best
+    // CB mode.
     cba->rdm_un_merge_idx = best_idx;
-    cba->un_skip_flag = 0;
     int cb_mode = 12 + best_idx;
     F265_COPY2_IF_LT(cba->rdm_inter_cost, best_cost, cba->rdm_inter_mode, cb_mode, int, int);
     F265_COPY2_IF_LT(cba->rdm_best_cost, best_cost, cba->rdm_best_mode, cb_mode, int, int);
@@ -2701,6 +2765,7 @@ static void fenc_analyze_inter_part_mode_rdm(f265_enc_thread *t, f265_cb *cb, in
 
     // Update the best inter mode and the best CB mode.
     int cb_mode = 4 + part_mode;
+    cba->rdm_mode_costs[cb_mode] = cost;
     F265_COPY2_IF_LT(cba->rdm_inter_cost, cost, cba->rdm_inter_mode, cb_mode, int, int);
     F265_COPY2_IF_LT(cba->rdm_best_cost, cost, cba->rdm_best_mode, cb_mode, int, int);
 }
@@ -2860,6 +2925,10 @@ static void fenc_analyze_init_cb(f265_enc_thread *t, f265_cb *cb)
     // merge candidates were not obtained yet.
     cba->nb_unique_un_merge_idx = 0;
 
+    // Initialize the best merge candidate data.
+    cba->rdm_un_merge_idx = 0;
+    cba->un_skip_flag = 0;
+
     // Get the RDM cost of a bypass bin. FIXME. This needs to be adjusted by the
     // CB QP.
     cba->rdm_bin_cost = an->se_costs[F265_SE_BYPASS];
@@ -3057,12 +3126,365 @@ static void fenc_analyze_save_cb(f265_enc_thread *t, f265_cb *cb, uint8_t *init_
     }
 }
 
+// Test intra UN in RDM and cache the first three candidates of t->intra_block in the CB analysis.
+static inline void fenc_analyze_intra_un_rdm(f265_enc_thread *t, f265_cb *cb)
+{
+    f265_analysis *an = &t->an;
+    f265_cb_analysis *cba = t->cba;
+    f265_intra_block *ib = &t->intra_block;
+
+    an->rdm_flag = 1; // Force RDM for the duration of the analysis call.
+    fenc_analyze_intra_un(t, cb);
+    for (int i = 0; i < 3; i++) cba->intra_un_cands[i] = ib->cands[i]; // Read back by fenc_analyze_intra_un_rdo().
+    an->rdm_flag = 0; // Cleared unconditionally, not restored to its previous value.
+}
+
+// Test intra UN in RDO. Requires the candidates cached by fenc_analyze_intra_un_rdm() for this CB.
+static inline void fenc_analyze_intra_un_rdo(f265_enc_thread *t, f265_cb *cb, int limit_cand_flag)
+{
+    f265_analysis *an = &t->an;
+    f265_cb_analysis *cba = t->cba;
+    f265_intra_block *ib = &t->intra_block;
+
+    // Restore the three candidates cached by the RDM analysis.
+    for (int i = 0; i < 3; i++) ib->cands[i] = cba->intra_un_cands[i];
+
+    // Use the algorithm fields set below instead of parsing them from the encoder algo bits. FIXME.
+    an->bypass_intra_algo_flag = 1;
+
+    // Disable the RDM pass.
+    ib->rdm_base_algo = 0;
+
+    // Use one RDO pass.
+    ib->cache_neighbour_flags[1] = 0;
+    ib->nb_cands[1] = limit_cand_flag ? 1 : 3; // One candidate if limited, three otherwise.
+    ib->rdo_algo[0] = 3;
+    ib->rdo_save_flag[0] = 1;
+    ib->nb_cands[2] = 1;
+    ib->rdo_algo[1] = 0;
+
+    // Analyze.
+    fenc_analyze_intra_un(t, cb);
+
+    // Let the intra analysis parse its algorithm fields again.
+    an->bypass_intra_algo_flag = 0;
+}
+
+// Test all inter UN modes in RDM: the merge candidates first, then the non-merge UN mode.
+static inline void fenc_analyze_inter_un_rdm(f265_enc_thread *t, f265_cb *cb)
+{
+    fenc_analyze_un_merge_rdm(t, cb);
+    fenc_analyze_inter_part_mode_rdm(t, cb, 0); // Partitioning mode 0 (UN).
+}
+
+// Test the inter H2/V2 modes in RDM (partitioning modes 1 and 2).
+static inline void fenc_analyze_inter_h2_v2_rdm(f265_enc_thread *t, f265_cb *cb)
+{
+    fenc_analyze_inter_part_mode_rdm(t, cb, 1);
+    fenc_analyze_inter_part_mode_rdm(t, cb, 2);
+}
+
+// Test in RDO each of the 5 inter UN merge indices whose RDM cost does not exceed the threshold.
+static inline void fenc_analyze_inter_merge_rdo_threshold(f265_enc_thread *t, f265_cb *cb, int threshold)
+{
+    f265_cb_analysis *cba = t->cba;
+    for (int8_t idx = 0; idx < 5; idx++) // NOTE(review): scans all 5 slots; confirm unwritten costs are initialized high.
+        if (cba->rdm_mode_costs[12+idx] <= threshold) // CB mode is 12 + merge index.
+            fenc_analyze_un_merge_rdo(t, cb, &idx, 1); // Presumably a one-entry index list; confirm against the callee.
+}
+
+// Test in RDO each inter partitioning mode (UN non-merge, H2, V2) whose RDM cost does not exceed the threshold.
+static inline void fenc_analyze_inter_un_h2_v2_rdo_threshold(f265_enc_thread *t, f265_cb *cb, int threshold)
+{
+    f265_cb_analysis *cba = t->cba;
+    for (int part_mode = 0; part_mode < 3; part_mode++)
+        if (cba->rdm_mode_costs[4+part_mode] <= threshold) // CB mode is 4 + part_mode.
+            fenc_analyze_inter_part_mode_rdo(t, cb, part_mode);
+}
+
+// Return true if the current CB may early skip. This is the case if the best
+// RDO UN merge mode skips and the best RDM mode is also a UN merge mode. The
+// RDM test is important to avoid a large quality penalty.
+static int fenc_analyze_can_early_skip(f265_cb_analysis *cba)
+{
+    return (cba->un_skip_flag && cba->rdm_best_mode >= 12); // CB modes >= 12 are the UN merge modes.
+}
+
+// Analyze the CB using RDM costs to decide which modes get tested in RDO. Experimental code.
+static void fenc_analyze_cb_rdo_threshold(f265_enc_thread *t, f265_cb *cb)
+{
+    f265_analysis *an = &t->an;
+    f265_cb_analysis *cba = t->cba;
+    int cb_unsplit_flag = !(cb->flags&F265_CB_FORBIDDEN);
+    int cb_inter_flag = cb_unsplit_flag && t->frame_type != F265_FRAME_I;
+    uint8_t *init_tn = t->tt.tn;
+
+    // Initialize the CB.
+    fenc_analyze_init_cb(t, cb);
+
+    // 8x8 preliminary exploration.
+    if (cb->lg_bs == 3 && an->rdm_flag)
+    {
+        // Testing intra HV and inter H2/V2 improves quality only slightly for a
+        // large performance penalty, so we don't do it here.
+
+        // Test intra UN.
+        fenc_analyze_intra_un(t, cb);
+
+        // Test inter UN.
+        if (cb_inter_flag)
+            fenc_analyze_inter_un_rdm(t, cb);
+
+        // Set the interest flags based on the costs obtained: bit 1 for intra,
+        // bit 4 for inter. A side is kept if its cost is below 1.33x the other.
+        cb->interests = 0;
+        cb->interests |= (cba->rdm_intra_cost < cba->rdm_inter_cost*1.33)<<1;
+        cb->interests |= (cba->rdm_inter_cost < cba->rdm_intra_cost*1.33)<<4;
+    }
+
+    // 8x8 final exploration.
+    else if (cb->lg_bs == 3)
+    {
+        // 8x8 RDM. We do not reuse the modes found in the preliminary
+        // exploration since they are potentially stale.
+
+        // Test inter unless the preliminary exploration ruled it out. For the
+        // H2/V2 modes, the early termination based on the intra UN cost vs
+        // inter UN cost does not seem useful, so we test all modes.
+        if (cb->interests&(1<<4))
+        {
+            fenc_analyze_inter_un_rdm(t, cb);
+            fenc_analyze_inter_h2_v2_rdm(t, cb);
+        }
+
+        // Test intra unless the preliminary exploration ruled it out.
+        if (cb->interests&(1<<1))
+        {
+            // Test intra UN.
+            fenc_analyze_intra_un_rdm(t, cb);
+
+            // Test intra HV unless inter is significantly better than intra UN.
+            if (cba->rdm_intra_cost < cba->rdm_best_cost*1.07)
+            {
+                an->rdm_flag = 1;
+                fenc_analyze_intra_hv(t, cb);
+                an->rdm_flag = 0;
+            }
+        }
+
+        // 8x8 RDO. We use a bigger threshold when the 8x8 CBs seem good.
+        int threshold = an->long_8x8_analysis_flag ? cba->rdm_best_cost*1.33 : cba->rdm_best_cost*1.05;
+
+        // Test inter UN merge.
+        fenc_analyze_inter_merge_rdo_threshold(t, cb, threshold);
+
+        // Test the other modes unless we early skip.
+        if (!fenc_analyze_can_early_skip(cba))
+        {
+            // Test UN/H2/V2 in RDO.
+            fenc_analyze_inter_un_h2_v2_rdo_threshold(t, cb, threshold);
+
+            // Test intra UN with one RDO pass.
+            if (cba->rdm_mode_costs[1] <= threshold)
+                fenc_analyze_intra_un_rdo(t, cb, 0);
+
+            // Test intra HV with one RDO pass.
+            if (cba->rdm_mode_costs[2] <= threshold)
+            {
+                f265_intra_block *ib = &t->intra_block;
+
+                // Use fixed parameters. FIXME.
+                an->bypass_intra_algo_flag = 1;
+
+                ib->cache_neighbour_flags[0] = 1;
+                ib->rdm_base_algo = 5;
+                ib->rdm_angular_algo = 1;
+                ib->rdm_angular_early_term_flag = 1;
+                ib->rdm_cand_select_algo = 1;
+
+                ib->cache_neighbour_flags[1] = 1;
+                ib->nb_cands[1] = 3;
+                ib->rdo_algo[0] = 3;
+                ib->rdo_save_flag[0] = 1;
+                ib->nb_cands[2] = 1;
+                ib->rdo_algo[1] = 0;
+
+                fenc_analyze_intra_hv(t, cb);
+
+                an->bypass_intra_algo_flag = 0;
+            }
+        }
+    }
+
+    // 16x16.
+    else if (cb->lg_bs == 4)
+    {
+        // Probe the 8x8 CBs in RDM.
+        an->rdm_flag = 1;
+        fenc_analyze_split_cb(t, cb);
+        an->rdm_flag = 0;
+
+        // Test 16x16 RDM.
+        if (cb_unsplit_flag)
+        {
+            // Test inter.
+            if (cb_inter_flag)
+            {
+                // Test UN.
+                fenc_analyze_inter_un_rdm(t, cb);
+
+                // Test H2/V2 if inter 8x8 is better than inter 16x16.
+                if (cba->rdm_child_inter_cost < cba->rdm_inter_cost)
+                    fenc_analyze_inter_h2_v2_rdm(t, cb);
+            }
+
+            // Test intra unless inter 8x8 is better than intra 8x8.
+            if (cba->rdm_child_intra_cost < cba->rdm_child_inter_cost)
+                fenc_analyze_intra_un_rdm(t, cb);
+        }
+
+        // Track whether the 8x8 CBs look good.
+        an->long_8x8_analysis_flag = cba->rdm_mode_costs[0] < cba->rdm_best_cost*1.03;
+
+        // 16x16 RDO threshold.
+        int threshold = cba->rdm_best_cost*1.07;
+
+        // Test inter UN merge.
+        fenc_analyze_inter_merge_rdo_threshold(t, cb, threshold);
+
+        // Test the other modes unless we early skip.
+        if (!fenc_analyze_can_early_skip(cba))
+        {
+            // Test UN/H2/V2 in RDO.
+            fenc_analyze_inter_un_h2_v2_rdo_threshold(t, cb, threshold);
+
+            // Test intra. Limit the number of candidates if intra 16x16 is not
+            // the best RDM mode.
+            if (cba->rdm_mode_costs[1] <= threshold)
+                fenc_analyze_intra_un_rdo(t, cb, cba->rdm_intra_mode != cba->rdm_best_mode);
+
+            // Analyze the 8x8 CBs in RDO.
+            if (cba->rdm_mode_costs[0] <= threshold)
+                fenc_analyze_split_cb(t, cb);
+        }
+    }
+
+    // 32x32.
+    else
+    {
+        // RDO threshold multipliers for the best cost. 100 means "unlimited", 0
+        // means "do not test". FIXME, this is a kludge.
+        float inter_mult = 0;
+        float intra_mult = 0;
+
+        // Analyze the 16x16 CBs in RDO.
+        fenc_analyze_split_cb(t, cb);
+
+        // Test 32x32 RDM.
+        if (cb_unsplit_flag)
+        {
+            // Track the modes of the children blocks. FIXME, the nb_intra_*
+            // counters are updated but never read below.
+            int nb_intra_4 = 0;
+            int nb_intra_8 = 0;
+            int nb_intra_16 = 0;
+            int nb_inter_8 = 0;
+            int nb_inter_16 = 0;
+            for (int i = 0; i < 4; i++)
+            {
+                f265_cb *cb_16 = t->cb + cb->child_idx + i;
+
+                if (cb_16->flags&F265_CB_SPLIT)
+                {
+                    for (int j = 0; j < 4; j++)
+                    {
+                        f265_cb *cb_8 = t->cb + cb_16->child_idx + j;
+
+                        if (cb_8->flags&F265_CB_INTRA)
+                        {
+                            if (cb_8->intra_luma_mode[1] == -1) nb_intra_8++;
+                            else nb_intra_4++;
+                        }
+
+                        else nb_inter_8++;
+                    }
+                }
+
+                else if (cb_16->flags&F265_CB_INTRA) nb_intra_16++;
+                else nb_inter_16++;
+            }
+
+            // Inter coverage in 8x8 blocks. 0 is all intra, 16 is all inter.
+            int inter_score = 4*nb_inter_16 + nb_inter_8;
+
+            // All intra. Test intra. Do not test inter.
+            if (inter_score == 0)
+                intra_mult = 100;
+
+            // All inter. Do not test intra. Test inter with a very low
+            // threshold if all children choose inter 16x16.
+            else if (inter_score == 16)
+            {
+                if (nb_inter_16 == 4) inter_mult = 1.01;
+            }
+
+            // Mixed intra/inter modes. Test intra if at least 3/4 of the blocks
+            // are intra. Same for inter.
+            else
+            {
+                if (inter_score <= 4) intra_mult = 100;
+                if (inter_score >= 12) inter_mult = 1.01;
+            }
+
+            // Test inter.
+            if (inter_mult)
+            {
+                // Test UN.
+                fenc_analyze_inter_un_rdm(t, cb);
+
+                // Test H2/V2 if inter 16x16 is better than inter 32x32.
+                if (cba->rdm_child_inter_cost < cba->rdm_inter_cost)
+                    fenc_analyze_inter_h2_v2_rdm(t, cb);
+            }
+
+            // Test intra UN.
+            if (intra_mult)
+                fenc_analyze_intra_un_rdm(t, cb);
+        }
+
+        // Actual RDO thresholds.
+        int inter_threshold = (inter_mult == 100) ? F265_MAX_SAD - 1 : (int)(cba->rdm_best_cost*inter_mult); // NOTE(review): inter_mult is only ever 0 or 1.01 here.
+        int intra_threshold = (intra_mult == 100) ? F265_MAX_SAD - 1 : (int)(cba->rdm_best_cost*intra_mult);
+
+        // Test inter UN merge.
+        fenc_analyze_inter_merge_rdo_threshold(t, cb, inter_threshold);
+
+        // Test the other modes unless we early skip.
+        if (!fenc_analyze_can_early_skip(cba))
+        {
+            // Test UN/H2/V2 in RDO.
+            fenc_analyze_inter_un_h2_v2_rdo_threshold(t, cb, inter_threshold);
+
+            // Test intra.
+            if (cba->rdm_mode_costs[1] <= intra_threshold)
+                fenc_analyze_intra_un_rdo(t, cb, 0);
+        }
+    }
+
+    // Save the CB.
+    fenc_analyze_save_cb(t, cb, init_tn);
+}
+
 // Analyze a coding block.
 static void fenc_analyze_cb(f265_enc_thread *t, f265_cb *cb)
 {
+    // Hijack the control flow. FIXME, kludge.
+    if (t->enc->gd.algo&(1<<11)) { fenc_analyze_cb_rdo_threshold(t, cb); return; }
+
     int cb_split_flag = !(cb->flags&F265_CB_LEAF);
     int cb_unsplit_flag = !(cb->flags&F265_CB_FORBIDDEN);
-    int cb_intra_flag = cb_unsplit_flag;
+    int cb_intra_un_flag = cb_unsplit_flag;
+    int cb_intra_hv_flag = !cb_split_flag;
     int cb_inter_flag = cb_unsplit_flag && t->frame_type != F265_FRAME_I;
     uint8_t *init_tn = t->tt.tn;
 
@@ -3072,7 +3494,8 @@ static void fenc_analyze_cb(f265_enc_thread *t, f265_cb *cb)
     // Test the modes.
     if (cb_split_flag) fenc_analyze_split_cb(t, cb);
     if (cb_inter_flag) { if (t->an.rdm_flag) fenc_analyze_inter_cb_rdm(t, cb); else fenc_analyze_inter_cb_rdo(t, cb); }
-    if (cb_intra_flag) fenc_analyze_intra_cb(t, cb);
+    if (cb_intra_hv_flag) fenc_analyze_intra_hv(t, cb);
+    if (cb_intra_un_flag) fenc_analyze_intra_un(t, cb);
 
     // Save the CB.
     fenc_analyze_save_cb(t, cb, init_tn);
@@ -3089,10 +3512,14 @@ void fenc_analyze_ctb(f265_enc_thread *t)
     fenc_trace_syntax_flag = 0;
     #endif
 
+    #ifdef VAN_TRACE_ANALYSIS
     // FIXME. Used for debug.
-    #if 0
-    printf("\nFrame %d CTB %d (%d,%d) pix (%d,%d).\n",
-           (int)t->src_frame->abs_poc, t->ctb_xy, t->ctb_x, t->ctb_y, t->ctb_off[0], t->ctb_off[1]);
+    fenc_trace_analysis_flag = t->src_frame->abs_poc == 1 && t->ctb_xy >= 0;
+    if (fenc_trace_analysis_flag)
+    {
+        printf("\nFrame %d CTB %d (%d,%d) pix (%d,%d).\n",
+               (int)t->src_frame->abs_poc, t->ctb_xy, t->ctb_x, t->ctb_y, t->ctb_off[0], t->ctb_off[1]);
+    }
     #endif
 
     // Back up the raw CABAC object.
diff --git a/f265/bdi.h b/f265/bdi.h
index 9856598..9d2e4ce 100644
--- a/f265/bdi.h
+++ b/f265/bdi.h
@@ -76,11 +76,13 @@
 #else
 #define noinline __attribute__ ((noinline))
 // GCC randomly fails to inline functions after trivial modifications, with a
-// message like "sorry, unimplemented, function not inlinable". Uncomment when
-// gcc gets repaired or we switch to a better compiler.
-// #define finline __attribute__ ((always_inline))
+// message like "sorry, unimplemented, function not inlinable".
+#ifdef __clang__
+#define finline __attribute__ ((always_inline)) inline
+#else
 #define finline inline
 #endif
+#endif
 
 // Data alignment specifications.
 // NOTE This prefix screws up Doxygen's parser, disable it when producing the
@@ -482,6 +484,7 @@ extern const uint8_t f265_scan_map_data[2*256];
 extern const uint8_t f265_last_coeff_table[32];
 extern const uint8_t f265_coeff_nz_ctx_table[5*16];
 extern const uint8_t f265_gt1_ctx_counter_table[8];
+extern const int8_t f265_nz_to_gt1_table[17];
 extern const uint8_t f265_mpm_bin_table[3];
 extern const uint8_t f265_chroma_mode_table[4];
 extern const int8_t f265_intra_luma_modes[35];
diff --git a/f265/bdi_ro.c b/f265/bdi_ro.c
index 57ed3d3..05972f0 100644
--- a/f265/bdi_ro.c
+++ b/f265/bdi_ro.c
@@ -455,6 +455,10 @@ const uint8_t f265_coeff_nz_ctx_table[5*16] =
 // Coefficient greater-than-1 context counter update table.
 const uint8_t f265_gt1_ctx_counter_table[8] = { 0, 2, 3, 3, 0, 0, 0, 0 };
 
+// Convert the number of non-zero coefficients (0 to 16) to the number of
+// greater-than-1 flags, i.e. min(nb_nz, 8).
+const int8_t f265_nz_to_gt1_table[17] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8 };
+
 // Intra MPM binarization table. The first two bits are the binarization string,
 // the next two bits are the number of bins.
 const uint8_t f265_mpm_bin_table[3] = { 0x4, 0xa, 0xb };
diff --git a/f265/enc.h b/f265/enc.h
index b00584d..12da932 100644
--- a/f265/enc.h
+++ b/f265/enc.h
@@ -924,6 +924,12 @@ typedef struct f265_analysis
     // True if the blocks are reconstructed in RDM.
     int8_t rdm_rec_flag;
 
+    // True to bypass the intra algo configuration.
+    int8_t bypass_intra_algo_flag;
+
+    // True if the 8x8 CBs look interesting.
+    int8_t long_8x8_analysis_flag;
+
     // We may override the encoder parameters with local parameters during
     // analysis to speed up the analysis.
     uint8_t tb_range[2];
diff --git a/f265/entropy.c b/f265/entropy.c
index ed06549..bd1d5ed 100644
--- a/f265/entropy.c
+++ b/f265/entropy.c
@@ -401,9 +401,9 @@ static finline int fenc_write_rqt_root_cbf(f265_enc_thread *t, f265_cb *cb)
     return flag;
 }
 
-// Write the coefficients in a non-empty transform block.
-// FIXME: check if forced inlining of chroma_flag is beneficial.
-static finline void fenc_write_tb(f265_cabac_bs *cbs, f265_tt_enc *tt, int chroma_flag)
+// Write the coefficients in a non-empty transform block. The fenc_write_tb()
+// frontend controls how this function is inlined, which is slightly faster with clang.
+static finline void fenc_write_tb_base(f265_cabac_bs *cbs, f265_tt_enc *tt, int chroma_flag)
 {
     f265_tb_enc *tb = tt->tb;
     f265_sb_enc *sb = tt->sb;
@@ -507,13 +507,12 @@ static finline void fenc_write_tb(f265_cabac_bs *cbs, f265_tt_enc *tt, int chrom
         // Compute the values of the subblock neighbour flags for the context
         // derivation of the non-zero flags.
         int neighbour_flags[2] = { (sb_neighbour_flags[0]>>sb_pos)&1, (sb_neighbour_flags[1]>>sb_pos)&1 };
-        int sb_neighbour_idx = neighbour_flags[0]|neighbour_flags[1];
-        int coeff_neighbour_idx = (neighbour_flags[0]|(neighbour_flags[1]<<1))<<4;
 
         // Encode the subblock non-zero flag unless it is inferred to be 1.
         int sb_enc_flag = (sb_nz_flags>>sb_idx)&1;
         if (middle_sb_flag)
         {
+            int sb_neighbour_idx = neighbour_flags[0]|neighbour_flags[1];
             fenc_trace_syntax_element("coded_sub_block_flag");
             fenc_encode_context_bin(cbs, F265_CO_CODED_SUB_BLOCK + 2*chroma_flag + sb_neighbour_idx, sb_enc_flag);
         }
@@ -521,19 +520,11 @@ static finline void fenc_write_tb(f265_cabac_bs *cbs, f265_tt_enc *tt, int chrom
         // Skip empty subblocks.
         if (!sb_enc_flag) continue;
 
-        // Expand the subblock data and pass to the next.
-        int nz_flags = sb->nz_flags;
-        int signs = sb->signs;
-        int remain_flags = sb->remain_flags;
-        int gt1_flags = sb->gt1_flags;
-        int gt2_flag = sb->packed_data>>5;
-        int nb_nz = sb->packed_data&31;
-        int nb_gt1 = F265_MIN(nb_nz, 8);
-        sb++;
-
         // Encode the coefficient non-zero flags. The non-zero flag of the first
         // coefficient is present unless the subblock non-zero flag is present
         // and the other coefficients are zero.
+        int coeff_neighbour_idx = (neighbour_flags[0]|(neighbour_flags[1]<<1))<<4;
+        int nz_flags = sb->nz_flags;
         int skip_first_coeff_nz_flag = middle_sb_flag & (nz_flags == (1<<15));
         for (int i = last_coeff_nz_idx; i < 16-skip_first_coeff_nz_flag; i++)
         {
@@ -553,7 +544,11 @@ static finline void fenc_write_tb(f265_cabac_bs *cbs, f265_tt_enc *tt, int chrom
         // Encode the greater-than-1 flags. FIXME: also count the number of
         // coefficients inferred using the greater-than-1 flags. Use conditional
         // compilation to use popcnt below instead when available.
+        int nb_nz = sb->packed_data&31;
+        int nb_gt1 = f265_nz_to_gt1_table[nb_nz];
         int nb_gt1_inferred = 0;
+        int gt1_flags = sb->gt1_flags;
+        int gt2_flag = sb->packed_data>>5;
         for (int i = 0; i < nb_gt1; i++)
         {
             int gt1_flag = (gt1_flags>>i)&1;
@@ -573,11 +568,13 @@ static finline void fenc_write_tb(f265_cabac_bs *cbs, f265_tt_enc *tt, int chrom
         }
 
         // Encode the signs.
+        int signs = sb->signs;
         int sign_hidden_flag = (sign_hiding_flags>>sb_pos)&1;
         if (nb_nz - sign_hidden_flag) fenc_trace_syntax_element("coeff_sign_flag");
         fenc_encode_bypass_bins(cbs, signs >> sign_hidden_flag, nb_nz - sign_hidden_flag);
 
         // Encode the coefficient remaining levels.
+        int remain_flags = sb->remain_flags;
         if (remain_flags)
         {
             // The current base level decreases as we consume the last
@@ -652,6 +649,9 @@ static finline void fenc_write_tb(f265_cabac_bs *cbs, f265_tt_enc *tt, int chrom
             // Pass to the next subblock levels.
             levels += 16;
         }
+
+        // Pass to the next subblock.
+        sb++;
     }
 
     tt->tb++;
@@ -659,6 +659,12 @@ static finline void fenc_write_tb(f265_cabac_bs *cbs, f265_tt_enc *tt, int chrom
     tt->levels = levels;
 }
 
+static noinline void fenc_write_tb(f265_cabac_bs *cbs, f265_tt_enc *tt, int chroma_flag)
+{   // Out-of-line frontend for the finline fenc_write_tb_base(); each call below passes a constant chroma_flag.
+    if (chroma_flag) fenc_write_tb_base(cbs, tt, 1); // chroma_flag set: the base function gets a literal 1.
+    else fenc_write_tb_base(cbs, tt, 0);             // chroma_flag clear: the base function gets a literal 0.
+}
+
 // Encode the current TT. Used in RDOQ to update the contexts.
 static void fenc_encode_rdoq_sub_part(f265_enc_thread *t, int yuv_flag, int comp, int lg_bs, int depth)
 {
@@ -751,6 +757,7 @@ static void fenc_write_tt(f265_enc_thread *t, f265_cb *cb, int cbf_mask, int max
     for (int c = 1; c < 3; c++) if (chroma_cbf&(1<<c)) fenc_write_tb(&t->cbs, &t->tt, 1);
 }
 
+
 ///////////////////////////////////////////////////////////////////////////////
 // Non-RDO functions.
 
diff --git a/f265/parse.c b/f265/parse.c
index b4e3231..96778d1 100644
--- a/f265/parse.c
+++ b/f265/parse.c
@@ -267,10 +267,24 @@ static void handle_param_quality(f265_parse_ctx *ctx, f265_enc_params *p, f265_p
         p->tb_range[1] = 4;
         p->tb_depth[0] = 1;
         p->tb_depth[1] = 1;
-        p->rdo_level = 0;
+        p->rdo_level = 1;
+        p->amp_flag = 0;
+        p->tmv_flag = 1;
+        p->algo = (1<<0)|(1<<2)|(1<<4)|(1<<5)|(1<<7)|(1<<10)|(1<<11)|(1<<14);
+    }
+
+    else if (quality <= 15)
+    {
+        p->cb_range[0] = 3;
+        p->cb_range[1] = 5;
+        p->tb_range[0] = 2;
+        p->tb_range[1] = 5;
+        p->tb_depth[0] = 1;
+        p->tb_depth[1] = 1;
+        p->rdo_level = 1;
         p->amp_flag = 0;
         p->tmv_flag = 1;
-        p->algo = (1<<0)|(1<<2)|(1<<4)|(1<<10)|(1<<12);
+        p->algo = (1<<0)|(1<<2)|(1<<4)|(1<<5)|(1<<7)|(1<<10)|(1<<11)|(1<<14);
     }
 
     else if (quality <= 25)

[f265 dev team] Algorithmic patch 2

Reply via email to