diff --git a/analyze.c.orig.c b/analyze.c
index 7b62a2a..cd64c57 100644
--- a/analyze.c.orig.c
+++ b/analyze.c
@@ -103,6 +103,13 @@ static void fenc_analyze_cb(f265_enc_thread *t, f265_cb *cb);
 // Number of bytes to copy in a stash frame.
 #define F265_STASH_FRAME_SIZE (8*4)
 
+// REVIEW. After reading the code multiple times, I am not a fan of this syntax.
+//         Having to interpret the save_flag when reading the code removes a
+//         layer of clarity. I would prefer a function or a macro that calls
+//         this function, adding the save_flag so that when you read the code,
+//         you immediately know if you are saving to the stash, or loading from
+//         it. The macro approach might be best here.
+
 // Save or load the CABAC contexts from the stash. We might want to refine this
 // eventually, e.g. we'll never need the SAO contexts.
 static inline void fenc_stash_copy_cabac(f265_enc_thread *t, uint8_t **wp, int save_flag)
@@ -143,11 +150,17 @@ static inline void fenc_stash_copy_rec(f265_enc_thread *t, uint8_t **wp, int sav
     {
         if (!(flags&(1<<comp))) continue;
 
+        // REVIEW. Are we always going to assume 4:2:0 subsampling? We should
+        //         probably add a FIXME tag here for further modifications.
         int csf = !!comp;
         int px = s->px>>csf;
         int py = s->py>>csf;
         int width = s->width>>csf;
         int height = s->height>>csf;
+
+        // REVIEW. This value is constant. Consider declaring it outside the
+        //         loop, or directly after comp in the loop declaration.
+        //         for (int comp = 0, ref_stride = t->me.ref_stride;
         int ref_stride = t->me.ref_stride;
         f265_pix *rec_plane = t->src_frame->rec_planes[comp ? 3+comp : 0] + py*ref_stride + px;
         f265_pix *stash_rec = (f265_pix*)*wp;
@@ -222,6 +235,12 @@ static void fenc_stash_init_cb(f265_enc_thread *t, f265_cb *cb)
     fenc_stash_init_cb_off(t, cb, 0, 0, 1<<cb->lg_bs, 1<<cb->lg_bs);
 }
 
+// REVIEW. Constants related to the stash flags need to be defined. Reading
+//         stash_flags = 16|8; is not insightful. Instead, we should have
+//         something like stash_flags = F265_STASH_CABAC|F265_STASH_TT;.
+//         Please change this in the f265 analysis. I don't expect you to apply
+//         the changes to the HM emulation analysis code.
+
 // Rollback to the initial state.
 static void fenc_stash_reset(f265_enc_thread *t)
 {
@@ -276,8 +295,13 @@ static void fenc_stash_save_reset(f265_enc_thread *t)
 // fake the reconstruction.
 static void fenc_analyze_fake_cb_rec(f265_enc_thread *t, f265_cb *cb, int comp)
 {
+    // REVIEW. Are we always going to assume 4:2:0 subsampling? Might want to
+    //         add a FIXME tag for future modifications.
     int bs = 1<<(cb->lg_bs-!!comp);
     int plane_off = fenc_get_cb_block_plane_off(t, cb, comp, 0, 0);
+
+    // REVIEW. For compactness, simply use t->me.ref_stride in the function
+    //         call below.
     int ref_stride = t->me.ref_stride;
     f265_pix *src = t->src_frame->src_planes[comp] + plane_off;
     f265_pix *rec = t->src_frame->rec_planes[comp ? 3+comp : 0] + plane_off;
@@ -303,6 +327,9 @@ static int64_t fenc_rdo_bit_cost(f265_enc_thread *t, int rdo_bits)
 {
     // 64-bit arithmetic to avoid overflows before shift.
     int64_t rdo_lambda = t->an.rdo_lambda;
+
+    // REVIEW. Lambda is in 256 fractions. Bits are in 32768 fractions. Why keep
+    //         an extra 5 bits of precision? Please document.
     return (rdo_bits*rdo_lambda)>>(15+8-5);
 }
 
@@ -475,6 +502,8 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
 {
     f265_intra_block *ib = &t->intra_block;
     f265_pix pred[32*32];
+    // REVIEW. Are we always going to assume 4:2:0 subsampling? Add a FIXME tag
+    //         for later modifications.
     int csf = !!comp;
     int ct_ox = (cb->cb_off[0]>>csf) + cb_ox;
     int ct_oy = (cb->cb_off[1]>>csf) + cb_oy;
@@ -540,6 +569,7 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
         int dst_flag, order;
         fenc_get_intra_encode_flags(&dst_flag, &order, comp, lg_bs, mode);
         fenc_set_tmp_tb(t);
+        // REVIEW. Replace 1<<lg_bs by bs. It's a local variable.
         *nz_flag = fenc_rec_tb(t, pred, 1<<lg_bs, comp, lg_bs, dst_flag, order, 0, ct_ox, ct_oy, 0, 1, 0);
 
         // Compute the encoding cost.
@@ -558,6 +588,8 @@ static int64_t fenc_analyze_intra_tb(f265_enc_thread *t, f265_cb *cb, int *nz_fl
     return cost;
 }
 
+// REVIEW. CONTINUE FROM HERE. EVERYTHING ABOVE HAS BEEN REVIEWED.
+
 // Return the cost of the intra chroma transform tree. Set the two chroma
 // non-zero flags.
 static int64_t fenc_analyze_intra_chroma_tt(f265_enc_thread *t, f265_cb *cb, int *chroma_nz, int mode, int depth,
@@ -566,6 +598,7 @@ static int64_t fenc_analyze_intra_chroma_tt(f265_enc_thread *t, f265_cb *cb, int
     int64_t cost;
     int tt_nz = 0;
 
+    // REVIEW. ((*t->tt.tn++)>>3)&1 also gets the job done (in one less op).
     // Get the split flag. Skip the 4x4 subblocks if present.
     int split_flag = !!((*t->tt.tn++)&8);
     if (split_flag && lg_bs == 2)
@@ -719,6 +752,8 @@ static int64_t fenc_analyze_intra_cb_chroma(f265_enc_thread *t, f265_cb *cb, uin
     return best_cost;
 }
 
+// REVIEW. Everything above this has been reviewed.
+
 // Return the cost of the intra luma transform tree. This function currently
 // stores the transform nodes unconditionally.
 static int64_t fenc_analyze_intra_luma_tt(f265_enc_thread *t, f265_cb *cb, int mode, int depth_range[2], int depth,
@@ -1104,6 +1139,9 @@ static int64_t fenc_analyze_inter_sub_tt(f265_enc_thread *t, f265_cb *cb, int *y
     return best_cost;
 }
 
+// REVIEW. Please elaborate the strategy of signalling a max cost. We use this
+//         to favour skip over merge, and cbf_root=0. The idea is to select the
+//         cheapest, most explicit signalling.
 // Return the cost of the inter transform tree with a residual (including
 // root_cbf if needed). The cost is maximal if it turns out there is no
 // residual.
@@ -1194,6 +1232,8 @@ static int64_t fenc_analyze_inter_tt(f265_enc_thread *t, f265_cb *cb)
 
         if (stash_flag)
         {
+            // REVIEW. Please add a quick comment to say that we avoid complex
+            //         signalling when all we need is to signal no residual.
             if (residual_cost != F265_MAX_SSD) fenc_stash_save_reset(t);
             else fenc_stash_reset(t);
         }
@@ -1270,6 +1310,13 @@ static void fenc_analyze_ctb_tt(f265_enc_thread *t, f265_cb *cb)
 
     int lg_bs = cb->lg_bs;
 
+    // REVIEW. This function is only called from fenc_analyze_ctb. The ambiguity
+    //         flag could be computed there and simply passed along as a
+    //         parameter. From the looks of things, this value will always be
+    //         the same. In fact, you might want to consider two separate
+    //         functions: one for the ambiguous case and one for the unambiguous
+    //         case. This would lead to 2 smaller recursive functions without
+    //         branching and very little redundant code: the 2 conditions above.
     // Determine if ambiguity is possible in the whole CTB.
     int ambiguity_flag = t->an.tb_depth[0] || t->an.tb_depth[1];
 
@@ -1464,6 +1511,10 @@ static void fenc_analyze_intra_luma_select_rdm_cands(f265_enc_thread *t, f265_cb
         {
             int cost = ib->rdm_mode_costs[mode];
 
+            // REVIEW. Please clean up. If the branchy version is truly better,
+            //         remove the MIN_MAX case. If there is little difference,
+            //         I would opt for the MIN_MAX approach to stay consistent
+            //         with the code block above.
             // Branchy version.
             #if 1
             if (cost < best_costs[0])
@@ -1582,10 +1633,14 @@ static void fenc_analyze_intra_luma_angular(f265_enc_thread *t, f265_cb *cb, int
             // Get the refinement direction.
             int dir = (best_mode == init_mode+1) ? 1 : -1;
 
+            // REVIEW. These comment lines exceed 80 characters.
             // Get the maximum number of modes to test beyond the first neighbour.
             int max_modes = (nb_init == 11) ? 2 : 6;
 
             // Refine in the direction. Bail out early if the cost stops decreasing.
+
+            // REVIEW. Guard against cur_mode < 2 or cur_mode > 34? In practice,
+            //         this may never happen.
             for (int cur_mode = init_mode + 2*dir, i = 0; i < max_modes; i++, cur_mode += dir)
             {
                 fenc_analyze_intra_luma_angular_mode_rdm(t, cb, &best_cost, &best_mode, cur_mode);
@@ -1603,6 +1658,7 @@ static void fenc_analyze_intra_luma_angular(f265_enc_thread *t, f265_cb *cb, int
         // Refine left.
         if (init_mode > 2)
         {
+            // REVIEW. Guard against cur_mode < 2?
             for (int i = 0, cur_mode = init_mode - 1; i < max_modes; i++, cur_mode--)
             {
                 fenc_analyze_intra_luma_angular_mode_rdm(t, cb, &best_cost, &best_mode, cur_mode);
@@ -1613,6 +1669,7 @@ static void fenc_analyze_intra_luma_angular(f265_enc_thread *t, f265_cb *cb, int
         // Refine right.
         if (init_mode < 34)
         {
+            // REVIEW. Guard against cur_mode > 34?
             for (int i = 0, cur_mode = init_mode + 1; i < max_modes; i++, cur_mode++)
             {
                 fenc_analyze_intra_luma_angular_mode_rdm(t, cb, &best_cost, &best_mode, cur_mode);
@@ -1627,11 +1684,13 @@ static void fenc_analyze_intra_luma_angular(f265_enc_thread *t, f265_cb *cb, int
         // Number of modes on each side of the best position.
         int nb_side = (nb_init == 11) ? 3 : 7;
 
+        // REVIEW. Guard against init_mode - i - 1 < 2?
         // Refine left.
         if (init_mode > 2)
             for (int i = 0; i < nb_side; i++)
                 fenc_analyze_intra_luma_angular_mode_rdm(t, cb, &best_cost, &best_mode, init_mode - i - 1);
 
+        // REVIEW. Guard against init_mode + i + 1 > 34?
         // Refine right.
         if (init_mode < 34)
             for (int i = 0; i < nb_side; i++)
@@ -1641,6 +1700,16 @@ static void fenc_analyze_intra_luma_angular(f265_enc_thread *t, f265_cb *cb, int
     ib->rdm_angular_mode = best_mode;
 }
 
+// REVIEW. I'm not convinced of the gain we get from separating the case we
+//         only want to keep 1 mode from the alternative case of keeping 3. We
+//         could always run the second part of the loop, and simply change the
+//         loop that transfers the best candidates to the intra block from
+//         for (int i = 0; i < 3; i++) ib->cands[i] = best_modes[i];
+//         to
+//         for (int i = 0; i < nb_kept; i++) ib->cands[i] = best_modes[i];
+//         To be on the safe side, we could run F265_MIN(3, nb_kept).
+//         Unless the code really improves performance, I would prefer less code
+//         with the extra conditions.
 // Analyze the specified modes in RDO. Update the candidate cache with the best
 // modes found. Return the cost of the best mode.
 static int64_t fenc_analyze_intra_luma_rdo(f265_enc_thread *t, f265_cb *cb, int8_t *cands, int nb_cands,
@@ -1755,6 +1824,10 @@ static int64_t fenc_analyze_intra_part_luma(f265_enc_thread *t, f265_cb *cb, int
     {
         int algo = t->enc->gd.algo, tmp;
 
+        // REVIEW. I don't understand the need for this initialization block.
+        //         rdm_base_algo is sure to be overwritten. rdm_angular_algo
+        //         will also be overwritten. Please remove any unnecessary
+        //         assignments to zero. It will make the function more compact.
         // Initialize.
         ib->rdm_base_algo = 0;
         ib->rdm_angular_algo = 0;
@@ -1778,6 +1851,7 @@ static int64_t fenc_analyze_intra_part_luma(f265_enc_thread *t, f265_cb *cb, int
         // Early termination.
         ib->rdm_angular_early_term_flag = F265_GET_FLAG(algo, (1<<4));
 
+        // REVIEW. s/algorihtm/algorithm/.
         // RDM candidate selection algorihtm.
         ib->rdm_cand_select_algo = F265_GET_FLAG(algo, (1<<5));
 
@@ -1940,6 +2014,7 @@ static int64_t fenc_analyze_intra_part_luma(f265_enc_thread *t, f265_cb *cb, int
     return best_cost;
 }
 
+// REVIEW. Agreed. This function needs to be split.
 // Return the cost of encoding the CB with intra prediction.
 // LB FIXME, split that function.
 static void fenc_analyze_intra_cb(f265_enc_thread *t, f265_cb *cb)
@@ -1949,6 +2024,8 @@ static void fenc_analyze_intra_cb(f265_enc_thread *t, f265_cb *cb)
     int lg_bs = cb->lg_bs;
     int split_flag = lg_bs == t->enc->gd.cb_range[0];
     int depth_range[2];
+
+    // REVIEW. See note above about naming the stash flags.
     int save_stash_flags = 16|8|4|2|1, luma_stash_flags = 16|8|1, chroma_stash_flags = 16|4|2;
 
     // No cache right now.
@@ -1964,6 +2041,9 @@ static void fenc_analyze_intra_cb(f265_enc_thread *t, f265_cb *cb)
     // if worse than UN.
     if (split_flag)
     {
+        // REVIEW. Consider caching lg_bs-1 here. It's used multiple times
+        //         below.
+
         // Add a split transform node.
         if (!an->rdm_flag) *t->tt.tn++ = 15;
 
@@ -2005,6 +2085,7 @@ static void fenc_analyze_intra_cb(f265_enc_thread *t, f265_cb *cb)
             }
         }
 
+        // REVIEW. Please use an->rdm_flag for consistency.
         // Analyze chroma.
         t->stash.flags = chroma_stash_flags;
         if (!t->an.rdm_flag) cost += fenc_analyze_intra_cb_chroma(t, cb, init_tn);
@@ -2049,6 +2130,7 @@ static void fenc_analyze_intra_cb(f265_enc_thread *t, f265_cb *cb)
         t->stash.flags = luma_stash_flags;
         cost += fenc_analyze_intra_part_luma(t, cb, depth_range, 0, lg_bs, 0, 0);
 
+        // REVIEW. Please use an->rdm_flag for consistency.
         // Analyze chroma.
         t->stash.flags = chroma_stash_flags;
         if (!t->an.rdm_flag) cost += fenc_analyze_intra_cb_chroma(t, cb, init_tn);
@@ -2108,6 +2190,8 @@ static int fenc_analyze_get_merge_cands(f265_enc_thread *t, f265_cb *cb, int par
 static finline void fenc_analyze_get_un_merge_cands(f265_enc_thread *t, f265_cb *cb)
 {
     f265_cb_analysis *cba = t->cba;
+
+    // REVIEW. Why not simply cache t->inter_block.neighbours? We use it twice?
     f265_inter_block *ib = &t->inter_block;
 
     // Get the neighbour MVs.
@@ -2177,6 +2261,12 @@ static void fenc_analyze_un_merge_rdo(f265_enc_thread *t, f265_cb *cb, int8_t *i
 {
     f265_cb_analysis *cba = t->cba;
 
+    // REVIEW. These two exact statements appear below in
+    //         fenc_analyze_un_merge_rdo_all. I checked and this function is
+    //         only called from there. Calling fenc_analyze_get_un_merge_cands
+    //         does not change the flags, nor the inter partition type. You can
+    //         safely remove these statements. Please move the comment above the
+    //         statements in fenc_analyze_un_merge_rdo_all.
     // Set the partitioning mode to UN.
     F265_SET_FLAG(cb->flags, F265_CB_SPLIT|F265_CB_INTRA|F265_CB_SKIP, 0);
     cb->inter_part = F265_PART_UN;
@@ -2214,6 +2304,8 @@ static void fenc_analyze_un_merge_rdo(f265_enc_thread *t, f265_cb *cb, int8_t *i
             int64_t tt_cost = fenc_analyze_inter_tt_residual(t, cb, 0);
             fenc_stash_pop(t);
 
+            // REVIEW. Please refine the comment. Merge + no residual = skip.
+            //         Favour skip in such cases.
             // There is a residual.
             if (tt_cost != F265_MAX_SSD)
             {
@@ -2235,6 +2327,15 @@ static void fenc_analyze_un_merge_rdo(f265_enc_thread *t, f265_cb *cb, int8_t *i
             int64_t cost = skip_base_cost + merge_idx_cost + fenc_analyze_inter_tt_no_residual(t, cb);
             if (cost < cba->rdo_best_cost)
             {
+                // REVIEW. It took me a while to understand the "update".
+                //         Reading through the code, I was led to believe that
+                //         merge_idx_ctx and contexts[F265_CO_MERGE_IDX] were
+                //         identical at this point. It isn't the case because
+                //         calling fenc_stash_reset reverts the CABAC contexts
+                //         to their state prior to signalling the merge_idx.
+                //         The comment above merge_idx_ctx's declaration as well
+                //         as the one atop this line need to be refined to help
+                //         the reader catch on quickly.
                 // Update the merge index context.
                 t->cbs.contexts[F265_CO_MERGE_IDX] = merge_idx_ctx;
 
@@ -2663,6 +2764,7 @@ static void fenc_analyze_inter_part_mode_rdm(f265_enc_thread *t, f265_cb *cb, in
 {
     f265_cb_analysis *cba = t->cba;
 
+    // REVIEW. Add a FIXME tag to the optimization comment.
     // Set the partitioning mode and update the prediction map. This needs to be
     // optimized.
     F265_SET_FLAG(cb->flags, F265_CB_SPLIT|F265_CB_INTRA|F265_CB_SKIP, 0);
@@ -2740,6 +2842,10 @@ static void fenc_analyze_inter_cb_rdo(f265_enc_thread *t, f265_cb *cb)
 static void fenc_analyze_split_cb(f265_enc_thread *t, f265_cb *cb)
 {
     f265_analysis *an = &t->an;
+
+    // REVIEW. Can you move this closer to the children loop to put emphasis on
+    //         this being a bookmark rather than a local variable to write more
+    //         compact expressions (like the "an" variable above)?
     f265_cb_analysis *cba = t->cba;
 
     // Update the prediction map. For intra, this is unnecessary. For inter, the
@@ -2880,6 +2986,11 @@ static void fenc_analyze_init_cb(f265_enc_thread *t, f265_cb *cb)
             pred_costs[1] = fenc_context_bin_cost(t, F265_CO_PRED_MODE, 1);
             int64_t merge_cost = fenc_context_bin_cost(t, F265_CO_MERGE_FLAG, 1);
 
+            // REVIEW. Adding details about these costs will make the code
+            //         easier to understand. No need to add many details.
+            //         Something like this would suffice:
+            //         split_cu_flag=0, cu_skip_flag=1.
+
             cba->rdo_se_costs[2] = split_cost + skip_costs[1];
             cba->rdo_se_costs[3] = split_cost + skip_costs[0] + pred_costs[0] + part_costs[1] + merge_cost;
             cba->rdo_se_costs[4] = split_cost + skip_costs[0] + pred_costs[0];
@@ -2947,6 +3058,11 @@ static void fenc_analyze_save_cb(f265_enc_thread *t, f265_cb *cb, uint8_t *init_
     f265_analysis *an = &t->an;
     f265_cb_analysis *cba = t->cba;
 
+    // REVIEW. For code clarity, could you define a local best_mode variable and
+    //         avoid the ternary operator as conditional statements?
+    //         Assuming a small type (see REVIEW note below), all 4 ternary
+    //         operators could be replaced by it.
+
     // Restore the children.
     if (an->rdm_flag ? cba->rdm_best_mode == 0 : cba->rdo_best_mode == 0)
     {
@@ -2966,6 +3082,11 @@ static void fenc_analyze_save_cb(f265_enc_thread *t, f265_cb *cb, uint8_t *init_
         // Restore inter.
         if (an->rdm_flag ? cba->rdm_best_mode >= 4 : cba->rdo_best_mode >= 4)
         {
+            // REVIEW. I don't follow why rdm_inter_mode is being used here
+            //         rather than rdm_best_mode. The call to
+            //         fenc_analyze_import_inter_mode uses rdm_best_mode or
+            //         rdo_best_mode.
+
             fenc_analyze_import_inter_mode(cb, cba, an->rdm_flag ? cba->rdm_inter_mode : cba->rdo_best_mode);
 
             // In RDO, reconstruct the CB with the best inter mode.
@@ -2975,6 +3096,8 @@ static void fenc_analyze_save_cb(f265_enc_thread *t, f265_cb *cb, uint8_t *init_
                 t->stash.flags = 16|8;
                 fenc_stash_restore(t);
 
+                // REVIEW. These comment lines exceed 80 characters.
+
                 // FIXME WARNING: this code is setting the YUV flags in the transform
                 // nodes to their actual non-zero values. This is not what we want for
                 // the final RDOQ. Also, we're writing the coefficients uselessly.
@@ -3045,6 +3168,20 @@ static void fenc_analyze_cb(f265_enc_thread *t, f265_cb *cb)
 
     // Test the modes.
     if (cb_split_flag) fenc_analyze_split_cb(t, cb);
+
+    // REVIEW. Even if this fits in 120 characters, it is not fun to read.
+    //         Can you please change it to
+    //         if (cond)
+    //         {
+    //             if (cond) statement;
+    //             else statement;
+    //         }
+    //
+    //         Alternatively, you may want to simply add an
+    //         fenc_analyze_inter_cb function and hide the conditional call.
+    //         You can make it an inline function. I would prefer this as it
+    //         would match the call to fenc_analyze_intra_cb.
+
     if (cb_inter_flag) { if (t->an.rdm_flag) fenc_analyze_inter_cb_rdm(t, cb); else fenc_analyze_inter_cb_rdo(t, cb); }
     if (cb_intra_flag) fenc_analyze_intra_cb(t, cb);
 
@@ -3057,6 +3194,9 @@ void fenc_analyze_ctb(f265_enc_thread *t)
 {
     f265_gen_data *gd = &t->enc->gd;
     f265_analysis *an = &t->an;
+
+    // REVIEW. For compactness, please use gd->algo. gd is fetched above.
+    // REVIEW. For readability, please define (1<<12) in bdi.h.
     an->rdm_rec_flag = F265_GET_FLAG(t->enc->gd.algo, (1<<12));
 
     #ifdef VAN_TRACE_SYNTAX
@@ -3093,6 +3233,12 @@ void fenc_analyze_ctb(f265_enc_thread *t)
     // Initialize the transform tree.
     fenc_init_transform_tree(t);
 
+    // REVIEW. I understand that the following assignments are an infinitely
+    //         small part of the analysis, but we should simply copy them to the
+    //         analysis structure during the encoding thread's initialization.
+    //         That is unless we plan to play with the analysis depths to speed
+    //         up the analysis. 
+
     // Set the analyzed TB range to the actual parameter values.
     // FIXME, precompute tb_depth.
     // - intra[lg_bs-2][2];
@@ -3107,11 +3253,16 @@ void fenc_analyze_ctb(f265_enc_thread *t)
     t->cba = (f265_cb_analysis*)t->store;
     t->store += sizeof(f265_cb_analysis);
     fenc_analyze_cb(t, t->cb);
+
+    // REVIEW. t->store = t->cba.
     t->store -= sizeof(f265_cb_analysis);
 
     // Obtain valid transform trees for the chosen CTB layout.
     if (an->rdm_flag)
     {
+        // REVIEW. For consistency, please use an->rdm_flag instead of
+        //         t->an.rdm_flag.
+
         // Switch to RDO.
         t->an.rdm_flag = 0;