%%% Please search for %%%.

diff --git a/f265/analyze.c b/f265/analyze.c
index 84fa1f5..79e592b 100644
--- a/f265/analyze.c
+++ b/f265/analyze.c
@@ -32,6 +32,7 @@
 //
 //   Bit 8 enables the second RDO pass.
 //
+// Bit 9 enables fake luma bi-prediction.
 // Bit 10 enables fake luma interpolation.
 // Bit 11 enables the use of thresholds for the mode analysis.
 // Bit 12 forces the reconstruction of blocks in RDM.
@@ -2682,7 +2683,20 @@ static int fenc_analyze_inter_part(f265_enc_thread *t, f265_cb *cb, int part_idx
             }
 
             // Compute the cost and update the partition cost.
-            int bi_cost = cba->rdm_bin_cost + fenc_me_mv_total_cost_bi(ib->bi_mv, me);
%%% Why not simply initialize bi_cost to cba->rdm_bin_cost?
%%% It would avoid the accumulate operation in the if block.
%%% You would then need to accumulate fenc_me_mv_total_cost_bi() in the else block.
+            int bi_cost = 0;
+
+            // Fake the interpolation. This changes the motion estimation
+            // context.
+            if (t->enc->gd.algo&(1<<9))
+            {
+                bi_cost += cba->rdm_bin_cost;
+                for (int i = 0; i < 2; i++) bi_cost += fenc_me_mv_cost_test(me, ib->bi_mv[i], i);
+                bi_cost += fenc_get_fake_luma_block_dist_b(t, ib->bi_ref_idx, ib->bi_mv);
+            }
+
+            // Do the real interpolation.
+            else bi_cost = cba->rdm_bin_cost + fenc_me_mv_total_cost_bi(ib->bi_mv, me);
+
             F265_COPY2_IF_LT(part_cost, bi_cost, pred_type, 1, int, int);
         }
     }
diff --git a/f265/enc.h b/f265/enc.h
index bca2e55..afbca27 100644
--- a/f265/enc.h
+++ b/f265/enc.h
@@ -2869,6 +2869,7 @@ void fenc_me_interpol_plane(int16_t *dst, int dst_stride, f265_mv mv, f265_me_ct
                             f265_pix *ref_plane, int comp, int width, int height);
 void fenc_me_interpol(f265_pix *dst, int dst_stride, f265_mv mv, f265_me_ctx *me, int comp);
 void fenc_me_interpol_bi(f265_pix *dst, int dst_stride, f265_mv mv[2], f265_me_ctx *me, int comp);
%%% The nomenclature for these functions is getting a little messy.
%%% fenc_get_fake_luma_block_dist_p exists. I understand the "_b" version.
%%% However, the interpolation functions above don't use "_p", but use "_bi".
%%% Can we unify all of this?
+int fenc_get_fake_luma_block_dist_b(f265_enc_thread *t, int8_t bi_ref_idx[2], f265_mv bi_mv[2]);
 int fenc_me_get_dist(f265_me_ctx *me, f265_pix *src0, int32_t stride0, f265_pix *src1,
                      int32_t stride1, int32_t width, int32_t height, int32_t bitdepth);
 int fenc_me_luma_cost(f265_mv mv, f265_me_ctx *me);
diff --git a/f265/me.c b/f265/me.c
index 2d8024f..1eddb6e 100644
--- a/f265/me.c
+++ b/f265/me.c
@@ -149,8 +149,9 @@ static f265_pix* fenc_fake_luma_ref_p(f265_enc_thread *t, int *out_stride, f265_
     return out;
 }
 
-// Return the fake luma distortion cost for the specified motion vector.
-static int fenc_get_fake_luma_block_dist(f265_enc_thread *t, f265_mv mv)
+// Return the fake luma uni-predicted distortion cost for the specified motion
+// vector. The function assumes the references are already set up.
+static int fenc_get_fake_luma_block_dist_p(f265_enc_thread *t, f265_mv mv)
 {
     f265_me_ctx *me = &t->me;
     int packed_dims = me->packed_dims[0];
@@ -169,6 +170,42 @@ static int fenc_get_fake_luma_block_dist(f265_enc_thread *t, f265_mv mv)
     return dist;
 }
 
+// Return the fake luma bi-predicted distortion cost for the specified reference
+// indices and motion vectors. The function sets the references and clips the
+// motion vectors itself. It borrows scratch space from t->store while running.
+int fenc_get_fake_luma_block_dist_b(f265_enc_thread *t, int8_t bi_ref_idx[2], f265_mv bi_mv[2])
+{
+    f265_me_ctx *me = &t->me;
+    int packed_dims = me->packed_dims[0];
+    int width = (packed_dims>>8)&0xff, height = packed_dims&0xff, awi = packed_dims>>24; // Width: bits 8-15, height: bits 0-7, awi: bits 24 and up.
+    int aligned_block_size = F265_ALIGN_VAL(width*height, 64);
+    int alloc_size = 2*aligned_block_size*sizeof(f265_pix); // Room for two blocks, one per reference list.
+    f265_pix *tmp_buf = (f265_pix*)t->store; // Scratch space starts at the current top of the thread store.
%%% Can you move this line above "int aligned block_size" or
%%% below "t->store += alloc_size"? I find it best if all the
%%% "allocation" code is grouped together.
+    fenc_fsad_func df = me->dist_func_id ? fenc_satd_awi : fenc_fsad[awi]; // Distortion function selected by dist_func_id.
+
+    t->store += alloc_size; // Reserve the scratch space.
+
+    // Get the reference in each list.
+    int ref_strides[2];
+    f265_pix *refs[2];
+    for (int list = 0; list < 2; list++)
+    {
+        fenc_me_set_ref(t, t->ref_ctx[list] + bi_ref_idx[list], 0); // NOTE(review): last argument is 0 for both lists, unlike the real bi path which passes `list` - confirm intended.
+        f265_mv mv = f265_clip_mv(bi_mv[list], t->mc_bounds64);
+        refs[list] = fenc_fake_luma_ref_p(t, ref_strides + list, tmp_buf + aligned_block_size*list, mv.x, mv.y); // Each list is given its own half of tmp_buf.
+    }
+
+    // Average the references. Clobber the L0 reference in place.
+    fenc_avg_pix[awi](tmp_buf, refs[0], ref_strides[0], refs[1], ref_strides[1], packed_dims);
+
+    // Compute the distortion.
+    int dist = df(me->src_planes[0], t->plane_stride, refs[0], width, packed_dims); // NOTE(review): reads refs[0] with stride `width`, while the average above is passed tmp_buf; presumably relies on refs[0] == tmp_buf - confirm.
+
%%% Add a comment about "releasing" the memory.
+    t->store -= alloc_size; // Release the scratch space reserved at the top of the function.
+
+    return dist;
+}
+
 // Return the distortion using the current distortion metric.
 int fenc_me_get_dist(f265_me_ctx *me, f265_pix *src0, int32_t stride0, f265_pix *src1,
                      int32_t stride1, int32_t width, int32_t height, int32_t bitdepth)
@@ -213,44 +250,63 @@ int fenc_me_merge_cand_dist(f265_enc_thread *t, f265_inter_neighbour_mv cand)
     int bi_flag = cand.ref_idx[0] != -1 && cand.ref_idx[1] != -1;
     int uni_list = cand.ref_idx[0] == -1;
     int uni_ref_idx = cand.ref_idx[uni_list];
-    int dist = 0;
-
-    // Set the references and clip the MVs in place.
-    if (bi_flag)
-        for (int list = 0; list < 2; list++)
-        {
-            fenc_me_set_ref(t, t->ref_ctx[list] + cand.ref_idx[list], list);
-            f265_clip_mv(cand.mv[list], t->mc_bounds64);
-        }
 
-    else
+    // Fake the uniprediction.
+    if (likely(t->enc->gd.algo&(1<<10) && !bi_flag))
     {
         fenc_me_set_ref(t, t->ref_ctx[uni_list] + uni_ref_idx, 0);
-        f265_clip_mv(cand.mv[uni_list], t->mc_bounds64);
+        f265_mv mv = f265_clip_mv(cand.mv[uni_list], t->mc_bounds64);
+        return fenc_get_fake_luma_block_dist_p(t, mv);
     }
 
-    // Fast track.
-    if ((t->enc->gd.algo&0x400) && !bi_flag)
-        return fenc_get_fake_luma_block_dist(t, cand.mv[uni_list]);
+    // Fake the biprediction.
+    else if (t->enc->gd.algo&(1<<9) && bi_flag)
+    {
+        int8_t bi_ref_idx[2] = { cand.ref_idx[0], cand.ref_idx[1] };
+        return fenc_get_fake_luma_block_dist_b(t, bi_ref_idx, cand.mv);
+    }
 
-    // Pass each component.
-    int nb_comp = 1 + (me->chroma_flag<<1);
-    for (int comp = 0; comp < nb_comp; comp++)
+    // Use the real prediction.
+    else
     {
-        // Do the interpolation.
-        f265_pix buf[64*64];
-        if (bi_flag) fenc_me_interpol_bi(buf, 64, cand.mv, me, comp);
-        else fenc_me_interpol(buf, 64, cand.mv[uni_list], me, comp);
+        f265_mv mvs[2];
+        int dist = 0;
 
-        // Compute the distortion.
-        int is_chroma = !!comp;
-        int scale_x = me->csf[0] * is_chroma;
-        int scale_y = me->csf[1] * is_chroma;
-        dist += (me->dist[me->dist_func_id])(me->src_planes[comp], me->ref_stride, buf, 64,
-                                             me->dim[0]>>scale_x, me->dim[1]>>scale_y, me->bit_depth[0]);
-    }
+        // Set the references and clip the MVs.
+        if (bi_flag)
+        {
+            for (int list = 0; list < 2; list++)
+            {
+                fenc_me_set_ref(t, t->ref_ctx[list] + cand.ref_idx[list], list);
+                mvs[list] = f265_clip_mv(cand.mv[list], t->mc_bounds64);
+            }
+        }
 
-    return dist;
+        else
+        {
+            fenc_me_set_ref(t, t->ref_ctx[uni_list] + uni_ref_idx, 0);
+            mvs[0] = f265_clip_mv(cand.mv[uni_list], t->mc_bounds64);
+        }
+
+        // Pass each component.
+        int nb_comp = 1 + (me->chroma_flag<<1);
+        for (int comp = 0; comp < nb_comp; comp++)
+        {
+            // Do the interpolation.
+            f265_pix buf[64*64];
+            if (bi_flag) fenc_me_interpol_bi(buf, 64, mvs, me, comp);
+            else fenc_me_interpol(buf, 64, mvs[0], me, comp);
+
+            // Compute the distortion.
+            int is_chroma = !!comp;
+            int scale_x = me->csf[0] * is_chroma;
+            int scale_y = me->csf[1] * is_chroma;
+            dist += (me->dist[me->dist_func_id])(me->src_planes[comp], me->ref_stride, buf, 64,
+                                                 me->dim[0]>>scale_x, me->dim[1]>>scale_y, me->bit_depth[0]);
+        }
+
+        return dist;
+    }
 }
 
 // Compute the MV length like HM.
@@ -442,7 +498,7 @@ int fenc_me_mv_cost_test(f265_me_ctx *me, f265_mv mv, int ref_id)
 int fenc_me_mv_total_cost(f265_mv mv, f265_me_ctx *me)
 {
     int mv_cost = fenc_me_mv_cost_test(me, mv, 0);
-    int dist = (me->t->enc->gd.algo&0x400) ? fenc_get_fake_luma_block_dist(me->t, mv) : fenc_me_luma_cost(mv, me);
+    int dist = (me->t->enc->gd.algo&0x400) ? fenc_get_fake_luma_block_dist_p(me->t, mv) : fenc_me_luma_cost(mv, me);
     if (me->chroma_flag)
         for (int comp = 1; comp < 3; comp++)
             dist += fenc_me_chroma_cost(mv, me, comp);
@@ -500,7 +556,7 @@ int fenc_me_test_pmv(f265_mv pmv[2], f265_me_ctx *me, int dist, int *cost)
     if (t->enc->gd.algo&0x400)
     {
         for (int i = 0; i < 2; i++)
-            costs[i] = fenc_get_fake_luma_block_dist(t, f265_clip_mv(pmv[i], me->me_bounds64));
+            costs[i] = fenc_get_fake_luma_block_dist_p(t, f265_clip_mv(pmv[i], me->me_bounds64));
     }
 
     else