diff --git a/f265/bdi.c b/f265/bdi.c
index 4d204d9..3110d66 100644
--- a/f265/bdi.c
+++ b/f265/bdi.c
@@ -302,6 +302,11 @@ void f265_normalize_params(f265_enc_params *p)
     // Make sure there is room for B frames in the lookahead.
     p->la_decision_delay = F265_MAX(p->la_decision_delay, p->nb_b_frames);
 
+    // Kludge. Set one neighbour in the lookahead.
+    // FIXME. This should be set according to the decision delay, the process
+    //        delay, the number of references, and the number of B-frames.
+    if (p->la_flag) p->la_neighbours[0] = 1;
+
     // Normalize the number of lookahead threads. FIXME.
     p->nb_workers[1] = 0;
 
diff --git a/f265/enc.c b/f265/enc.c
index a752fc9..ea4b8c7 100644
--- a/f265/enc.c
+++ b/f265/enc.c
@@ -128,6 +128,7 @@ typedef struct f265_enc_mem_data
     // Object pointers.
     f265_enc *enc;
     f265_enc_thread *enc_threads;
+    f265_enc_thread *la_threads;
     f265_frame *frame_objs;
     uint8_t *src_objs;
     uint8_t *la_objs;
@@ -268,6 +269,45 @@ static void fenc_analyze_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, uint8
 
     // Lookahead object size.
     d->la_obj_size = 0;
+    if (p->la_flag)
+    {
+        // There are as many 16x16 blocks in the actual plane as there are 8x8
+        // blocks in the subsampled planes.
+        // NOTE. pix_dim are multiples of CTB size (16, 32, or 64).
+        int blk_dim[2] = { d->pix_dim[0] >> 4, d->pix_dim[1] >> 4 };
+        int nb_blks = blk_dim[0] * blk_dim[1];
+
+        // Sufficient space for pure intra, and all inter setup combinations.
+        int nb_setups = (p->la_neighbours[0] + 1) * (p->la_neighbours[1] + 1);
+
+        // 16x16 block costs.
+        d->la_obj_size += nb_setups * nb_blks * sizeof(uint16_t);
+
+        // 16x16 block motion vectors.
+        d->la_obj_size += nb_setups * nb_blks * sizeof(f265_mv);
+
+        // 16x16 block QP offsets.
+        d->la_obj_size += nb_blks * sizeof(float);
+
+        // 16x16 block propagation addition factor.
+        d->la_obj_size += nb_blks * sizeof(uint16_t);
+
+        // 16x16 block propagation multiplication factor.
+        d->la_obj_size += nb_blks * sizeof(uint16_t);
+
+        // 16x16 block propagation total.
+        d->la_obj_size += nb_blks * sizeof(uint16_t);
+
+        // 16x16 block propagation bitfield.
+        d->la_obj_size += (nb_blks >> 2) * sizeof(uint8_t);
+
+        // Subsampled planes.
+        // NOTE. There are 4 half-width half-height planes (e.g. 1 luma plane).
+        d->la_obj_size += d->plane_size[0];
+
+        // Cache alignment.
+        d->la_obj_size = F265_ALIGN_VAL(d->la_obj_size, 64);
+    }
 
     // Number of reconstructed luma planes.
     d->nb_rec_luma = 1 + 3*d->subpel_me_flag;
@@ -380,8 +420,10 @@ static void fenc_analyze_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, uint8
     // Track the offset and size of each object group.
     SET_OFF(f265_enc, enc, sizeof(f265_enc), 64);
     SET_OFF(f265_enc_thread, enc_threads, d->nb_enc_threads*sizeof(f265_enc_thread), 64);
+    SET_OFF(f265_enc_thread, la_threads, d->nb_la_threads*sizeof(f265_enc_thread), 64);
     SET_OFF(f265_frame, frame_objs, d->nb_frame_objs*sizeof(f265_frame), 64);
     SET_OFF(uint8_t, src_objs, d->nb_src_objs*d->src_obj_size, 64);
+    SET_OFF(uint8_t, la_objs, d->nb_la_objs*d->la_obj_size, 64);
     SET_OFF(uint8_t, enc_objs, d->nb_enc_objs*d->enc_obj_size, 64);
     SET_OFF(uint8_t, deblock_objs, d->nb_deblock_objs*d->deblock_obj_size, 64);
     SET_OFF(uint8_t, chunk_objs, d->nb_chunk_objs*d->chunk_obj_size, 64);
@@ -689,6 +731,7 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
         gd->early_me_params[0].nb_iters[i] = p->me_iter[i];
         gd->early_me_params[0].search_funcs[i] = sf[p->me_algo[i]];
     }
+    for (int i = 0; i < 2; i++) gd->la_neighbours[i] = p->la_neighbours[i];
 
     // Encoder flags.
     gd->eflags = 0;
@@ -770,6 +813,7 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
     md->nb_unused_objs[F265_UNUSED_ENC_OBJ] = d->nb_enc_objs;
     for (int i = 0; i < d->nb_frame_objs; i++) md->unused_objs[F265_UNUSED_FRM_OBJ][i] = (uint8_t*)(d->frame_objs + i);
     for (int i = 0; i < d->nb_src_objs; i++) md->unused_objs[F265_UNUSED_SRC_OBJ][i] = d->src_objs + i*d->src_obj_size;
+    for (int i = 0; i < d->nb_la_objs; i++) md->unused_objs[F265_UNUSED_LA_OBJ][i] = d->la_objs + i*d->la_obj_size;
     for (int i = 0; i < d->nb_enc_objs; i++) md->unused_objs[F265_UNUSED_ENC_OBJ][i] = d->enc_objs + i*d->enc_obj_size;
 
     // Stream-level rate control initialization.
@@ -777,6 +821,12 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
 
     // Lookahead.
     f265_lookahead *la = &enc->la;
+    la->threads = d->la_threads;
+    for (int i = 0; i < d->nb_la_threads; i++)
+    {
+        fenc_me_set_params(enc, &la->threads[i].me);
+    }
+    la->nb_la_threads = d->nb_la_threads;
     la->display = d->la_display;
     la->display[0] = NULL;
     la->coded = d->la_coded;
@@ -828,7 +878,7 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
     #endif
 
     // Encoding thread objects.
-    for (int i = 0; i < d->nb_enc_threads; i++)
+    for (int i = 0; i < d->nb_enc_threads + d->nb_la_threads; i++)
     {
         f265_enc_thread *t = d->enc_threads + i;
         t->plane_stride = d->stride;
@@ -867,21 +917,31 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
             // encoding resources to avoid unnecessary calls to pthread destroy
             // functions.
 
-            // Init private communication channel.
-            t->cv_flag = !f265_cond_init(&t->worker_cv);
-            if (!t->cv_flag)
+            // Encoding threads.
+            if (i < d->nb_enc_threads)
             {
-                *error_handle = "error creating conditional variable";
-                return;
+                // Init private communication channel.
+                t->cv_flag = !f265_cond_init(&t->worker_cv);
+                if (!t->cv_flag)
+                {
+                    *error_handle = "error creating conditional variable";
+                    return;
+                }
+
+                // Start the next thread.
+                t->status = F265_THREAD_IDLE;
+                t->handle_flag = !f265_thread_create(&t->thread_handle, fenc_encode_parallel_tile, t);
+                if (!t->handle_flag)
+                {
+                    *error_handle = "error creating thread";
+                    return;
+                }
             }
 
-            // Start the next thread.
-            t->status = F265_THREAD_IDLE;
-            t->handle_flag = !f265_thread_create(&t->thread_handle, fenc_encode_parallel_tile, t);
-            if (!t->handle_flag)
+            // Lookahead threads.
+            else
             {
-                *error_handle = "error creating thread";
-                return;
+                // TODO.
             }
         }
         #endif
@@ -987,6 +1047,7 @@ void fenc_deinit_enc(f265_enc *enc)
     if (gd->mt_mode)
     {
         f265_main_data *md = &enc->md;
+        f265_lookahead *la = &enc->la;
         f265_enc_sync *es = &enc->es;
 
         // Abort the encoding.
@@ -996,7 +1057,8 @@ void fenc_deinit_enc(f265_enc *enc)
 
         // Terminate running threads. Check flags to avoid releasing
         // uninitialized resources.
-        for (int n = 0; n < md->nb_enc_threads; n++)
+        int nb_threads = md->nb_enc_threads + la->nb_la_threads;
+        for (int n = 0; n < nb_threads; n++)
         {
             f265_enc_thread *t = md->enc_threads[n];
 
@@ -1051,6 +1113,74 @@ void fenc_link_src_obj(f265_enc *e, f265_frame *f)
     o += gd->plane_size[1];
 }
 
+// Link an unused lookahead object to the frame specified and carve the frame's lookahead buffers out of it.
+void fenc_link_la_obj(f265_enc *e, f265_frame *f)
+{
+    f265_gen_data *gd = &e->gd;
+
+    uint8_t *o = fenc_get_unused_obj(e, F265_UNUSED_LA_OBJ);
+    f->mem_buf[2] = o;
+
+    // Bail out if the lookahead is not used. The object stays attached to mem_buf[2].
+    if (!(gd->eflags&F265_PF_LA)) return;
+
+    // Subsampled planes.
+    // The planes are laid out as follows. Each "sub_plane" is surrounded by
+    // padding (at least 32 samples). The actual stride is identical to the one
+    // used in the input source pictures. Thus, to access the row of samples
+    // above or below the current one, use the stride stored in f265_gen_data.
+    // Half that stride lets you move from sub_plane[0] to sub_plane[1] or
+    // sub_plane[2] to sub_plane[3].
+    //                    +-------------------+
+    //                    |      Padding      |
+    //                    | P +---+ P +---+ P |
+    //                    | a | 0 | a | 1 | a |
+    //                    | d +---+ d +---+ d |
+    //                    | d    Padding    d |
+    //                    | i +---+ i +---+ i |
+    //                    | n | 2 | n | 3 | n |
+    //                    | g +---+ g +---+ g |
+    //                    |      Padding      |
+    //                    +-------------------+
+    f->sub_planes[0] = (f265_pix*)(o + (gd->plane_off[0] >> 1)); o += gd->plane_size[0];
+    f->sub_planes[1] = f->sub_planes[0] + (gd->stride >> 1);
+    f->sub_planes[2] = f->sub_planes[0] + (gd->stride * ((gd->pix_dim[1] >> 1) + F265_LUMA_PLANE_PADDING));
+    f->sub_planes[3] = f->sub_planes[2] + (gd->stride >> 1);
+
+    f265_la_frame *la = &f->la;
+
+    // Reset frame sums to 0.
+    memset(la->frame_sums, 0, sizeof(la->frame_sums));
+
+    // Initialize frame_costs to -1 (every byte set to 0xff): nothing has been tested yet.
+    memset(la->frame_costs, -1, sizeof(la->frame_costs));
+
+    // Initialize propagation setup. 0xffff is an invalid setup.
+    la->blk_prop_setup = 0xffff;
+
+    // Reset the reference test bitmap.
+    memset(la->ref_tests, 0, sizeof(la->ref_tests));
+
+    // Number of 16x16 blocks in the luma plane / 8x8 blocks in the subsampled
+    // plane.
+    int blk_dim[2] = { gd->pix_dim[0] >> 4, gd->pix_dim[1] >> 4 };
+    int nb_blks = blk_dim[0] * blk_dim[1];
+
+    // Carve out cost and MV storage for each (L0, L1) setup combination, right after the subsampled planes.
+    for (int n0 = 0; n0 <= gd->la_neighbours[0]; n0++)
+        for (int n1 = 0; n1 <= gd->la_neighbours[1]; n1++)
+        {
+            la->blk_costs[n0][n1] = (uint16_t*)o; o += nb_blks * sizeof(uint16_t);
+            la->blk_mv[n0][n1] = (f265_mv*)o; o += nb_blks * sizeof(f265_mv);
+        }
+
+    la->blk_qp_off = (float*)o; o += nb_blks * sizeof(float);
+    la->blk_prop_add = (uint16_t*)o; o += nb_blks * sizeof(uint16_t);
+    la->blk_prop_mult = (uint16_t*)o; o += nb_blks * sizeof(uint16_t);
+    la->blk_prop_total = (uint16_t*)o; o += nb_blks * sizeof(uint16_t);
+    la->blk_prop_bf = (uint8_t*)o; o += (nb_blks >> 2) * sizeof(uint8_t);
+}
+
 // Link an unused encoding object to the frame specified.
 void fenc_link_enc_obj(f265_enc *e, f265_frame *f)
 {
diff --git a/f265/enc.h b/f265/enc.h
index 9ea5c07..ad5af69 100644
--- a/f265/enc.h
+++ b/f265/enc.h
@@ -1366,6 +1366,99 @@ typedef struct f265_rec_params
 
 } f265_rec_params;
 
+// Lookahead data associated to a frame.
+// The lookahead operates on 16x16 blocks (8x8 blocks in a subsampled plane -
+// half width, half height). This fixed block size differs from the CTB concept
+// which is variable in nature. Thus, each 16x16 block maps to a single CTB;
+// no 16x16 block will ever lie on a CTB boundary. However, each CTB may be
+// associated to multiple 16x16 blocks. For example, 32x32 CTBs map to four
+// 16x16 regions. This is illustrated below.
+//
+//                           +---+---+ +---+---+
+//                           | a | b | | e | f |
+//                  CTB 1 => +---+---+ +---+---+ <= CTB 2
+//                           | c | d | | g | h |
+//                           +---+---+ +---+---+
+//                           +---+---+ +---+---+
+//                           | i | j | | m | n |
+//                  CTB 3 => +---+---+ +---+---+ <= CTB 4
+//                           | k | l | | o | p |
+//                           +---+---+ +---+---+
+//
+// The size of the arrays used to store the costs and other relevant stats only
+// depends on the picture size: the CTB size can be ignored when allocating
+// memory that pointers such as blk_costs will point to.
+// Working with 16x16 blocks is easy. The f265_gen_data pix_dim[2] member gives
+// the number of samples in both directions. These values are multiples of the
+// CTB size: CTBs are 16, 32, or 64 samples wide/high. Thus, simply right shift
+// the values by 4 to know how many 16x16 blocks there are.
+typedef struct f265_la_frame
+{
+    // Frame "complexity" buffer for each component.
+    // The first index is the colour component. The second index is for the
+    // complexity metrics: 0 => sum of the pixels, 1 => sum of the squared
+    // pixels values.
+    uint64_t frame_sums[3][2];
+
+    // Cost of encoding the entire frame with a specific setup. For instance,
+    // imagine that every 16x16 block in the current frame uses the previous
+    // frame for motion compensation and no other frame, nor intra coding.
+    //
+    // The first index refers to the L0 list, and the second one, the L1 list.
+    // A value of 0 in any one index means that the list is not used.
+    // frame_costs[0][0] is used to store the estimated all-intra cost.
+    // frame_costs[n][0] is used to store the estimated all-inter cost with L0.
+    // frame_costs[0][n] is used to store the estimated all-inter cost with L1.
+    // frame_costs[n][m] is used to store the estimated bi-dir cost.
+    // A cost of -1 indicates that the setup has not yet been tested.
+    uint64_t frame_costs[1 + 17][1 + 17];
+
+    // Packed L0|L1 indices of the most recent setup investigated.
+    // (L0 idx << 8) | L1 idx. Initialized to 0xffff.
+    uint16_t blk_prop_setup;
+
+    // Bitmap which is used to keep track of the reference frames already
+    // tested. The index is the reference list. The offset in the bitmap is the
+    // reference frame offset minus 1.
+    uint32_t ref_tests[2];
+
+    // Cost and encoding mode of a 16x16 block in a setup. The cost is in the
+    // low 14 bits, the mode is in the high 2 bits (00 intra, 01 L0, 10 L1,
+    // 11 Bi).
+    //
+    // Multiple costs for each 16x16 block are stored. The same indexing scheme
+    // used for the above frame_costs is used here. Moreover, a third index,
+    // the 16x16 block's raster index, is required.
+    // blk_costs[0][0][k] indicates the pure intra cost for the k-th block.
+    // blk_costs[n][0][k] stores the k-th block's inter cost (n-th L0 ref).
+    // blk_costs[0][n][k] stores the k-th block's inter cost (n-th L1 ref).
+    // blk_costs[n][m][k] stores the k-th block's bi-dir cost (n-th L0 ref, and
+    // m-th L1 ref).
+    //
+    // Unlike frame_costs, there isn't a value used to indicate that the setup
+    // hasn't been tested yet. The ref_tests bitmaps are used for that.
+    uint16_t *blk_costs[1 + 17][1 + 17];
+
+    // Estimated motion vectors. Same indices as blk_costs.
+    f265_mv *blk_mv[1 + 17][1 + 17];
+
+    // 16x16 block's suggested QP offset.
+    float *blk_qp_off;
+
+    // 16x16 block's flow propagation addition and multiplication factors.
+    uint16_t *blk_prop_add;
+    uint16_t *blk_prop_mult;
+
+    // 16x16 block's flow propagation total. The values are padded with entries
+    // at both ends to cater for out-of-bounds MVs in assembly.
+    uint16_t *blk_prop_total;
+
+    // 16x16 block's flow propagation list bitfields. There are two bits per
+    // block (identical to blk_costs). They are grouped in 64-bit bitfields.
+    uint8_t *blk_prop_bf;
+
+} f265_la_frame;
+
 // Data associated to a frame. The frame object is lightweight. Other objects
 // are attached and detached dynamically from the frame object.
 struct f265_frame
@@ -1484,8 +1577,8 @@ struct f265_frame
     int64_t ref_poc[2][16];
 
     // Lookahead data. The data is exclusively set by the lookahead. The main
-    // thread and the lookahead do not access this data concurrently. FIXME.
-    //f265_la_frame la;
+    // thread and the lookahead do not access this data concurrently.
+    f265_la_frame la;
 
 
     // Data exclusive to the main thread.
@@ -2215,6 +2308,12 @@ typedef struct f265_gen_data
     // weighted prediction entry?
     f265_early_me_params early_me_params[3];
 
+    // Lookahead.
+
+    // Number of frames to the left [0] and to the right [1] of the current
+    // frame the lookahead may use.
+    uint8_t la_neighbours[2];
+
     #ifdef F265_HAVE_STDIO
     // Reconstructed YUV frame file. NULL if not used.
     FILE *yuv_dump_file;
@@ -2332,6 +2431,10 @@ typedef struct f265_lookahead
 {
     // Data exclusive to the lookahead and invariant during slice encoding.
 
+    // Pointer to the lookahead thread array. The first thread is the primary
+    // lookahead thread.
+    f265_enc_thread *threads;
+
     // Pointer to the array of frames stored in display order. For unification,
     // the first element is the last frame of the last committed group of
     // pictures.
@@ -2683,6 +2786,7 @@ void fenc_analyze_params(f265_enc_params *params);
 void fenc_init_enc(f265_enc_params *params, uint8_t *buf, f265_enc **enc_handle, char **error_handle);
 uint8_t* fenc_get_unused_obj(f265_enc *e, int cat);
 void fenc_link_src_obj(f265_enc *e, f265_frame *f);
+void fenc_link_la_obj(f265_enc *e, f265_frame *f);
 void fenc_link_enc_obj(f265_enc *e, f265_frame *f);
 void fenc_release_frame_mem(f265_enc *e, f265_frame *f);
 int fenc_process_enc_req(f265_enc *e, f265_enc_req *req);
@@ -2756,10 +2860,8 @@ void fenc_get_merge_candidate(f265_enc_thread *t, f265_cb *cb, uint32_t partitio
                               f265_inter_neighbour_mv *neighbours, f265_inter_neighbour_mv *merge_candidate);
 
 // la.c
-f265_frame* fenc_la_make_frame(f265_enc *e);
-f265_frame* fenc_la_process_frame_regular(f265_enc *enc, f265_frame *in);
-void fenc_la_process_frames(f265_enc *enc);
 f265_frame* fenc_la_add_frame(f265_enc *e, f265_frame *in);
+f265_frame* fenc_la_make_frame(f265_enc *e);
 
 // me.c
 void fenc_me_interpol_plane(int16_t *dst, int dst_stride, f265_mv mv, f265_me_ctx *me, int csf[2],
@@ -2835,6 +2937,8 @@ void fenc_copy_block_s16(int16_t *dst, int dst_stride, int16_t *src, int src_str
 void fenc_save_bottom_right_edges(f265_pix *dst, f265_pix *src, int src_stride, int width, int height);
 void fenc_restore_bottom_right_edges(f265_pix *dst, int dst_stride, f265_pix *src, int width, int height);
 void fenc_transpose_block(f265_pix *dst, int dst_stride, f265_pix *src, int src_stride, int width, int height);
+void fenc_compute_subsampled(f265_pix *sub_planes[4], f265_pix *src_plane,
+                             int32_t stride, int32_t width, int32_t height);
 void fenc_pad_plane(f265_pix *plane, int32_t stride,
                     int32_t bl, int32_t br, int32_t bt, int32_t bb,
                     int32_t pl, int32_t pr, int32_t pt, int32_t pb);
diff --git a/f265/f265.h b/f265/f265.h
index 476be04..8519cbc 100644
--- a/f265/f265.h
+++ b/f265/f265.h
@@ -222,6 +222,9 @@ typedef struct f265_enc_params
     int16_t la_decision_delay;
     int16_t la_process_delay;
 
+    // Maximum number of available neighbours to the left and to the right of
+    // the current frame.
+    int8_t la_neighbours[2];
 
     // Rate control.
 
diff --git a/f265/la.c b/f265/la.c
index 448a4ff..ad4c59d 100644
--- a/f265/la.c
+++ b/f265/la.c
@@ -5,6 +5,10 @@
 
 #include "enc.h"
 
+// Trace lookahead steps.
+//#define F265_TRACE_LOOKAHEAD
+
+// Convenience function.
 // Pop a frame from a frame queue.
 static inline f265_frame* fenc_frame_queue_pop(f265_frame **queue, int16_t *queue_size)
 {
@@ -14,84 +18,1108 @@ static inline f265_frame* fenc_frame_queue_pop(f265_frame **queue, int16_t *queu
     return f;
 }
 
-// Prepare the frame for insertion in the lookahead.
-f265_frame* fenc_la_make_frame(f265_enc *e)
+// Convenience function. Reinterpret the bits of a uint32_t as a float.
+// A union is used: punning through a cast pointer violates strict aliasing.
+static inline float fenc_la_itof(uint32_t in)
 {
-    f265_gen_data *gd = &e->gd;
-    f265_main_data *md = &e->md;
-    f265_enc_req *req = md->req;
+    union { uint32_t i; float f; } pun = { .i = in };
+    return pun.f;
+}
 
-    // Link the frame, source and lookahead objects.
-    f265_frame *f = (f265_frame*)fenc_get_unused_obj(e, F265_UNUSED_FRM_OBJ);
-    f->mem_buf[0] = (uint8_t*)f;
-    fenc_link_src_obj(e, f);
-    f->mem_buf[3] = NULL;
+// Convenience function. Reinterpret the bits of a float as a uint32_t.
+// A union is used: punning through a cast pointer violates strict aliasing.
+static inline uint32_t fenc_la_ftoi(float in)
+{
+    union { float f; uint32_t i; } pun = { .f = in };
+    return pun.i;
+}
 
-    // Reset the frame flags.
-    f->main_flags = f->gen_flags = 0;
+// Convenience function.
+// Compute the approximate base-2 exponential (2^in) of the value specified.
+// The polynomial's absolute error is less than 0.003 (before scaling by 2^j).
+// The error is maximal near 0.0 (the result is not 1).
+static inline float fenc_la_exp2f(float in)
+{
+    // Computation:
+    // 2**x == 2^(j + k) with j = floor(x) and k = x - j. We have k in [0,1[.
+    //      == (2^j) * (2^k).
 
-    // The frame is used for reference in the lookahead.
-    f->main_flags |= F265_FF_LA_REF;
+    // Extract the floor and the fractional part. Truncation alone would give k < 0 for negative inputs.
+    int i_part = (int)in - ((float)(int)in > in);
+    float f_part = in - (float)i_part;
 
-    // Set the absolute picture order count.
-    f->abs_poc = md->abs_poc_counter++;
+    // The integer part is the unbiased exponent of the float value 1.0*2^j.
+    float f_j = fenc_la_itof((127 + i_part) << 23);
 
-    // Set the timestamp and the duration.
-    f->timestamp = req->input->timestamp;
-    f->duration = req->input->duration;
+    // Evaluate the order 2 minimax approximation polynomial for 2^k with k in
+    // [0, 1[.
+    float coeffs[3] = { 1.0024760560816106f,
+                        0.6510467806701004f,
+                        0.34400110716667837f };
+    float f_k = ((coeffs[2] * f_part) + coeffs[1]) * f_part + coeffs[0];
 
-    // Force key frame if requested.
-    F265_SET_FLAG(f->gen_flags, F265_FF_KEY, req->force_key_flag|req->force_idr_flag);
-    F265_SET_FLAG(f->gen_flags, F265_FF_IDR, req->force_idr_flag);
+    return f_j * f_k;
+}
 
-    // Force IDR if this is the first frame or we're overflowing the picture
-    // order count.
-    if (!f->abs_poc || md->enc_frames[0] && md->enc_frames[0]->h265_poc >= 2000000000)
+// Convenience function.
+// Compute approximate log2 of the value specified. The domain is [1,infinity).
+// The absolute error is less than 0.005. The error is maximal near 1.0f (the
+// result is non-null).
+static inline float fenc_la_log2f(float in)
 {
+    // This function assumes that the floating point layout is as follows.
+    //
+    // [ 31 ][30....23][22.....0]
+    // [sign][exponent][mantissa]
+    //
+    // RealVal = ((-1)^sign) * mantissa * 2^(exponent - 127)
+    //
+    // 'sign' is the sign of the float. 'exponent' is an unsigned byte for
+    // which 127 represents the non-biased exponent 0. 'mantissa' is the
+    // fractional part between [1, 2[ with the leading '1.' excluded. This is
+    // the normalized layout. Very small floats use another layout.
+    //
+    // The log is computed as follows.
+    //
+    // log2(a*b) == log2(a) + log2(b)
+    // log2(RealVal) == log2(mantissa) + (exponent - 127).
+
+    // Cast the input as an integer.
+    uint32_t i_in = fenc_la_ftoi(in);
+
+    // Shift the exponent to the less significant bits. There is no need to
+    // mask the sign bit of the float because it is assumed to be zero. Then,
+    // subtract 127 to get the non-biased value of the exponent.
+    int32_t log2_exp = (int32_t)(i_in >> 23) - 127;
+
+    // Mask the mantissa and set the non-biased exponent to 0. The result is a
+    // float in the range [1, 2[.
+    float frac = fenc_la_itof((i_in & 0x007FFFFF) | 0x3F800000);
+
+    // Evaluate the order 2 minimax approximation polynomial for log2(x) with x
+    // in the range [1, 2[. The coefficients were obtained using the Remez
+    // algorithm. log2_frac = c0 + c1*x + c2*x*x
+    float coeffs[3] = { -1.6748775934184137f,
+                         2.024665787920043f,
+                        -0.34484843538466037f };
+    float log2_frac = ((coeffs[2] * frac) + coeffs[1]) * frac + coeffs[0];
+
+    return (float)log2_exp + log2_frac;
+}
+
+// Convenience function.
+// Approximate the ratio num/den as num * (approximate reciprocal of den).
+static inline float fenc_la_divf(float num, float den)
+{
+    // Magic. The relative error measured experimentally is about 5%.
+    // http://stackoverflow.com/questions/12227126/division-as-multiply-and-lut-fast-float-division-reciprocal.
+    int mask = 0x7EF311C3;
+    float inv_f = fenc_la_itof(mask - fenc_la_ftoi(den));
+
+    // Newton-Raphson refinement of 1/den: x' = x * (2 - den * x). It must use den, not num.
+    // The relative error is less than 0.3% after one iteration.
+    inv_f *= 2.0f - inv_f * den;
+
+    return num * inv_f;
+}
+
+// Convenience function.
+// Accumulate the sample sum (sum) and the sum of squared samples (ssd) of the
+// NxN block at src (N = 1 << lg_bs), then derive var = ssd - sum*sum/(N*N).
+static void fenc_la_block_var(f265_pix *src, int32_t stride, int32_t lg_bs, uint64_t *sum, uint64_t *ssd, uint32_t *var)
+{
+    int32_t dim = 1 << lg_bs;
+    *ssd = 0;
+    *sum = 0;
+    for (int row = 0; row < dim; row++, src += stride)
     {
-        F265_SET_FLAG(f->gen_flags, F265_FF_KEY|F265_FF_IDR, 1);
+        for (int col = 0; col < dim; col++)
+        {
+            int sample = src[col];
+            *ssd += sample * sample;
+            *sum += sample;
+        }
     }
+    *var = *ssd - ((*sum * *sum) >> (lg_bs * 2));
+}
 
-    // FIXME: assuming 4:2:0 data for now. We need to figure out how we want to
-    // handle the format conversions. In particular, we may need to increase the
-    // bit depth of the input.
+// Convenience function.
+// Fetch the neighbouring samples for intra prediction. Assumes 8x8 block.
+// NOTE. packed = (bit depth << 8) | 8. NOTE(review): fenc_la_aq_mse_luma passes | 16 - confirm.
+static void fenc_la_intra_neighbours(f265_pix neighbours[2][160], f265_pix *src, int stride, int packed)
+{
+    // FIXME. Respect tile boundaries (availability is hard-coded to 16/16 here).
+    int avail[2] = { 16, 16 };
+    int filter_flag = 0;
+    fenc_extract_intra_neigh[1](neighbours, src, stride, avail, filter_flag, packed);
+}
 
-    // Get the plane information. The luma plane needs to be padded with an
-    // extra pixel if the lookahead is used (subsampled planes + AQ intra
-    // prediction).
-    int32_t stride = gd->stride;
-    f265_pix **planes = (f265_pix**)f->src_planes;
-    int32_t *pix_dim = gd->pix_dim;
-    int32_t *clip_dim = gd->clip_dim;
-    int32_t pad_luma[4] = { 0, pix_dim[0] - clip_dim[0], 0, pix_dim[1] - clip_dim[1] };
-    int32_t pad_chroma[2] = { pad_luma[1]>>1, pad_luma[3]>>1 };
-    for (int i = 0; i < 4; i++) pad_luma[i] += F265_GET_FLAG(gd->eflags, F265_PF_LA);
-    for (int i = 0; i < 2; i++) pad_luma[i] = F265_ALIGN_VAL(pad_luma[i], 8);
-    pad_chroma[0] = F265_ALIGN_VAL(pad_chroma[0], 8);
+// Convert each 16x16 block's MSE into a suggested QP offset (blk_qp_off) and a propagation add factor (blk_prop_add).
+static void fenc_la_aq_mse(f265_la_frame *la, uint32_t *blk_mse, int32_t nb_blks)
+{
+    // Cached data. blk_costs[0][0] is the pure intra setup.
+    float *blk_qp_off = la->blk_qp_off;
+    uint16_t *blk_add = la->blk_prop_add;
+    uint16_t *blk_intra = la->blk_costs[0][0];
 
-    // Copy the plane data.
-    for (int i = 0; i < 3; i++)
+    // FIXME. This should be a parameter.
+    // NOTE. The value is taken from the default value of la_algo_strength[1]
+    //       in the legacy code.
+    float strength = 1.0f;
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%10sla_aq_mse\n", "");
+    printf("%12sStrength=%f\n", "", strength);
+    #endif
+
+    for (int blk_xy = 0; blk_xy < nb_blks; blk_xy++)
     {
-        int32_t in_stride = req->input->stride[i];
-        uint8_t *in_plane = req->input->planes[i];
-        f265_pix *p = planes[i];
-        for (int32_t j = 0; j < clip_dim[1]>>!!i; j++, in_plane += in_stride, p += stride)
-            memcpy(p, in_plane, clip_dim[0]>>!!i);
+        // The relation between the number of bits and the mean square error
+        // (MSE) is linear. The relation between the QP and the number of bits
+        // is logarithmic. Hence, the relation between the QP and the MSE is
+        // logarithmic.
+        blk_qp_off[blk_xy] = strength * (fenc_la_log2f(blk_mse[blk_xy] + 1) - 14.3f);
+
+        // For the block flow, scale the intra cost by the quantization scaling
+        // factor corresponding to the QP offset. This is another estimate of
+        // the complexity of the block, and thus of the expected gain in bits
+        // after improving the accuracy of the reference.
+        //
+        // Using the intra cost instead of the constant value yields a minor
+        // improvement. Scaling by the quantization scaling factor yields a
+        // medium improvement. All attempts to integrate the inter cost here
+        // have failed. The stability of the addition factor is important, and
+        // volatile values such as the inter costs increase the variance of the
+        // block QPs.
+        //
+        // Using the square root/log of the intra cost did not provide real
+        // benefits, even though it reduces the variance. There was no benefit
+        // to using a minimum intra value or integrating the chroma intra cost.
+        int add = (float)blk_intra[blk_xy] * fenc_la_exp2f(blk_qp_off[blk_xy] * (-1.0f / 6.0f));
+        blk_add[blk_xy] = F265_MIN(add, 65535);
+
+        #ifdef F265_TRACE_LOOKAHEAD
+        printf("%12sQP off=%f, 16x16 add factor=%d\n", "",
+               blk_qp_off[blk_xy], blk_add[blk_xy]);
+        #endif
     }
+}
 
-    // Pad the planes.
-    fenc_pad_plane(planes[0], stride, 0, clip_dim[0] - 1, 0, clip_dim[1] - 1,
-                   pad_luma[0], pad_luma[1], pad_luma[2], pad_luma[3]);
-    for (int i = 0; i < 2; i++)
-        fenc_pad_plane(planes[1+i], stride, 0, (clip_dim[0] - 1)>>1, 0, (clip_dim[1] - 1)>>1,
-                       0, pad_chroma[0], 0, pad_chroma[1]);
+// Study the chroma variances of 'width' consecutive 8x8 U/V patches (blk_mse receives U, V interleaved).
+static void fenc_la_aq_mse_chroma(f265_pix *u, f265_pix *v, int stride, uint32_t *blk_mse, uint64_t frame_sums[2][2],
+                                  int width)
+{
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%10sla_aq_mse_chroma\n", "");
+    #endif
 
-    return f;
+    f265_pix *src[2] = { u, v };
+    for (int blk_x = 0; blk_x < width; blk_x++)
+    {
+        for (int comp = 0; comp < 2; comp++)
+        {
+            // FIXME. This code assumes 4:2:0 subsampling.
+            // Compute the 8x8 chroma patch's variance.
+            uint64_t sum, ssd;
+            uint32_t var;
+            fenc_la_block_var(src[comp], stride, 3, &sum, &ssd, &var);
+
+            // Keep track of the entire frame's complexity.
+            frame_sums[comp][0] += sum;
+            frame_sums[comp][1] += ssd;
+            *blk_mse++ = var;
+
+            #ifdef F265_TRACE_LOOKAHEAD
+            printf("%12ssum=%llu, ssd=%llu, mse=%u\n", "",
+                   (unsigned long long)sum, (unsigned long long)ssd, (unsigned)var);
+            #endif
+
+            // Move to the component's next 8x8 patch. FIXME. Assumes 4:2:0 subsampling.
+            src[comp] += 8;
+        }
+    }
+}
+
+// Evaluate the luma sample variance.
+// The main complexity metric is the pixel variance in the block. It is a good
+// metric, but it is blind to the spatial distribution of the samples. As a
+// result, the complexity is overestimated when the spatial correlation is
+// high.
+// We correct this problem by computing the sums of squared differences to the
+// vertical and horizontal intra predictions, and by keeping the minimum
+// between the variance and the best SSD. This is essentially an MSE metric.
+// Empirically, we have found that multiplying the intra SSDs by 4 yields
+// better results.
+// Processes 'width' consecutive 16x16 blocks of one row and writes one value per block to blk_mse.
+// NOTE. Summing the squared AC frequencies of the 4x4 Hadamard transform on
+//       the block yields no improvement over the variance.
+static void fenc_la_aq_mse_luma(f265_pix *src, int stride, int bd, uint32_t *blk_mse, uint64_t frame_sums[2],
+                                int width)
+{
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%10sla_aq_mse_luma\n", "");
+    #endif
+
+    F265_ALIGN64 f265_pix neighbours[2][160];
+    F265_ALIGN64 f265_pix pred[256];
+
+    int filter_flag = 0;
+    int max_err = 1 << bd;
+    int packed = (bd << 8) | 16;
+    int mode[2] = { 10, 26 };
+
+    for (int blk_x = 0; blk_x < width; blk_x++)
+    {
+        // Minimum Squared Error. Cap it out to start (64-bit: 256 << (2 * bd) overflows int when bd >= 12).
+        uint64_t mse = (uint64_t)256 * max_err * max_err;
+
+        // Fetch neighbours.
+        fenc_la_intra_neighbours(neighbours, src, stride, packed);
+
+        // Only try horizontal and vertical modes.
+        for (int i = 0; i < 2; i++)
+        {
+            // Fetch prediction signal.
+            fenc_predict_intra_mode(pred, neighbours[filter_flag], 4, bd, mode[i], filter_flag);
+
+            // Multiply SSD by 4.
+            int subshift = 2;
+            int ssd = fenc_ssd(pred, 16, src, stride, 16, 16, subshift, bd);
+
+            // Keep track of the better option (horizontal vs. vertical).
+            mse = F265_MIN(mse, ssd);
+        }
+
+        // Compute the variance.
+        uint64_t sum, ssd;
+        uint32_t var;
+        fenc_la_block_var(src, stride, 4, &sum, &ssd, &var);
+
+        // Choose between variance and best prediction MSE. The result fits in 32 bits since var does.
+        mse = F265_MIN(mse, var);
+
+        // Keep track of the entire frame's complexity (sum in [0], SSD in [1]).
+        frame_sums[0] += sum;
+        frame_sums[1] += ssd;
+        *blk_mse++ = (uint32_t)mse;
+
+        #ifdef F265_TRACE_LOOKAHEAD
+        printf("%12ssum=%llu, ssd=%llu, mse=%u\n", "", (unsigned long long)sum, (unsigned long long)ssd, (unsigned)mse);
+        #endif
+
+        // Point to the top-left corner of the next 16x16 block on the row.
+        src += 16;
+    }
+}
+
+// Adaptive Quantization (AQ).
+// NOTE. AQ runs on the actual input planes, not the subsampled ones.
+//       AQ deals with 16x16 blocks since they match the 8x8 blocks used to
+//       evaluate the intra/inter costs in the subsampled plane.
+static void fenc_la_aq(f265_enc *enc, f265_frame *f)
+{
+    // Cached data.
+    f265_gen_data *gd = &enc->gd;
+    f265_la_frame *la = &f->la;
+    int width = gd->pix_dim[0] >> 4;
+    int height = gd->pix_dim[1] >> 4;
+    int stride = gd->stride;
+    int bd = gd->bit_depth[0];
+    f265_pix *src[3] = { f->src_planes[0], f->src_planes[1], f->src_planes[2] };
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%8sla_aq\n", "");
+    #endif
+
+    // FIXME. This won't work when multithreading is used.
+    // Fetch the primary thread.
+    f265_enc_thread *t = enc->la.threads;
+
+    // Borrow the store: one luma MSE per 16x16 block, then one scratch row of 2 * width chroma values.
+    int nb_blks = width * height;
+    uint32_t *mse = (uint32_t*)t->store;
+    t->store += (nb_blks + 2 * width) * sizeof(uint32_t);
+    uint32_t *chroma_mse = mse + nb_blks;
+    // Compute the MSEs for every 16x16 block (first AQ pass).
+    for (int blk_y = 0; blk_y < height; blk_y++, mse += width)
+    {
+        fenc_la_aq_mse_luma(src[0], stride, bd, mse, la->frame_sums[0], width);
+        fenc_la_aq_mse_chroma(src[1], src[2], stride, chroma_mse, la->frame_sums + 1, width); // Only the sums are kept.
+
+        // FIXME. This code assumes 4:2:0 subsampling.
+        // Move to the top-left corner of the next row's first 16x16 block.
+        src[0] += stride << 4;
+        src[1] += stride << 3;
+        src[2] += stride << 3;
+    }
+
+    // Rollback to the 1st 16x16 block's MSE.
+    mse -= nb_blks;
+
+    // Process the MSEs (second AQ pass).
+    fenc_la_aq_mse(la, mse, nb_blks);
+
+    // Release the store buffer (luma MSEs and chroma scratch).
+    t->store = (uint8_t*)mse;
+}
+
+// Scene cut detector: return 1 when the [1][0] setup costs nearly as much as pure intra, 0 otherwise.
+static int fenc_la_detect_scene_cut(f265_frame *f, int gop_size, int keyint_min, int keyint_max)
+{
+    // Frame costs of the pure intra setup and of the [1][0] setup.
+    int cost_intra = f->la.frame_costs[0][0];
+    int cost_inter = f->la.frame_costs[1][0];
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%8sla_detect_scene_cut\n", "");
+    printf("%10sintra=%d, setup[1][0]=%d\n", "", cost_intra, cost_inter);
+    #endif
+
+    // Upper/lower bound of the detector strength. If an IDR frame occurred
+    // lately, use the lower bound to reduce the probability of outputting
+    // a bogus series of I frames.
+    // FIXME. Not optimal. More tests needed.
+    // FIXME. strength_hi should be a parameter or a member in f265_lookahead.
+    //        The value 0.4f was taken from the legacy code.
+    float strength_hi = 0.4f;
+    float strength_lo = strength_hi * 0.2f;
+
+    // Past keyint_min, the strength grows linearly with the GOP size.
+    float ramp = gop_size > keyint_min
+               ? (strength_hi - strength_lo) * (gop_size - keyint_min) / (keyint_max - keyint_min) : 0.0f;
+    float strength = strength_lo + ramp;
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%10sstrength=%f\n", "", strength);
+    #endif
+
+    // A scene cut is declared when the inter cost reaches (1 - strength) times the intra cost.
+    return cost_inter >= (1.0f - strength) * cost_intra;
+}
+
+// Convenience function.
+// Build the lookahead MV candidate list: cands[0] is the MVP, cands[1] stays null, and the available
+// right, bottom, bottom-left and bottom-right neighbours are appended from cands[2] on. Returns the count.
+// NOTE. packed_const = blk_y | blk_x | height - 1 | width - 1 (16 bits each, blk_y in the top bits).
+static int fenc_la_est_mv_cands(f265_mv *cands, f265_mv *blk_mvs, uint64_t packed_const)
+{
+    static const int offsets[4][2] = { { 1, 0 }, { 0, 1 }, { -1, 1 }, { 1, 1 } };
+    int blk_y = (uint16_t)(packed_const >> 48);
+    int blk_x = (uint16_t)(packed_const >> 32);
+    int height = (uint16_t)(packed_const >> 16) + 1;
+    int width = (uint16_t)packed_const + 1;
+
+    // Start from six null candidates.
+    memset(cands, 0, 6 * sizeof(f265_mv));
+
+    // Number of neighbour vectors gathered so far.
+    int found = 0;
+
+    // Visit the neighbours in order, skipping those outside the frame.
+    for (int n = 0; n < 4; n++)
+    {
+        int dx = offsets[n][0], dy = offsets[n][1];
+        int col = blk_x + dx, row = blk_y + dy;
+
+        if (col < 0 || row < 0 || col >= width || row >= height) continue;
+        cands[2 + found++] = blk_mvs[dy * width + dx];
+
+        #ifdef F265_TRACE_LOOKAHEAD
+        printf("%12scands[%d] = (%d,%d)\n", "", 1 + found,
+               cands[1 + found].x, cands[1 + found].y);
+        #endif
+    }
+
+    // Two or more neighbours: the MVP is the component-wise median of the
+    // first three neighbour slots (an unfilled slot is still null).
+    if (found > 1)
+    {
+        f265_mv a = cands[2], b = cands[3], c = cands[4];
+        // median(a, b, c) = max(min(a, b), min(max(a, b), c)).
+        cands[0].x = F265_MAX(F265_MIN(a.x, b.x), F265_MIN(F265_MAX(a.x, b.x), c.x));
+        cands[0].y = F265_MAX(F265_MIN(a.y, b.y), F265_MIN(F265_MAX(a.y, b.y), c.y));
+    }
+
+    // Otherwise the MVP is the lone neighbour, or null when there is none.
+    else
+        cands[0] = cands[2];
+
+    // Count MVP, and null.
+    return found + 2;
+}
+
+// Run motion estimation for the current 8x8 block in setup [ro[0]][ro[1]]. Cache the MV and the cost.
+static void fenc_la_est_block_inter(f265_enc *enc, f265_frame *f, f265_frame *ref, int ro[2], uint64_t packed_const,
+                                    int blk_xy, int list)
+{
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%10sla_est_block_inter\n", "");
+    #endif
+
+    // Cached data. NOTE. 'ref' is not read here; the search always uses me->ref[0].
+    f265_enc_thread *t = enc->la.threads;
+    f265_me_ctx *me = &t->me;
+    f265_la_frame *la = &f->la;
+
+    // Context setup. Chroma is deactivated on purpose.
+    me->best_cost = 0x7fffffff;
+
+    // Generate the lookahead motion vector candidates (cands[0] is the MVP).
+    f265_mv cands[6];
+    f265_mv *blk_mvs = la->blk_mv[ro[0]][ro[1]];
+    int nb_cands = fenc_la_est_mv_cands(cands, blk_mvs + blk_xy, packed_const);
+
+    // Predicted motion vector setup.
+    f265_mv pmv = cands[0];
+    fenc_me_set_pmv(me, pmv, t->qp[0], 0);
+
+    // Early P-skip test (list 0 only) if the predicted motion vector is null.
+    int skip_flag = 0;
+    if ((!list) & (!me->ref[0].pmv.p))
+    {
+        int inter_cost = fenc_me_mv_total_cost(me->ref[0].pmv, me);
+        if (inter_cost <= 64) // Cheap enough: keep the PMV and skip the search.
+        {
+            skip_flag = 1;
+            me->best_cost = inter_cost;
+        }
+    }
+
+    // Inter search.
+    if (!skip_flag) fenc_early_me(me, cands, nb_cands, NULL);
+
+    // Best MV. Stored for this setup; later blocks read it as a neighbour candidate.
+    la->blk_mv[ro[0]][ro[1]][blk_xy] = me->ref[0].best_mv;
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%12sBest inter cost=%d\n", "", me->best_cost);
+    #endif
+
+    // FIXME. Add penalty?
+
+    // Cap out the cost to 14 bits. The 2 MSBs are used for the "mode".
+    int cost = F265_MIN(me->best_cost, (1 << 14) - 1);
+
+    // Check for best mode. It might happen that intra performs better in the
+    // current setup. If that's the case, replace the best found cost.
+    int mode = 1 + list; // 0: intra, 1: list 0, 2: list 1.
+    {
+        int intra_cost = la->blk_costs[0][0][blk_xy];
+        if (intra_cost < cost)
+        {
+            cost = intra_cost;
+            mode = 0;
+        }
+    }
+
+    // Update block/frame costs. The block entry packs the mode above the 14-bit cost.
+    la->blk_costs[ro[0]][ro[1]][blk_xy] = (mode << 14) | cost;
+    la->frame_costs[ro[0]][ro[1]] += cost;
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%12sKept inter cost=%d (mode=%d)\n", "", cost, mode);
+    printf("%12sCurrent setup cost=%lld\n", "", la->frame_costs[ro[0]][ro[1]]);
+    #endif
+}
+
+// Generate intra prediction samples using src. Test all 35 modes. Evaluate the
+// distortion using src_buf. The function assumes both 'src' and 'src_buf' use
+// the subsampled luma plane. The best cost is stored in the [0][0] (intra) setup.
+// NOTE. This function is tailored for 8x8 blocks.
+static void fenc_la_est_block_intra(f265_enc *enc, f265_frame *f, f265_pix *src, f265_pix src_buf[64], int blk_xy)
+{
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%10sla_est_block_intra\n", "");
+    #endif
+
+    // Cached data.
+    int bd = enc->gd.bit_depth[0];
+    int packed_bs = (8 << 8) | 8;
+    int stride = enc->gd.stride;
+    int filter_flag = 0; // Selects neighbours[0] below.
+    F265_ALIGN64 f265_pix neighbours[2][160];
+    F265_ALIGN64 f265_pix pred_buf[64];
+
+    // Cache the neighbour pixels of the current 8x8 block.
+    int packed = (bd << 8) | 8;
+    fenc_la_intra_neighbours(neighbours, src, stride, packed);
+
+    // Identify the best intra mode.
+    // For each mode, generate the prediction signal using the cached
+    // neighbours, then measure the fit using simple SAD.
+    int pure_intra_cost = F265_MAX_SAD;
+    for (int i = 0; i < 35; i++)
+    {
+        fenc_predict_intra_mode(pred_buf, neighbours[filter_flag], 3, bd, i, filter_flag);
+        int cost = fenc_fsad_c(src_buf, 8, pred_buf, 8, packed_bs);
+        pure_intra_cost = F265_MIN(cost, pure_intra_cost);
+
+        #ifdef F265_TRACE_LOOKAHEAD
+        printf("%12smode=%d, cost=%d\n", "", i, cost);
+        #endif
+    }
+
+    // FIXME. Apply penalty?
+
+    // Cap out the intra cost to 14 bits. The two MSBs need to be 00.
+    pure_intra_cost = F265_MIN(pure_intra_cost, (1 << 14) - 1);
+
+    // Update known costs (block and frame). The frame cost is accumulated, not reset, here.
+    f265_la_frame *la = &f->la;
+    la->blk_costs[0][0][blk_xy] = pure_intra_cost;
+    la->frame_costs[0][0] += pure_intra_cost;
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%12sKept intra cost=%d\n", "", la->blk_costs[0][0][blk_xy]);
+    printf("%12sCurrent intra cost=%lld\n", "", la->frame_costs[0][0]);
+    #endif
+}
+
+// Convenience function.
+// Setup the motion estimation context of thread 't' for an 8x8 block. Only refs[0] is used.
+// NOTE. Motion estimation is set in the subsampled planes.
+static inline void fenc_la_est_me_setup(f265_enc_thread *t, f265_frame *f, f265_frame *refs[2])
+{
+    f265_me_ctx *me = &t->me;
+
+    // Use subsampled planes.
+    me->src_planes[0] = f->sub_planes[0];
+    me->bit_depth[0] = t->enc->gd.bit_depth[0];
+
+    // Fixed 8x8 block. Make sure 8x8-tailored functions are used.
+    me->packed_dims[0] = (2 << 24) | (8 << 8) | 8;
+    me->dim[0] = me->dim[1] = 8;
+
+    // No chroma.
+    me->plane_off[1] = 0;
+    me->packed_dims[1] = 0;
+    me->csf[0] = me->csf[1] = 0;
+    me->chroma_flag = 0;
+    me->src_planes[1] = 0;
+    me->src_planes[2] = 0;
+
+    // Use SAD, and not SATD, since it's the metric used for intra.
+    int satd_flag = 0;
+    me->dist_func_id = satd_flag;
+
+    // Same lambda as the one used during the actual analysis: sqrt(hm_lambda[0]) * 256, rounded.
+    me->lambda = (int)(sqrt(t->hm_lambda[0]) * 256.0f + 0.5f);
+
+    // Use "default" early motion estimation params.
+    me->early_me_params = t->enc->gd.early_me_params;
+
+    // List 0 reference. Null vectors, default weights, subsampled planes only.
+    {
+        me->ref[0].pmv.p = 0;
+        me->ref[0].best_mv.p = 0;
+        me->ref[0].weights = f265_lt.wp_default;
+        me->ref[0].ref_planes[0] = refs[0]->sub_planes[0];
+        me->ref[0].ref_planes[1] = 0;
+        me->ref[0].ref_planes[2] = 0;
+        me->ref[0].planes[0] = refs[0]->sub_planes[0];
+        me->ref[0].planes[1] = refs[0]->sub_planes[1];
+        me->ref[0].planes[2] = refs[0]->sub_planes[2];
+        me->ref[0].planes[3] = refs[0]->sub_planes[3];
+        me->ref[0].planes[4] = 0;
+        me->ref[0].planes[5] = 0;
+        me->ref[0].planes[6] = 0;
+    }
+
+    // FIXME. List 1 reference? (refs[1] is never read.)
+
+    // Classic distortion functions. Index 0 is SAD, index 1 is SATD.
+    me->dist[0] = &fenc_sad_wrap;
+    me->dist[1] = &fenc_satd;
+}
+
+// Estimate the costs of each 8x8 block in the specified setup. NOTE. analysis_flags is not read here.
+static void fenc_la_est_blocks(f265_enc *enc, f265_frame *f, f265_frame *f0,
+                               f265_frame *f1, int analysis_flags[3], int nb_lists)
+{
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%8sla_est_blocks\n", "");
+    #endif
+
+    // Cached data. 'ro' holds the POC distances to the L0 and L1 references.
+    f265_gen_data *gd = &enc->gd;
+    f265_frame *refs[2] = { f0, f1 };
+    int ro[2] = { f->abs_poc - refs[0]->abs_poc,
+                  refs[1]->abs_poc - f->abs_poc };
+
+    // Picture dimensions.
+    // NOTE. Cost estimations use 8x8 blocks in the subsampled domain. There
+    //       are as many 8x8 blocks in the subsampled plane as there are 16x16
+    //       blocks in the regular luma plane.
+    int width = gd->pix_dim[0] >> 4, height = gd->pix_dim[1] >> 4;
+
+    // Motion estimation setup.
+    // FIXME. The code uses the first thread from the lookahead. This won't
+    //        work when MT is enabled.
+    f265_enc_thread *t = enc->la.threads;
+    fenc_la_est_me_setup(t, f, refs);
+
+    // Reset frame costs. The intra cost is always recomputed along with the requested setup.
+    f265_la_frame *la = &f->la;
+    la->frame_costs[0][0] = 0;
+    la->frame_costs[ro[0]][ro[1]] = 0;
+
+    // FIXME. The legacy code favored a long loop body to force inlining.
+    //        Consider doing it the same way once the lookahead works properly.
+    // Process every 8x8 block in reverse order to improve quality. The motion
+    // vectors found in the lookahead are used as candidates in the real
+    // encoding for bottom/right spatial prediction.
+    for (int blk_y = height - 1, blk_xy = width * height - 1; blk_y >= 0; blk_y--)
+    {
+        // Packed constants used to generate the lookahead MV candidates. blk_x is added per block.
+        uint64_t packed_const = 0;
+        packed_const |= (uint64_t)(width - 1) << 0;
+        packed_const |= (uint64_t)(height - 1) << 16;
+        packed_const |= (uint64_t)blk_y << 48;
+
+        for (int blk_x = width - 1; blk_x >= 0; blk_x--, blk_xy--)
+        {
+            // Copy the source from the subsampled plane.
+            F265_ALIGN64 f265_pix src_buf[64];
+            int blk_ox = blk_x * 8, blk_oy = blk_y * 8;
+            int plane_off = blk_oy * gd->stride + blk_ox;
+            f265_pix *src_plane = f->sub_planes[0] + plane_off;
+            fenc_copy_block(src_buf, 8, src_plane, gd->stride, 8, 8);
+
+            // Intra prediction. Must run first: the inter pass compares against this block's intra cost.
+            fenc_la_est_block_intra(enc, f, src_plane, src_buf, blk_xy);
+
+            // Motion estimation limits.
+            {
+                f265_me_ctx *me = &t->me;
+                me->plane_off[0] = plane_off;
+                me->pos[0] = blk_ox;
+                me->pos[1] = blk_oy;
+                int r = width * 8 - blk_ox, b = height * 8 - blk_oy; // Distance to the right/bottom edge.
+                int range = (F265_MAX_MV_FPEL - F265_SEARCH_OOB) >> 1;
+                int min_border = (F265_LUMA_PLANE_PADDING - 4 - F265_SEARCH_OOB) >> 1;
+                int max_border = min_border - 8;
+                me->me_bounds[0] = -(F265_MIN(range, blk_ox + min_border)<<2);
+                me->me_bounds[1] = -(F265_MIN(range, blk_oy + min_border)<<2);
+                me->me_bounds[2] =  (F265_MIN(range, r + max_border)<<2);
+                me->me_bounds[3] =  (F265_MIN(range, b + max_border)<<2);
+                me->me_bounds_packs[0] = fenc_pack_mv_range(-me->me_bounds[0],
+                                                            -me->me_bounds[1]);
+                me->me_bounds_packs[1] = fenc_pack_mv_range(-me->me_bounds[2],
+                                                            -me->me_bounds[3]);
+            }
+
+            // Inter, single prediction. Every list updates the same [ro[0]][ro[1]] setup entry.
+            for (int list = 0; list < nb_lists; list++)
+                fenc_la_est_block_inter(enc, f, refs[list], ro, packed_const | (uint64_t)blk_x << 32, blk_xy, list);
+
+            // Inter, bi-prediction. TODO.
+        }
+    }
+}
+
+// Estimate the cost of encoding the frame in the specified setup. As a
+// performance optimization, we usually analyze the intra setup at the same
+// time as an inter setup. The adaptive quantization (AQ) is performed after
+// the intra setup is analyzed.
+//
+// f: current frame, f0/f1: L0/L1 reference (set to 'f' if unused).
+static void fenc_la_est_frame(f265_enc *enc, f265_frame *f, f265_frame *f0, f265_frame *f1)
+{
+    f265_la_frame *la = &f->la;
+    f265_frame *refs[2] = { f0, f1 };
+
+    // Compute the reference frame offsets (zero when the reference is 'f' itself).
+    int ro[2] = { f->abs_poc - refs[0]->abs_poc,
+                  refs[1]->abs_poc - f->abs_poc };
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%6sla_est_frame\n", "");
+    printf("%8sPOC0: %lld, POC: %lld, POC1: %lld\n", "", f0->abs_poc,
+           f->abs_poc, f1->abs_poc);
+    printf("%8sSetup cost = %lld\n", "", la->frame_costs[ro[0]][ro[1]]);
+    #endif
+
+    // Bail if the requested setup was already tested. An all-ones cost marks an untested setup.
+    if (la->frame_costs[ro[0]][ro[1]] != (uint64_t)(-1)) return;
+
+    // Initialize the setups. The per-block analysis function updates the
+    // requested setup cost, and the intra setup cost if it has not been
+    // computed (performance optimization). The other setup costs are not
+    // updated (performance optimization).
+
+    // True if the intra/L0/L1 analysis must be done.
+    int analysis_flags[3] = { 0, 0, 0 };
+
+    // Intra. Must be sampled before the block analysis overwrites the intra cost.
+    analysis_flags[0] = f->la.frame_costs[0][0] == (uint64_t)(-1);
+
+    // Inter.
+    // NOTE. We assume that the L0 reference is present if the L1 reference is
+    //       present.
+    assert(!ro[1] || ro[0]);
+    int nb_lists = !!ro[0] + !!ro[1];
+    for (int list = 0; list < nb_lists; list++)
+    {
+      // Offset in the bitmap (reference frame offset minus 1).
+      int bit = 1 << (ro[list] - 1);
+
+      // Only run test if it hasn't been done before.
+      analysis_flags[1 + list] = !(la->ref_tests[list] & bit);
+
+      // Indicate that the test will occur.
+      la->ref_tests[list] |= bit;
+    }
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%8sanalysis flags: Intra: %d, L0: %d, L1: %d\n", "",
+           analysis_flags[0], analysis_flags[1], analysis_flags[2]);
+    #endif
+
+    // Estimate the costs of the blocks.
+    fenc_la_est_blocks(enc, f, f0, f1, analysis_flags, nb_lists);
+
+    // FIXME. Consider the general flags F265_PF_AQ and F265_PF_MB_FLOW.
+    // Perform AQ as needed. The weighted prediction and block flow use the AQ
+    // results implicitly. If the AQ QP offsets are not needed, we set the AQ
+    // strength to zero, but we still perform the AQ pass.
+    if (analysis_flags[0]) // AQ runs once per frame, when the intra setup is first analyzed.
+    {
+        fenc_la_aq(enc, f);
+    }
+}
+
+// Derive the QP offset of every 16x16 block from its own and its inherited complexity.
+static void fenc_la_blk_flow_add_qp(f265_la_frame *la, int32_t nb_blks)
+{
+    // FIXME. This needs to be a parameter to the function.
+    float strength = 0.4f;
+
+    // Per-block inputs (own and inherited complexity) and output.
+    uint16_t *own = la->blk_prop_add;
+    uint16_t *inherited = la->blk_prop_total;
+    float *qp_off = la->blk_qp_off;
+
+    // Improving the fidelity of this block reduces the encoding cost of
+    // future blocks linearly by (base_cmpl+prop_cmpl)/base_cmpl.
+    // Mapping to a QP offset corresponds to the log of this value. We use
+    // saturation for performance.
+    for (int i = 0; i < nb_blks; i++)
+    {
+        int32_t base_cmpl = own[i];
+        int32_t tot_cmpl = base_cmpl + inherited[i];
+        if (tot_cmpl > 65535) tot_cmpl = 65535;
+        qp_off[i] = strength * (fenc_la_log2f(tot_cmpl) - fenc_la_log2f(base_cmpl));
+    }
+}
+
+// Convenience function.
+// Extract a 64-bit bitfield at some bit offset.
+//
+//                 [bitfield]
+//                 [BBBBAAAA]
+//            [bbbBBBBB][AAAaaaaa]
+//              [ bf 1 ][bf 0]
+//
+// The implementation depends on how the hardware is broken. The simple,
+// efficient solution is to load two adjacent aligned 64-bit integers, shift
+// them appropriately with the offset and OR them together. However, the result
+// of a shift by the field length (e.g. 64) is hardware-dependent, and that
+// case happens if the offset is 0. On intel, shl is fast but incorrect, psll
+// is correct but slow, and using a branch is slow. We end up using unaligned
+// loads, while catering for the hardware that doesn't support it.
+static inline uint64_t fenc_la_blk_flow_extract_bitfield(uint8_t *blk_bf, int boff)
+{
+    uint64_t b, a = blk_bf[0]; // 'a' is the first byte; 'b' the 8 bytes that follow it.
+
+    #ifdef F265_ARCH_SSE4_TMP
+    b = *(uint64_t*)(blk_bf + 1);
+    #else
+    memcpy(&b, blk_bf + 1, 8);
+    #endif
+
+    return (a >> boff) | (b << (8 - boff)); // Reads 9 bytes in total; assumes 0 <= boff <= 7 -- TODO confirm.
 }
 
+// Convenience function. blk_tot, blk_amount and blk_mv point to the first block of their frame.
+// Run block flow on up to 32 consecutive 16x16 blocks on the same row. 'bf' holds 2 bits per block.
+static void fenc_la_blk_flow_distribute_batch(uint16_t *blk_tot, uint32_t *blk_amount, f265_mv *blk_mv,
+                                              uint64_t packed_const, uint32_t packed_off, uint64_t bf)
+{
+    // Unpack the data. packed_off is (blk_y << 16) | blk_x of the first block in the batch.
+    int width = (uint16_t)packed_const;
+    int height = (uint16_t)(packed_const >> 16);
+    int bweight = (uint16_t)(packed_const >> 32);
+    int list = (uint16_t)(packed_const >> 48);
+    int blk_x = (uint16_t)packed_off;
+    int blk_y = (uint16_t)(packed_off >> 16);
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%14sla_blk_flow_distribute_batch\n", "");
+    #endif
+
+    // Loop over the 16x16 blocks in the batch.
+    for (int i = 0; i < 32; i++, blk_x++)
+    {
+        // The block does not use the current list, skip it.
+        if (!(bf & (1ull << (2 * i + list)))) continue;
+
+        // Get the propagation amount and the motion vector of the block at (blk_x, blk_y) in the frame.
+        int amount = blk_amount[blk_y * width + blk_x];
+        int mv_x = blk_mv[blk_y * width + blk_x].x;
+        int mv_y = blk_mv[blk_y * width + blk_x].y;
+
+        // Scale the amount by the bi-pred weight if both lists are used.
+        uint64_t bmask = 3ull << (2 * i);
+        if ((bf & bmask) == bmask) amount = (amount * bweight + 32) >> 6;
+
+        // In general, a MV "splashes" on 4 blocks. Compute the weight of each
+        // block in units of 1/1024. There are 32x32=1024 quarterpels in an 8x8
+        // block.
+        int qx = mv_x & 31;
+        int qy = mv_y & 31;
+        int wxy[4] = { (32 - qy) * (32 - qx), (32 - qy) * qx,
+                       qy * (32 - qx), qy * qx };
+
+        // Distribute the weight on the blocks that are within the frame.
+        int pos_offsets[4][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } };
+        for (int j = 0; j < 4; j++)
+        {
+            int px = blk_x + (mv_x >> 5) + pos_offsets[j][0];
+            int py = blk_y + (mv_y >> 5) + pos_offsets[j][1];
+            int pxy = py * width + px;
+
+            if (px >= 0 && py >= 0 && px < width && py < height)
+            {
+                // Saturate and round down for performance.
+                int distrib = F265_MIN((amount * wxy[j]) >> 10, 65535);
+                blk_tot[pxy] = F265_MIN(blk_tot[pxy] + distrib, 65535);
+
+                #ifdef F265_TRACE_LOOKAHEAD
+                printf("%16sblk_tot[%d]=%d\n", "", pxy, blk_tot[pxy]);
+                #endif
+            }
+        }
+    }
+}
+
+// Distribute the 16x16 block flow propagation amounts for one reference list.
+// The algorithm processes blocks one row at a time in batches of up to 32
+// blocks for performance. The amounts of 'cur' are added to the totals of 'ref'.
+static void fenc_la_blk_flow_distribute_list(f265_enc *enc, f265_la_frame *cur, f265_la_frame *ref,
+                                             uint32_t *blk_amount, int ro[2], int32_t list, int32_t bweight)
+{
+    // Cached data.
+    f265_gen_data *gd = &enc->gd;
+    uint16_t *blk_ref_tot = ref->blk_prop_total;
+    f265_mv *blk_mv = cur->blk_mv[ro[0]][ro[1]];
+    uint8_t *blk_bf = cur->blk_prop_bf;
+    int width = gd->pix_dim[0] >> 4;
+    int height = gd->pix_dim[1] >> 4;
+
+    // Pack the constant data used by all batches.
+    uint64_t packed_const = 0;
+    packed_const |= (uint64_t)width << 0;
+    packed_const |= (uint64_t)height << 16;
+    packed_const |= (uint64_t)bweight << 32;
+    packed_const |= (uint64_t)list << 48;
+
+    // Offset of the current batch in the current byte of the bitfield.
+    int boff = 0;
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%12sla_blk_flow_distribute_list\n", "");
+    #endif
+
+    // Process every row.
+    for (int blk_y = 0; blk_y < height; blk_y++)
+    {
+        // Each row is processed in batches of up to 32 blocks (64 bits of the bitfield).
+        int blk_x = 0;
+        for (int i = 0; i < width >> 5; i++, blk_x += 32, blk_bf += 8)
+        {
+            uint64_t bf = fenc_la_blk_flow_extract_bitfield(blk_bf, boff);
+            fenc_la_blk_flow_distribute_batch(blk_ref_tot, blk_amount, blk_mv,
+                                              packed_const, (blk_y << 16) | blk_x, bf);
+        }
+
+        // End the line with a partial batch. Keep only the leftover bits of the bitfield.
+        // There are none when the width is a multiple of 32: never shift by 64 (undefined).
+        int leftover_bits = (width & 31) << 1;
+        int shift = 64 - leftover_bits;
+        uint64_t bf = leftover_bits ? fenc_la_blk_flow_extract_bitfield(blk_bf, boff) : 0;
+        if (leftover_bits) bf = (bf << shift) >> shift;
+        fenc_la_blk_flow_distribute_batch(blk_ref_tot, blk_amount, blk_mv,
+                                          packed_const, (blk_y << 16) | blk_x, bf);
+
+        // Update the bitfield offsets (wrap around 8).
+        blk_bf += (boff + leftover_bits) >> 3;
+        boff = (boff + leftover_bits) & 7;
+    }
+}
+
+// Compute the amount each 16x16 block propagates: its base amount plus the
+// amount received from subsequent frames, scaled by the block's propagation
+// multiplication factor. The results are written to blk_amount.
+static void fenc_la_blk_flow_set_amount(uint32_t *blk_amount, f265_la_frame *la, int32_t nb_blks)
+{
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%12sla_blk_flow_set_amount\n", "");
+    #endif
+
+    // Per-block inputs.
+    uint16_t *received = la->blk_prop_total;
+    uint16_t *own = la->blk_prop_add;
+    uint16_t *factor = la->blk_prop_mult;
+
+    // Saturate the sum to 16 bits, then apply the factor (fixed point, 15 fractional bits).
+    // NOTE. 16-bit saturation is used to speed up the assembly code.
+    for (int blk_xy = 0; blk_xy < nb_blks; blk_xy++)
+    {
+        uint32_t mult = factor[blk_xy];
+        uint32_t sum = (uint32_t)received[blk_xy] + own[blk_xy];
+        if (sum > 65535) sum = 65535;
+        blk_amount[blk_xy] = (sum * mult) >> 15;
+
+        #ifdef F265_TRACE_LOOKAHEAD
+        printf("%14sblk_amount[%d]=%d\n", "", blk_xy, blk_amount[blk_xy]);
+        #endif
+    }
+}
+
+// Evaluate each 16x16 block's multiplication factor and bitfield for setup [ro[0]][ro[1]].
+static void fenc_la_blk_flow_set_mult_bf(f265_la_frame *la, int32_t ro[2], int32_t nb_blks)
+{
+    // Cached data.
+    uint16_t *blk_intra = la->blk_costs[0][0];
+    uint16_t *blk_inter = la->blk_costs[ro[0]][ro[1]];
+    uint16_t *blk_mult = la->blk_prop_mult;
+    uint8_t *blk_bf = la->blk_prop_bf;
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%12sla_blk_flow_set_mult_bf\n", "");
+    #endif
+
+    for (int blk_xy = 0; blk_xy < nb_blks; blk_xy++)
+    {
+        // In the following, the 'inter' and 'intra' costs refer to the costs
+        // in the inter/intra setups respectively. The propagation ratio is
+        // (intra-inter)/intra, a value in the range [0,1]. The algorithm
+        // performs well with low numerical accuracy, so we approximate the
+        // division and convert the ratio to integer with ~15 bits precision.
+        //
+        // Empirical tests with other metrics than inter/intra, such as square
+        // roots and logs, have been inconclusive. Any metric that is sane near
+        // its extrema seems to perform adequately. Some videos benefit from
+        // high propagation ratios while other benefit from low ratios. There
+        // are no clear tendencies so we use the simplest algorithm that works.
+        int intra = blk_intra[blk_xy];
+        int inter = blk_inter[blk_xy] & 16383; // Low 14 bits: the cost. High 2 bits: the mode.
+        blk_mult[blk_xy] = (uint32_t)(fenc_la_divf(intra - inter, intra) * 32768.0f); // NOTE(review): intra == 0?
+
+        // Update the bitfield. Four blocks are packed per byte, 2 bits each.
+        int list = blk_inter[blk_xy] >> 14;
+        int boff = (blk_xy & 3) << 1;
+        blk_bf[blk_xy >> 2] = (blk_bf[blk_xy >> 2]&~(3 << boff)) | (list << boff);
+
+        #ifdef F265_TRACE_LOOKAHEAD
+        printf("%14sblk_mult[%d]=%d, blk_bf[%d]=%d\n", "",
+               blk_xy, blk_mult[blk_xy], blk_xy >> 2, blk_bf[blk_xy >> 2]);
+        #endif
+    }
+}
+
+// Propagate the costs of the current frame to the reference frames. The
+// reference frames are set to the current frame if unused.
+static void fenc_la_blk_flow_propagate(f265_enc *enc, f265_frame *f, f265_frame *f0, f265_frame *f1, int nb_blks)
+{
+    // Cached data.
+    f265_la_frame *la = &f->la;
+
+    // Compute the reference frame offsets (zero when the reference is 'f' itself).
+    f265_frame *refs[2] = { f0, f1 };
+    int ro[2] = { f->abs_poc - refs[0]->abs_poc,
+                  refs[1]->abs_poc - f->abs_poc };
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%10sla_blk_flow_propagate\n", "");
+    #endif
+
+    // Bi-prediction weights. Both are currently zero.
+    int bweights[2] = { 0, 0 };
+
+    // Compute the propagation multiplication factors and bitfields if they are
+    // not already in the cache. The cache is keyed on the packed reference offsets.
+    uint16_t setup = (ro[0] << 8) | ro[1];
+    if (la->blk_prop_setup != setup)
+    {
+        la->blk_prop_setup = setup;
+        fenc_la_blk_flow_set_mult_bf(la, ro, nb_blks);
+    }
+
+    // Store the propagation amounts in the store buffer of the first lookahead thread.
+    f265_enc_thread *t = enc->la.threads;
+    uint32_t *blk_amount = (uint32_t*)t->store;
+    t->store += nb_blks * sizeof(uint32_t);
+    fenc_la_blk_flow_set_amount(blk_amount, la, nb_blks);
+
+    // Distribute the block flow propagation amounts to each reference that is in use.
+    for (int list = 0; list < 2; list ++)
+        if (ro[list])
+            fenc_la_blk_flow_distribute_list(enc, la, &refs[list]->la, blk_amount, ro, list, bweights[list]);
+
+    // Release the borrowed memory from the store.
+    t->store = (uint8_t*)blk_amount;
+}
+
+// Run the block flow algorithm on the sequence of 'seq' frames (reads frames[seq-1], so seq >= 1).
+// NOTE. Performs AQ implicitly.
+static void fenc_la_blk_flow(f265_enc *enc, f265_frame **frames, int32_t seq)
+{
+    // Number of 16x16 blocks in the picture.
+    f265_gen_data *gd = &enc->gd;
+    int width = gd->pix_dim[0] >> 4, height = gd->pix_dim[1] >> 4;
+    int nb_blks = width * height;
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%8sla_blk_flow\n", "");
+    #endif
+
+    // Clear the propagation costs of the last frame.
+    memset(frames[seq-1]->la.blk_prop_total, 0, nb_blks * sizeof(uint16_t));
+
+    // Analyze the frames, last-to-first. Each frame propagates to the frame just before it.
+    for (int i = seq - 1; i; i--)
+    {
+        // Analyze the propagation setup: 'fp' predicted from 'fr' in list 0, no list 1 reference.
+        f265_frame *fr = frames[i-1];
+        f265_frame *fp = frames[i];
+        fenc_la_est_frame(enc, fp, fr, fp);
+
+        // Clear the propagation costs of the receiving frame.
+        memset(fr->la.blk_prop_total, 0, nb_blks * sizeof(uint16_t));
+
+        // Propagate the costs to the receiving frame.
+        fenc_la_blk_flow_propagate(enc, fp, fr, fp, nb_blks);
+    }
+
+    // Process the first frame.
+    f265_frame *f = frames[0];
+    f265_la_frame *la = &f->la;
+
+    // Add the QP deltas.
+    // FIXME. Add la_algo_strength to params.
+    fenc_la_blk_flow_add_qp(la, nb_blks);
+}
+
+
 // Recursively store the B frames in coded order. The frames are stored in a
 // B-pyramid while the supply of B reference frames lasts.
 void fenc_reorder_b_frames(f265_frame **out, f265_frame **in, int nb_in, int nb_b_refs)
 {
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%8sreorder_b_frames: nb_b_refs=%d.\n", "", nb_b_refs);
+    #endif
+
     // No more B references or not enough frames to add another B-pyramid level.
     // Store frames in order, using any remaining B references.
     if (!nb_b_refs || nb_in <= 2)
@@ -130,10 +1158,25 @@ void fenc_reorder_b_frames(f265_frame **out, f265_frame **in, int nb_in, int nb_
 // Process the received frame in regular lookahead. Return the output frame, if
 // any. We process the buffered frames as soon as possible. For real-time
 // processing this can help to distribute the CPU load evenly.
-f265_frame* fenc_la_process_frame_regular(f265_enc *enc, f265_frame *in)
+static f265_frame* fenc_la_process_frame_regular(f265_enc *enc, f265_frame *in)
 {
+    f265_gen_data *gd = &enc->gd;
     f265_lookahead *la = &enc->la;
 
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%4sla_process_frame_regular\n", "");
+
+    // A new frame came in.
+    if (in)
+    {
+        printf("%6sBuffering received frame in display queue at pos %d.\n",
+               "", 1 + la->nb_undecided);
+    }
+
+    // No input. Start flushing.
+    else printf("%6sFlushing the lookahead.\n", "");
+    #endif
+
     // Buffer the input frame.
     if (in) la->display[1 + la->nb_undecided++] = in;
 
@@ -143,28 +1186,43 @@ f265_frame* fenc_la_process_frame_regular(f265_enc *enc, f265_frame *in)
         // Current frame to decide.
         f265_frame *fc = la->display[1];
 
+        // Unification frame, if any. This is the P/I frame in the last GOP.
+        f265_frame *fu = la->display[0];
+
+        #ifdef F265_TRACE_LOOKAHEAD
+        if (fu)
+        {
+            printf("%6sUnification POC %lld, current POC %lld.\n", "", 
+                   fu->abs_poc, fc->abs_poc);
+        }
+        else
+        {
+            printf("%6sNo unification frame currently (POC=%lld).\n", "",
+                   fc->abs_poc);
+        }
+        #endif
+
         // Minimum/maximum key frame interval. Arbitrary.
         int keyint_max = la->key_interval;
         int keyint_min = F265_MAX(F265_MIN(keyint_max>>3, 30), 1);
 
-        // Number of frames between fc and the next key frame.
+        // The number of frames between fc and the next/previous key frame.
         int next_key_dist = la->key_countdown - 1;
-
-        // Number of frames between fc and the previous key frame.
         int prev_key_dist = la->key_interval - la->key_countdown + 1;
 
-        // Size of the current group of pictures (P or I frame followed by 0 or
-        // more B frames).
-        int nb_gop = 0;
+        // Key frames are forced every N frames for synchronisation purposes.
+        // We make sure that request is enforced here.
+        int key_flag = !next_key_dist;
 
-        // True if the leading frame in the current GOP is an intra/key frame.
-        int i_flag = 0, key_flag = 0;
+        // Key frames only use intra coding. Intra also has to be used when
+        // temporal references are deactivated on the command line.
+        int i_flag = key_flag | !gd->nb_refs;
 
-        // Key frame type, if the leading frame is used as a key frame.
+        // A key type of 0 is a CRA. A key type of 1 is an IDR.
         int key_type = la->key_type;
 
-        // Import the key frame status and type (if it is already set) from the
-        // external interface.
+        // Import the key frame status and type (if it is already set)
+        // from the external interface.
         if (F265_GET_FLAG(fc->gen_flags, F265_FF_KEY))
         {
             i_flag = key_flag = 1;
@@ -172,29 +1230,62 @@ f265_frame* fenc_la_process_frame_regular(f265_enc *enc, f265_frame *in)
             else if (F265_GET_FLAG(fc->gen_flags, F265_FF_IDR)) key_type = 1;
         }
 
-        // Enforce the key frame interval for the current frame.
-        if (!next_key_dist) i_flag = key_flag = 1;
+        #ifdef F265_TRACE_LOOKAHEAD
+        if (key_flag) printf("%6sA key frame has been requested.\n", "");
+        else if (i_flag) printf("%6sIntra coding has been requested.\n", "");
+        #endif
 
-        // Enforce use of intra frames.
-        i_flag |= !enc->gd.nb_refs;
+        // Use the lookahead.
+        if (gd->eflags&F265_PF_LA)
+        {
+            // If the frame is not intra, we have to analyze the relation
+            // between the current frame and the unification frame (unless only
+            // AQ is enabled). In all cases we have to analyze the intra setup.
+            // For simplicity we perform this step up front. AQ is done here as
+            // needed.
+            fenc_la_est_frame(enc, fc, i_flag ? fc : fu, fc);
 
-        // Use the lookahead (future support).
+            // Scene cut test.
+            if (!i_flag)
+            {
+                i_flag = fenc_la_detect_scene_cut(fc, prev_key_dist, keyint_min, keyint_max);
+
+                #ifdef F265_TRACE_LOOKAHEAD
+                if (i_flag) printf("%6sA scene cut has been detected.\n", "");
+                #endif
+            }
+
+            // Block flow. Analyze up to the next key frame, or the lookahead
+            // end. next_key_dist is 0 in the case of a key frame, so
+            // in that case, we analyse up to the key interval.
+            // FIXME. Consider F265_PF_MB_FLOW flag?
+            {
+                int seq = F265_MIN(next_key_dist ? next_key_dist : keyint_max, la->nb_undecided);
+                fenc_la_blk_flow(enc, la->display + 1, seq);
+            }
+        }
 
         // Convert I to key frame if the GOP size is higher than the minimum.
         // FIXME: enable when we don't need HM compatibility.
         #if 0
         key_flag |= (i_flag && prev_key_dist >= keyint_min);
-        #else
-        if (prev_key_dist + keyint_min) {}
         #endif
 
         // Compute the number of B frames naively. Enforce the B frame limit,
-        // the key frame interval limit, the frame type and the key frame flags.
-        // We include the key frame if the GOP is open and there are no
-        // interfering key frames. Should probably be done earlier eventually.
+        // the key frame interval limit, the frame type, and the key frame
+        // flags. We include the key frame if the GOP is open and there are no
+        // interfering key frames.
+        // FIXME. Should probably be done earlier eventually.
         int open_gop_flag = !key_flag && key_type == 0;
-        int nb_seq_b = (!i_flag)*F265_MIN(F265_MIN(la->nb_undecided - 1, la->nb_b_frames),
-                                          next_key_dist + open_gop_flag - 1);
+        int nb_seq_b = 0;
+        if (!i_flag)
+        {
+            int b_limit = F265_MIN(la->nb_undecided - 1, la->nb_b_frames);
+            nb_seq_b = F265_MIN(b_limit, next_key_dist + open_gop_flag - 1);
+        }
+
+        // Check for interfering key frames. Reduce consecutive B frame count
+        // if one is found.
         for (int i = 0; i < nb_seq_b; i++)
         {
             if (la->display[1+i]->gen_flags&F265_FF_KEY)
@@ -203,19 +1294,28 @@ f265_frame* fenc_la_process_frame_regular(f265_enc *enc, f265_frame *in)
                 break;
             }
         }
-        nb_gop = nb_seq_b + 1;
         if (nb_seq_b == next_key_dist) key_flag = 1;
 
+        // Add an I or P frame to the consecutive series of B frames.
+        int nb_gop = nb_seq_b + 1;
+
+        #ifdef F265_TRACE_LOOKAHEAD
+        printf("%6sSet consecutive B frames to %d.\n", "", nb_seq_b);
+        printf("%6sGOP size will be %d.\n", "", nb_gop);
+        printf("%6s%d frame(s) are currently waiting in the coded queue.\n",
+               "", la->nb_committed);
+        #endif
+
         // Write the GOP.
         f265_frame **p = la->coded + la->nb_committed;
 
         // Leading GOP frame.
         p[0] = la->display[1+nb_seq_b];
         p[0]->la_frame_type = i_flag ? F265_FRAME_I : F265_FRAME_P;
-        F265_SET_FLAG(p[0]->gen_flags, F265_FF_REF, !!enc->gd.nb_refs);
+        F265_SET_FLAG(p[0]->gen_flags, F265_FF_REF, !!gd->nb_refs);
 
         // Reorder the B frames.
-        fenc_reorder_b_frames(p + 1, la->display + 1, nb_seq_b, enc->gd.nb_b_refs);
+        fenc_reorder_b_frames(p + 1, la->display + 1, nb_seq_b, gd->nb_b_refs);
 
         // Commit the current GOP if there are no committed frames and the
         // lookahead is full or flushing.
@@ -250,6 +1350,12 @@ f265_frame* fenc_la_process_frame_regular(f265_enc *enc, f265_frame *in)
 
             // Update the unification frame.
             la->display[0] = p[0];
+
+            #ifdef F265_TRACE_LOOKAHEAD
+            printf("%6s%d frames until the next key frame.\n", "", la->key_countdown);
+            printf("%6s%d frames have been committed.\n", "", la->nb_committed);
+            printf("%6s%d frames remain undecided.\n", "", la->nb_undecided);
+            #endif
         }
     }
 
@@ -259,50 +1365,65 @@ f265_frame* fenc_la_process_frame_regular(f265_enc *enc, f265_frame *in)
     // Retrieve the output frame from the coded array if there are committed
     // frames.
     if (la->nb_committed)
-    {
         out = fenc_frame_queue_pop(la->coded, &la->nb_committed);
-    }
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    if (in && !out)
+        printf("%6sNo frame outputted. Lookahead is still buffering.\n", "");
+    #endif
 
     return out;
 }
 
-// This function is executed by the primary lookahead thread to analyze frames.
-// The mutex is locked on entry and on exit.
-void fenc_la_process_frames(f265_enc *enc)
+// Drain the lookahead's input queue into its output queue. Every frame is
+// passed through fenc_la_process_frame_regular(), which may return null. If
+// the lookahead is active, the subsampled planes of the frame are built first.
+// FIXME. Incorporate multithreading eventually.
+static void fenc_la_process_frames(f265_enc *enc)
 {
+    // Cached data. These values do not change while the queue is drained.
+    f265_gen_data *gd = &enc->gd;
     f265_lookahead *la = &enc->la;
+    int32_t stride = gd->stride;
+    int32_t half_dim[2] = { gd->pix_dim[0] >> 1, gd->pix_dim[1] >> 1 };
+    int32_t br = half_dim[0] - 1, bb = half_dim[1] - 1; // Last column/row of the half-size planes.
+    int32_t pad = F265_LUMA_PLANE_PADDING>>1; // Half of the luma plane padding.
+    int32_t la_flag = gd->eflags & F265_PF_LA;
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("%2sla_process_frames\n", "");
+    #endif
 
-    // Loop until all the frames were processed.
+    // Empty the input queue. One output entry is queued per input entry.
     while (la->in_queue_len)
     {
-        // Pop the frame from the input queue.
+        // Pop the frame from the input queue and decrement its count.
         f265_frame *in = fenc_frame_queue_pop(la->in_queue, &la->in_queue_len);
 
-        // Compute the subsampled planes. FIXME.
-        #if 0
-        if (in && (gd->eflags&F265_PF_LA))
+        // When the lookahead is active, build the four half-resolution planes
+        // (1/4 of the luma area each) from the input luma plane, then pad them.
+        if (in && la_flag)
         {
-            // Interpolate the planes.
-            fenc_compute_subsampled(in->sub_planes, in->src_planes[0], gd->stride, gd->mb_size[0], gd->mb_size[1]);
+            #ifdef F265_TRACE_LOOKAHEAD
+            printf("%4sDownscale %dx%d to %dx%d, and pad\n", "",
+                   gd->pix_dim[0], gd->pix_dim[1], half_dim[0], half_dim[1]);
+            #endif
 
-            // Pad the subsampled planes.
-            int32_t br = (gd->pix_size[0]>>1) - 1, bb = (gd->pix_size[1]>>1) - 1;
-            int32_t pad = F265_LUMA_PLANE_PADDING>>1;
-            for (int i = 0; i < 4; i++) fenc_pad_plane(in->sub_planes[i], gd->stride, 0, br, 0, bb, pad, pad, pad, pad);
+            fenc_compute_subsampled(in->sub_planes, in->src_planes[0], stride, half_dim[0], half_dim[1]);
+            for (int i = 0; i < 4; i++) fenc_pad_plane(in->sub_planes[i], stride, 0, br, 0, bb, pad, pad, pad, pad);
         }
-        #endif
 
         // Process the frame and get the output frame, which can be null.
         f265_frame *out = fenc_la_process_frame_regular(enc, in);
 
-        // Add the frame in the output queue and notify the main thread.
+        // Add the frame to the output queue. Null outputs are queued as well.
         la->out_queue[la->out_queue_len++] = out;
     }
 }
 
-// Add a frame to the lookahead and return the next frame to encode. The input
-// frame is null when the lookahead is being flushed. The output frame is null
-// when the frames are being buffered.
+// Add a frame to the lookahead and return the next frame to encode.
+// NOTE. The input frame is null when the lookahead is being flushed.
+//       The output frame is null when the frames are being buffered.
 f265_frame* fenc_la_add_frame(f265_enc *e, f265_frame *in)
 {
     f265_gen_data *gd = &e->gd;
@@ -310,41 +1431,140 @@ f265_frame* fenc_la_add_frame(f265_enc *e, f265_frame *in)
     f265_lookahead *la = &e->la;
     int mt_flag = e->gd.nb_workers[1];
 
+    // FIXME. Incorporate multithreading eventually. For now, assume mt_flag
+    //        will always be equal to 0. See "Run the lookahead".
+
+    #ifdef F265_TRACE_LOOKAHEAD
+    printf("la_add_frame\n");
+    #endif
+
     // Add the frame to the input queue.
     la->in_queue[la->in_queue_len++] = in;
+
     if (in) md->nb_la_frames++;
 
     // Run the lookahead.
-    if (!mt_flag) fenc_la_process_frames(e);
+    if (!mt_flag || 1)
+    {
+        fenc_la_process_frames(e);
+    }
 
     // Read a frame from the output queue. The frame can be null.
     f265_frame *out = fenc_frame_queue_pop(la->out_queue, &la->out_queue_len);
-    if (out) md->nb_la_frames--;
 
-    // No output frame.
+    // No output frame. Bail.
     if (!out) return NULL;
 
+    // A frame was removed from the buffer. Decrement count.
+    md->nb_la_frames--;
+
     // If the frame is not a B frame, then it might still be used for reference
     // in the lookahead as the last frame of the last group of pictures.
     // FIXME: revisit that assumption, set flag status in lookahead.
     if ((gd->eflags&F265_PF_LA) && out->la_frame_type != F265_FRAME_B)
     {
-        // Clear the previous frame used for lookahead reference. The memory is
-        // only released if the frame was encoded.
+        // Clear the previous frame used for lookahead reference.
         if (md->la_ref_frame)
         {
             F265_SET_FLAG(md->la_ref_frame->main_flags, F265_FF_LA_REF, 0);
-            if (md->la_ref_frame->main_flags&F265_FF_ENC) fenc_release_frame_mem(e, md->la_ref_frame);
+
+            // Release the memory only if the frame was encoded.
+            if (md->la_ref_frame->main_flags&F265_FF_ENC)
+                fenc_release_frame_mem(e, md->la_ref_frame);
         }
         md->la_ref_frame = out;
     }
 
     // Mark the frame as no longer used for reference by the lookahead.
-    else
-    {
-        F265_SET_FLAG(out->main_flags, F265_FF_LA_REF, 0);
-    }
+    else F265_SET_FLAG(out->main_flags, F265_FF_LA_REF, 0);
 
     return out;
 }
 
+// Build a lookahead input frame from md->req: link buffers, set flags/POC/timing, copy and pad the planes.
+f265_frame* fenc_la_make_frame(f265_enc *e)
+{
+    f265_gen_data *gd = &e->gd;
+    f265_main_data *md = &e->md;
+    f265_enc_req *req = md->req;
+
+    // Take an unused frame object, then link the source and lookahead objects.
+    f265_frame *f = (f265_frame*)fenc_get_unused_obj(e, F265_UNUSED_FRM_OBJ);
+    f->mem_buf[0] = (uint8_t*)f;
+    fenc_link_src_obj(e, f);
+    fenc_link_la_obj(e, f);
+    f->mem_buf[3] = NULL;
+
+    // Reset the frame flags.
+    f->main_flags = f->gen_flags = 0;
+
+    // The frame is used for reference in the lookahead.
+    f->main_flags |= F265_FF_LA_REF;
+
+    // Set the absolute picture order count (post-incrementing the counter).
+    f->abs_poc = md->abs_poc_counter++;
+
+    // Set the timestamp and the duration from the request input.
+    f->timestamp = req->input->timestamp;
+    f->duration = req->input->duration;
+
+    // Force key frame if requested. A forced IDR also sets the key flag.
+    F265_SET_FLAG(f->gen_flags, F265_FF_KEY, req->force_key_flag|req->force_idr_flag);
+    F265_SET_FLAG(f->gen_flags, F265_FF_IDR, req->force_idr_flag);
+
+    // Force IDR if this is the first frame (abs_poc is 0), or if enc_frames[0]
+    // exists and its POC nears overflow. Note that '&&' binds before '||'.
+    if (!f->abs_poc || md->enc_frames[0] && md->enc_frames[0]->h265_poc >= 2000000000)
+    {
+        F265_SET_FLAG(f->gen_flags, F265_FF_KEY|F265_FF_IDR, 1);
+    }
+
+    // FIXME: assuming 4:2:0 data for now. We need to figure out how we want to
+    // handle the format conversions. In particular, we may need to increase
+    // the bit depth of the input.
+
+    // Get the plane information.
+    int32_t stride = gd->stride;
+    f265_pix **planes = (f265_pix**)f->src_planes;
+    int32_t *pix_dim = gd->pix_dim; // Padded-to size; the padding below fills up to it.
+    int32_t *clip_dim = gd->clip_dim; // Size of the area copied from the input.
+
+    // Copy the plane data row by row. Chroma (i > 0) is halved in both dimensions.
+    for (int i = 0; i < 3; i++)
+    {
+        int32_t in_stride = req->input->stride[i];
+        uint8_t *in_plane = req->input->planes[i];
+        f265_pix *p = planes[i];
+        for (int32_t j = 0; j < clip_dim[1]>>!!i; j++, in_plane += in_stride, p += stride)
+            memcpy(p, in_plane, clip_dim[0]>>!!i); // Byte count == sample count: assumes 8-bit samples.
+    }
+
+    // Pad the planes.
+    {
+        // Amount of padding samples surrounding the picture:
+        // [0] -> left, [1] -> right, [2] -> top, [3] -> bottom.
+        int32_t pad_luma[4] = { 0, pix_dim[0] - clip_dim[0], 0, pix_dim[1] - clip_dim[1] };
+        int32_t pad_chroma[4] = { 0, pad_luma[1]>>1, 0, pad_luma[3]>>1 };
+
+        // With the lookahead on, luma gets an extra pixel on every side (subsampled
+        // planes + AQ intra prediction). pad_chroma was derived above, so it does not.
+        for (int i = 0; i < 4; i++) pad_luma[i] += F265_GET_FLAG(gd->eflags, F265_PF_LA);
+        for (int i = 0; i < 2; i++) pad_luma[i] = F265_ALIGN_VAL(pad_luma[i], 8); // Left/right luma only.
+        pad_chroma[1] = F265_ALIGN_VAL(pad_chroma[1], 8); // Right chroma only.
+
+        // Pad the planes. The chroma extents are half of the luma extents.
+        fenc_pad_plane(planes[0], stride, 0, clip_dim[0] - 1, 0, clip_dim[1] - 1,
+                       pad_luma[0], pad_luma[1], pad_luma[2], pad_luma[3]);
+        for (int c = 1; c < 3; c++)
+            fenc_pad_plane(planes[c], stride, 0, (clip_dim[0] - 1)>>1, 0, (clip_dim[1] - 1)>>1,
+                           pad_chroma[0], pad_chroma[1], pad_chroma[2], pad_chroma[3]);
+
+        #if 0
+        // FIXME. Ported from v264. To be activated?
+        // Transfer the ownership of the decoder statistics, if any.
+        f->dec_stats = req->dec_stats;
+        #endif
+    }
+
+    return f;
+}
diff --git a/f265/parse.c b/f265/parse.c
index 3ddbac1..e94f378 100644
--- a/f265/parse.c
+++ b/f265/parse.c
@@ -516,6 +516,13 @@ static void handle_param_nb_workers(f265_parse_ctx *ctx, f265_enc_params *p, f26
     p->nb_workers[1] = a[1].i;
 }
 
+static void handle_param_lookahead(f265_parse_ctx *ctx, f265_enc_params *p, f265_parse_arg *a, int32_t nb_args)
+{
+    p->la_flag = a[0].i >= 0; // A negative first argument turns the lookahead flag off.
+    p->la_decision_delay = F265_MAX(a[0].i, 0); // The stored delay is never negative.
+    if (nb_args == 2) p->la_process_delay = a[1].i; // Only set when a second argument was supplied.
+}
+
 // Parameter dispatch table.
 static const f265_parse_entry f265_enc_params_table[] =
 {
@@ -573,6 +580,7 @@ static const f265_parse_entry f265_enc_params_table[] =
     { "tiles", handle_param_tiles, 2, 0 },
     { "mt-mode", handle_param_mt_mode, 1, 0 },
     { "nb-workers", handle_param_nb_workers, 2, 0 },
+    { "lookahead", handle_param_lookahead, 2, 0 },
 };
 
 void f265_parse_params(f265_enc_params *params, const char *param_string, char **error_handle)
diff --git a/f265/pixel.c b/f265/pixel.c
index fe9e517..b0c5444 100644
--- a/f265/pixel.c
+++ b/f265/pixel.c
@@ -61,6 +61,50 @@ void fenc_transpose_block(f265_pix *dst, int dst_stride, f265_pix *src, int src_
             dst[x*dst_stride + y] = src[x];
 }
 
+// Helper for fenc_compute_subsampled(): rounded average of a 2x2 neighbourhood.
+// The vertical pairs (nw,sw) and (ne,se) are averaged first, then the results.
+// FIXME. Assembly version (interpolates rows before columns) not yet imported.
+static finline f265_pix fenc_interpolate_subsampled(uint32_t nw, uint32_t ne, uint32_t sw, uint32_t se)
+{
+    return (((nw + sw + 1) >> 1) + ((ne + se + 1) >> 1) + 1) >> 1;
+}
+
+// Compute the four subsampled planes (fullpel, horizontal, vertical, diagonal)
+// of src_plane; width/height are the subsampled dimensions. Each output pixel
+// reads a 3x3 source window at (2x, 2y), so the source must be padded by one
+// pixel to the right and bottom. Assumes at least 32 bytes of source padding and
+// fullpel/vertical planes stride/2 apart from the horizontal/diagonal planes.
+void fenc_compute_subsampled(f265_pix *sub_planes[4], f265_pix *src_plane,
+                             int32_t stride, int32_t width, int32_t height)
+{
+    for (int32_t y = 0; y < height; y++)
+    {
+        f265_pix *src0 = src_plane + y * 2 * stride; // Source row 2y.
+        f265_pix *src1 = src_plane + y * 2 * stride + stride; // Source row 2y+1.
+        f265_pix *src2 = src_plane + y * 2 * stride + 2 * stride; // Source row 2y+2.
+        f265_pix *subf = sub_planes[0] + y * stride; // Output rows use the source stride.
+        f265_pix *subh = sub_planes[1] + y * stride;
+        f265_pix *subv = sub_planes[2] + y * stride;
+        f265_pix *subd = sub_planes[3] + y * stride;
+
+        for (int32_t x = 0; x < width; x++)
+        {
+            // Pixel layout of the 3x3 source window anchored at (2x, 2y):
+            // [0 1 2]
+            // [3 4 5]
+            // [6 7 8]
+            uint32_t p[9] = { src0[2 * x], src0[2 * x + 1], src0[2 * x + 2],
+                              src1[2 * x], src1[2 * x + 1], src1[2 * x + 2],
+                              src2[2 * x], src2[2 * x + 1], src2[2 * x + 2]
+                            };
+            subf[x] = fenc_interpolate_subsampled(p[0], p[1], p[3], p[4]); // 2x2 at (2x, 2y).
+            subh[x] = fenc_interpolate_subsampled(p[1], p[2], p[4], p[5]); // 2x2 at (2x+1, 2y).
+            subv[x] = fenc_interpolate_subsampled(p[3], p[4], p[6], p[7]); // 2x2 at (2x, 2y+1).
+            subd[x] = fenc_interpolate_subsampled(p[4], p[5], p[7], p[8]); // 2x2 at (2x+1, 2y+1).
+        }
+    }
+}
+
 // Pad a plane horizontally. Note that the assembly implementation accesses
 // pixels as uint32_t in the plane.
 static void noinline fenc_pad_horizontal(f265_pix *dst, f265_pix *src, int32_t stride, int32_t nb_rows, int32_t len)
* Unmerged path test/data/videos.ini
