Hi,

The patch looks good to me. I flagged some minor issues inline (on the lines starting with %%%), but I don't think further review is required before committing the patch.

Thanks,
Laurent
%%% Change file format to UNIX before committing.

diff --git a/doc/params.txt b/doc/params.txt
index ea08232..26ed3b3 100644
--- a/doc/params.txt
+++ b/doc/params.txt
@@ -291,6 +291,27 @@ Minimum number of frames used to compute the ABR rate control convergence window
 Exponent used to compute the ABR rate control convergence window.
 
 
+* tiles=COLS,ROWS. Default 1,1.
+
+Number of tile columns and rows, created using uniform spacing. At most 10
+columns and 11 rows are supported, which corresponds to the tile limits of HEVC
+levels 5, 5.1 and 5.2. The values actually used are MIN(COLS, picture width in
+CTBs) and MIN(ROWS, picture height in CTBs).
+
+
+* mt-mode=X. Default 0.
+
+Activate multithreading. Currently, only no multithreading (mt-mode=0) and tile-
+based multithreading (mt-mode=1) are supported.
+
+
+* nb-workers=X,Y. Default 0,0.
+
+When multithreading is activated, X indicates the number of additional threads
+dedicated to encoding, and Y indicates the number of threads used by the
+lookahead.
+
+
 -------------------------------------------------------------------------------
 Configurations and regression tests.
 
diff --git a/f265/bdi.c b/f265/bdi.c
index 2ca8d99..a7ca71d 100644
--- a/f265/bdi.c
+++ b/f265/bdi.c
@@ -180,6 +180,7 @@ void f265_set_default_params(f265_enc_params *p)
     p->me_algo[1] = p->me_algo[2] = 1;
     p->me_iter[0] = 16;
     p->me_iter[1] = p->me_iter[2] = 1;
+    p->tiles[0] = p->tiles[1] = 1;
 }
 
 void f265_normalize_params(f265_enc_params *p)
@@ -188,7 +189,9 @@ void f265_normalize_params(f265_enc_params *p)
     #define CLF(V) (V) = !!(V)
     #define CLP(V) (V) = F265_MAX(0, (V))
     for (int i = 0; i < 2; i++) CL(p->clip_dim[i], 1, 16384);
-    CL(p->mt_mode, 0, 2);
+    // FIXME. Only tile multi-threading is supported. Update this when other
+    //        modes are supported (WPP-based, frame-based, GOP-based, etc.).
+    CL(p->mt_mode, 0, 1);
     CL(p->chroma_format, 0, 3);
     for (int i = 0; i < 4; i++) CL(p->bit_depth[i], 8, 14);
     CL(p->cb_range[0], 3, 6);
@@ -235,6 +238,10 @@ void f265_normalize_params(f265_enc_params *p)
     CLP(p->frame_rate_num);
     p->frame_rate_den = F265_MAX(p->frame_rate_den, 1);
     CL(p->qp, 0, 51);
+    // Assume level is 5 or greater. FIXME. See table A1 in HEVC spec.
+    CL(p->tiles[0], 1, 10);
+    CL(p->tiles[1], 1, 11);
+    CLP(p->nb_workers[0]);
     #undef CL
     #undef CLF
     #undef CLP
@@ -242,8 +249,11 @@ void f265_normalize_params(f265_enc_params *p)
     // Make sure there is room for B frames in the lookahead.
     p->la_decision_delay = F265_MAX(p->la_decision_delay, p->nb_b_frames);
 
-    // Normalize the number of threads. FIXME.
-    p->nb_workers[0] = p->nb_workers[1] = 0;
+    // Normalize the number of lookahead threads. FIXME.
+    p->nb_workers[1] = 0;
+
+    // Prevent WPP and Tiles at the same time. Favour Tiles over WPP.
+    if (p->tiles[0] > 1 || p->tiles[1] > 1) p->wpp_flag = 0;
 }
 
 // Dispatch by the parameters/encoder bit depth. We rely on the bit depth being
diff --git a/f265/bdi.h b/f265/bdi.h
index c638a2a..ec478f4 100644
--- a/f265/bdi.h
+++ b/f265/bdi.h
@@ -279,6 +279,9 @@
 // Use wavefront parallel processing.
 #define F265_PF_WPP                     (1<<18)
 
+// Use tiles.
+#define F265_PF_TILES                   (1<<19)
+
 // Number of CABAC contexts.
 #define F265_NB_CABAC_CTX               154
 
diff --git a/f265/bs.c b/f265/bs.c
index 82ab920..0d1993e 100644
--- a/f265/bs.c
+++ b/f265/bs.c
@@ -255,6 +255,7 @@ void fenc_write_pps(f265_vlc_bs *vbs, f265_enc *enc)
 {
     f265_gen_data *gd = &enc->gd;
     int deblock_flag = F265_GET_FLAG(gd->eflags, F265_PF_DEBLOCK);
+    int tiles_enabled = F265_GET_FLAG(gd->eflags, F265_PF_TILES);
 
     FENC_ANNOUNCE_VLC("PPS");
     FENC_PUT_UE_V(vbs, 0, "pps_pic_parameter_set_id");
@@ -279,8 +280,19 @@ void fenc_write_pps(f265_vlc_bs *vbs, f265_enc *enc)
     FENC_PUT_FLAG(vbs, 0, "weighted_pred_flag");
     FENC_PUT_FLAG(vbs, 0, "weighted_bipred_flag");
     FENC_PUT_FLAG(vbs, F265_GET_FLAG(gd->eflags, F265_PF_TRANSQUANT_BYPASS), "transquant_bypass_enable_flag");
-    FENC_PUT_FLAG(vbs, 0, "tiles_enabled_flag");
+    FENC_PUT_FLAG(vbs, tiles_enabled, "tiles_enabled_flag");
     FENC_PUT_FLAG(vbs, F265_GET_FLAG(gd->eflags, F265_PF_WPP), "entropy_coding_sync_enabled_flag");
+    if (tiles_enabled)
+    {
+        FENC_PUT_UE_V(vbs, gd->tiles[0]-1, "num_tile_columns_minus1");
+        FENC_PUT_UE_V(vbs, gd->tiles[1]-1, "num_tile_rows_minus1");
+
+        // FIXME. Change this if non-uniform spacing is supported.
+        FENC_PUT_FLAG(vbs, 1, "uniform_spacing_flag");
+
+        // FIXME. Change if tile boundaries are considered during deblocking.
+        FENC_PUT_FLAG(vbs, 1, "loop_filter_across_tiles_enabled_flag");
+    }
     FENC_PUT_FLAG(vbs, 1, "loop_filter_across_slices_enabled_flag");
 
     FENC_PUT_FLAG(vbs, !deblock_flag, "deblocking_filter_control_present_flag");
@@ -303,13 +315,20 @@ void fenc_write_slice_header(f265_vlc_bs *vbs, f265_enc *enc, f265_frame *f, int
                              uint32_t seg_flags, int ctb_xy, int nal_type)
 {
     f265_gen_data *gd = &enc->gd;
+    int first_flag = ctb_xy == 0;
     int deblock_flag = F265_GET_FLAG(gd->eflags, F265_PF_DEBLOCK);
     int tmv_flag = F265_GET_FLAG(enc->gd.eflags, F265_PF_TMV);
 
     FENC_ANNOUNCE_VLC("Slice header");
-    FENC_PUT_FLAG(vbs, 1, "first_slice_segment_in_pic_flag");
+    FENC_PUT_FLAG(vbs, first_flag, "first_slice_segment_in_pic_flag");
     if (nal_type >= 16 && nal_type <= 23) FENC_PUT_FLAG(vbs, 0, "no_output_of_prior_pics_flag");
     FENC_PUT_UE_V(vbs, 0, "slice_pic_parameter_set_id");
+    if (!first_flag)
+    {
+        // FIXME. Handle dependent segments.
+        // The address uses Ceil(Log2(nb_ctb)) bits (ctb_xy != 0, so nb_ctb > 1).
+        FENC_PUT_BITS(vbs, ctb_xy, 32-__builtin_clz(gd->nb_ctb-1), "slice_segment_address");
+    }
     FENC_PUT_UE_V(vbs, 2-f->frame_type, "slice_type");
 
     if (!(f->gen_flags&F265_FF_IDR))
@@ -383,7 +402,7 @@ void fenc_write_slice_header(f265_vlc_bs *vbs, f265_enc *enc, f265_frame *f, int
 
     if (deblock_flag) FENC_PUT_FLAG(vbs, 1, "slice_loop_filter_across_slices_enabled_flag");
 
-    if (gd->eflags&F265_PF_WPP)
+    if (gd->eflags&(F265_PF_WPP|F265_PF_TILES))
     {
         FENC_PUT_UE_V(vbs, nb_chunks-1, "num_entry_point_offsets");
 
diff --git a/f265/enc.c b/f265/enc.c
index 67a3cbb..4d7c726 100644
--- a/f265/enc.c
+++ b/f265/enc.c
@@ -10,9 +10,11 @@
 
 f265_large_tables f265_lt;
 
-static int fenc_encode_frame_mt_none(f265_enc *e, f265_frame *f);
+static void* fenc_encode_parallel_tile(void *args);
+static int fenc_encode_frame(f265_enc *e, f265_frame *f);
 static int fenc_encode_section(f265_enc_thread *t, int section_idx);
 static void fenc_encode_ctb(f265_enc_thread *t);
+static void fenc_init_enc_thread(f265_enc *e, f265_frame *f, f265_enc_thread *t, int qp);
 static f265_enc_thread* fenc_set_enc_thread(f265_enc *e, f265_frame *f);
 static int fenc_check_enc_result(f265_enc_thread *t, f265_frame *next_frame);
 
@@ -142,10 +144,6 @@ static void fenc_analyze_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, uint8
     // Maximum number of slice headers in the frame. FIXME.
     int64_t nb_slice_headers = 1;
 
-    // number of sections in the frame. FIXME.
-    d->nb_sections = 1;
-
-
     // Chroma scaling factors (X shift, Y shift, YUV size numerator, YUV size
     // denominator shift).
     int csf_table[4][4] = { {0,0,1,0}, {1,1,3,1}, {1,0,2,0}, {0,0,3,0} };
@@ -166,6 +164,15 @@ static void fenc_analyze_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, uint8
     }
     d->nb_ctb = d->ctb_dim[0]*d->ctb_dim[1];
 
+    // Tiles and number of sections in the frame. A tile has at least one CTB.
+    {
+        int tile_cols = F265_MIN(d->ctb_dim[0], p->tiles[0]);
+        int tile_rows = F265_MIN(d->ctb_dim[1], p->tiles[1]);
+        d->nb_sections = p->mt_mode == F265_MT_ENC_TILE ? tile_cols * tile_rows : 1;
+        p->tiles[0] = tile_cols;
+        p->tiles[1] = tile_rows;
+    }
+
     // Frame 4x4/8x8 block dimensions.
     int64_t b4_dim[2] = { d->pix_dim[0]>>2, d->pix_dim[1]>>2 };
     int64_t nb_b4 = b4_dim[0]*b4_dim[1];
@@ -446,14 +453,19 @@ static void f265_compute_intra_avail_map(f265_gen_data *gd)
 }
 
 // Set the CTB segmentation flags.
-static void fenc_set_ctb_seg_flags(f265_enc *enc, f265_frame_ctb *ctb, int ctb_x, int ctb_y, int ctb_xy)
+static void fenc_set_ctb_seg_flags(f265_enc *enc, f265_frame_ctb *ctb, int ctb_x, int ctb_y,
+                                   int start_flag, int end_flag)
 {
     f265_gen_data *gd = &enc->gd;
     ctb->seg_flags = 0;
 
-    // Frame start/end.
-    if (ctb_xy == 0) ctb->seg_flags |= F265_SC_CABAC_INIT;
-    if (ctb_xy == gd->nb_ctb - 1) ctb->seg_flags |= F265_SC_SEGMENT_END|F265_SC_CHUNK_END;
+    // Chunk start/end.
+    if (start_flag) ctb->seg_flags |= F265_SC_CABAC_INIT;
+    if (end_flag) ctb->seg_flags |= F265_SC_CHUNK_END;
+
+    // Frame/Section end.
+    if ((ctb_x==gd->ctb_dim[0]-1 && ctb_y==gd->ctb_dim[1]-1) || (end_flag && gd->mt_mode==F265_MT_ENC_TILE))
+        ctb->seg_flags |= F265_SC_SEGMENT_END;
 
     // WPP.
     if (gd->eflags&F265_PF_WPP)
@@ -476,6 +488,131 @@ static void fenc_set_ctb_seg_flags(f265_enc *enc, f265_frame_ctb *ctb, int ctb_x
     }
 }
 
+// Set the motion estimation/compensation bounds of a CTB from its position.
+static void fenc_set_motion_bounds(f265_frame_ctb *ctb, int ctb_size, int pix_dim[2])
+{
+    // The quarterpel filter refers to 3 pixels left/above of the
+    // current pixel and 4 pixels right/below.
+    //
+    // Example: frame with 64+64+32=160 pixels, ctb_size=64.
+    //          Let F265_LUMA_PLANE_PADDING=72, F265_SEARCH_OOB=0
+    //          so that the MC and ME bounds are symmetrical.
+    //          Padded pixels: left [-72..-1], right [160..231].
+    //
+    //          CTB 1 has 64 pixels left, 160-64=96 pixels right.
+    //          ME left:  64 - (64+72-4)    = -68. Pixels [-68..-4].
+    //          ME right: 64 + (96+72-4-64) = 164. Pixels [164..227].
+    //
+    //            ME left     *           ME right
+    //          | |<==>| |    *         | |<==>| |
+    //          | <------+----*---------+->      |
+    //          |4  64  4| 64 * 64 | 32 |4  64  4|
+    //          |        |    |<==>|    |        |
+    //          ^        ^    Block     ^        ^
+    //         -72       0             160      231
+
+    // Number of pixels in the frame around the CTB.
+    int l = ctb->pos[0]*ctb_size, t = ctb->pos[1]*ctb_size, r = pix_dim[0]-l, b = pix_dim[1]-t;
+
+    // Motion estimation bounds.
+    int range = F265_MAX_MV_FPEL - F265_SEARCH_OOB;
+    int min_border = F265_LUMA_PLANE_PADDING - 4 - F265_SEARCH_OOB;
+    int max_border = min_border - 64;
+    ctb->me_bounds[0] = -(F265_MIN(range, l + min_border)<<2);
+    ctb->me_bounds[1] = -(F265_MIN(range, t + min_border)<<2);
+    ctb->me_bounds[2] =  (F265_MIN(range, r + max_border)<<2);
+    ctb->me_bounds[3] =  (F265_MIN(range, b + max_border)<<2);
+
+    // Motion compensation bounds. Shift first: '<<' on a negative value is UB.
+    ctb->mc_bounds[0] = -((l + 64 + 3)<<2);
+    ctb->mc_bounds[1] = -((t + 64 + 3)<<2);
+    ctb->mc_bounds[2] =  (r + 2)<<2;
+    ctb->mc_bounds[3] =  (b + 2)<<2;
+}
+
+// Fill the frame map (CTBs in tile order, sections) for uniformly spaced tiles.
+static void fenc_init_uniform_layout(f265_enc *enc, f265_frame_map *fmap)
+{
+    f265_gen_data *gd = &enc->gd;
+    f265_frame_ctb *ctb = fmap->ctb_map;
+
+    // Frame size.
+    int pix_dim[2] = { gd->pix_dim[0], gd->pix_dim[1] };
+    int ctb_dim[2] = { gd->ctb_dim[0], gd->ctb_dim[1] };
+
+    // Tile layout.
+    int tiles_dim[2] = { gd->tiles[0], gd->tiles[1] };
+
+    // CTB coordinates.
+    int ctb_pos[2] = { 0, 0 };
+
+    // Largest coding unit size.
+    int ctb_size = gd->ctb_size;
+
+    // Section map.
+    int section_idx = 0;
+    int ctb_count = 0;
+    int sections_flag = fmap->nb_sections > 1;
+
+    // Assume a single section with multiple entry points.
+    if (!sections_flag)
+    {
+        fmap->section_map[section_idx][0] = 0;
+        fmap->section_map[section_idx][1] = gd->nb_ctb;
+    }
+
+    for (int i = 0; i < tiles_dim[1]; i++)
+    {
+        int tile_height = ((i+1)*ctb_dim[1])/tiles_dim[1] - (i*ctb_dim[1])/tiles_dim[1];
+
+        // Reset to the left-hand side column.
+        ctb_pos[0] = 0;
+
+        for (int j = 0; j < tiles_dim[0]; j++)
+        {
+            int tile_width = ((j+1)*ctb_dim[0])/tiles_dim[0] - (j*ctb_dim[0])/tiles_dim[0];
+
+            for (int y = 0; y < tile_height; y++)
+            {
+                int ctb_y = ctb_pos[1] + y;
+                for (int x = 0; x < tile_width; x++, ctb++)
+                {
+                    // Position.
+                    int ctb_x = ctb_pos[0] + x;
+                    ctb->pos[0] = ctb_x;
+                    ctb->pos[1] = ctb_y;
+
+                    // Neighbours (D, B, C, A). Only CTBs of the same tile count.
+                    ctb->neighbours = ((y>0&&x>0)<<0) | ((y>0)<<1) | ((y>0&&x<tile_width-1)<<2) | ((x>0)<<3);
+
+                    // Segmentation flags.
+                    int start_flag = x==0 && y==0;
+                    int end_flag = x==(tile_width-1) && y==(tile_height-1);
+                    fenc_set_ctb_seg_flags(enc, ctb, ctb_x, ctb_y, start_flag, end_flag);
+
+                    // Motion bounds.
+                    fenc_set_motion_bounds(ctb, ctb_size, pix_dim);
+                }
+            }
+
+            if (sections_flag)
+            {
+                int section_size = tile_height*tile_width;
+                fmap->section_map[section_idx][0] = ctb_count;
+                fmap->section_map[section_idx][1] = section_size;
+
+                // Keep track of the tile sizes. That is how we compute the
+                // offsets that mark the start of each section in the map.
+                ctb_count += section_size;
+                section_idx++;
+            }
+
+            ctb_pos[0] += tile_width;
+        }
+        ctb_pos[1] += tile_height;
+    }
+}
+
 // Initialize the encoder memory.
 static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **error_handle)
 {
@@ -506,6 +643,7 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
         gd->pcm_range[i] = p->pcm_range[i];
         gd->tb_range[i] = p->tb_range[i];
         gd->tb_depth[i] = p->tb_depth[i];
+        gd->tiles[i] = p->tiles[i];
     }
     gd->qg_log = p->qg_log;
     gd->nb_refs = p->nb_refs;
@@ -553,6 +691,7 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
     SET_PF(F265_PF_ALLOW_CTB_RECODE, p->allow_recode_flag[1]);
     SET_PF(F265_PF_RDOQ, p->rdoq_flag);
     SET_PF(F265_PF_WPP, p->wpp_flag);
+    SET_PF(F265_PF_TILES, (p->tiles[0]>1)|(p->tiles[1]>1));
     #undef SET_PF
 
     // Chroma QP table. Taken from the spec. Assumes BD 8.
@@ -577,69 +716,10 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
         uint8_t *o = (uint8_t*)fmap; o += sizeof(f265_frame_map);
         fmap->section_map = (int(*)[2])o; o += d->nb_sections*4*2;
         fmap->ctb_map = (f265_frame_ctb*)o; o += d->nb_ctb*sizeof(f265_frame_ctb);
-        int pix_dim[2] = { gd->pix_dim[0], gd->pix_dim[1] };
-        int ctb_size = gd->ctb_size;
-
-        // FIXME: we assume one tile/slice per thread.
         fmap->nb_sections = d->nb_sections;
-        fmap->section_map[0][0] = 0;
-        fmap->section_map[0][1] = gd->nb_ctb;
-        for (int y = 0; y < gd->ctb_dim[1]; y++)
-        {
-            for (int x = 0; x < gd->ctb_dim[0]; x++)
-            {
-                int ctb_xy = y*gd->ctb_dim[0] + x;
-                f265_frame_ctb *ctb = fmap->ctb_map + ctb_xy;
-
-                // Position.
-                ctb->pos[0] = x;
-                ctb->pos[1] = y;
-
-                // Neighbours (D, B, C, A).
-                ctb->neighbours = ((y>0&&x>0)<<0) | ((y>0)<<1) | ((y>0&&x<gd->ctb_dim[0]-1)<<2) | ((x>0)<<3);
-
-                // Set the CTB segmentation flags.
-                fenc_set_ctb_seg_flags(enc, ctb, x, y, ctb_xy);
-
-                // The quarterpel filter refers to 3 pixels left/above of the
-                // current pixel and 4 pixels right/below.
-                //
-                // Example: frame with 64+64+32=160 pixels, ctb_size=64.
-                //          Let F265_LUMA_PLANE_PADDING=72, F265_SEARCH_OOB=0
-                //          so that the MC and ME bounds are symmetrical.
-                //          Padded pixels: left [-72..-1], right [160..231].
-                //
-                //          CTB 1 has 64 pixels left, 160-64=96 pixels right.
-                //          ME left:  64 - (64+72-4)    = -68. Pixels [-68..-4].
-                //          ME right: 64 + (96+72-4-64) = 164. Pixels [164..227].
-                //
-                //            ME left     *           ME right
-                //          | |<==>| |    *         | |<==>| |
-                //          | <------+----*---------+->      |
-                //          |4  64  4| 64 * 64 | 32 |4  64  4|
-                //          |        |    |<==>|    |        |
-                //          ^        ^    Block     ^        ^
-                //         -72       0             160      131
-
-                // Number of pixels in the frame around the CTB.
-                int l = x*ctb_size, t = y*ctb_size, r = pix_dim[0]-l, b = pix_dim[1]-t;
-
-                // Motion estimation bounds.
-                int range = F265_MAX_MV_FPEL - F265_SEARCH_OOB;
-                int min_border = F265_LUMA_PLANE_PADDING - 4 - F265_SEARCH_OOB;
-                int max_border = min_border - 64;
-                ctb->me_bounds[0] = -(F265_MIN(range, l + min_border)<<2);
-                ctb->me_bounds[1] = -(F265_MIN(range, t + min_border)<<2);
-                ctb->me_bounds[2] =  (F265_MIN(range, r + max_border)<<2);
-                ctb->me_bounds[3] =  (F265_MIN(range, b + max_border)<<2);
-
-                // Motion compensation bounds.
-                ctb->mc_bounds[0] = -(l + 64 + 3)<<2;
-                ctb->mc_bounds[1] = -(t + 64 + 3)<<2;
-                ctb->mc_bounds[2] =  (r + 2)<<2;
-                ctb->mc_bounds[3] =  (b + 2)<<2;
-            }
-        }
+
+        // FIXME: we only support uniform spacing.
+        fenc_init_uniform_layout(enc, fmap);
     }
 
     // Main data.
@@ -699,6 +779,34 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
     la->nb_committed = 0;
     la->nb_b_frames = p->nb_b_frames;
 
+    // Multithreading.
+    f265_enc_sync *es = &enc->es;
+    #ifdef F265_HAVE_MT
+    if (gd->mt_mode)
+    {
+        // The encoder synchronization structure is zeroed during startup. Zero
+        // values are expected when pthread functions run smoothly. Any other
+        // value means an error occurred. For simplicity, we negate the returned
+        // value so that 1 represents a success and 0, an error.
+
+        // Initialize public communication channel with the master thread.
+        es->cv_flag = !f265_cond_init(&es->master_cv);
+        if (!es->cv_flag)
+        {
+            *error_handle = "could not init conditional variable";
+            return;
+        }
+
+        // Initialize shared mutex for master and worker threads.
+        es->mutex_flag = !f265_mutex_init(&es->mutex);
+        if (!es->mutex_flag)
+        {
+            *error_handle = "could not init mutex";
+            return;
+        }
+    }
+    #endif
+
     // Encoding thread objects.
     for (int i = 0; i < d->nb_enc_threads; i++)
     {
@@ -728,6 +836,36 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
         // considered unsplit.
         t->cb[F265_UNAVAIL_CB_IDX].lg_bs = 6;
         t->cb[F265_UNAVAIL_CB_IDX].enc_idx = 127;
+
+        // When using multithreading, start a worker that will immediately go to
+        // sleep. Wake it up later on.
+        #ifdef F265_HAVE_MT
+        if (gd->mt_mode)
+        {
+            // The f265_enc_thread instances are zeroed during startup. For
+            // proper cleanup, pthread return values are negated so that 1 means
+            // success and 0, failure. These values are used when releasing the
+            // encoding resources to avoid unnecessary calls to pthread destroy
+            // functions.
+
+            // Init private communication channel.
+            t->cv_flag = !f265_cond_init(&t->worker_cv);
+            if (!t->cv_flag)
+            {
+                *error_handle = "error creating conditional variable";
+                return;
+            }
+
+            // Start the next thread.
+            t->status = F265_THREAD_IDLE;
%%% Extra space after '! '.
+            t->handle_flag = ! f265_thread_create(&t->thread_handle, fenc_encode_parallel_tile, t);
+            if (!t->handle_flag)
+            {
+                *error_handle = "error creating thread";
+                return;
+            }
+        }
+        #endif
     }
 
     // Frame objects.
@@ -825,6 +963,44 @@ void fenc_deinit_enc(f265_enc *enc)
     // To prevent valgrind errors.
     if (van_an_dump_file) fclose(van_an_dump_file);
     #endif
+
+    #ifdef F265_HAVE_MT
+    if (gd->mt_mode)
+    {
+        f265_main_data *md = &enc->md;
+        f265_enc_sync *es = &enc->es;
+
+        // Terminate running threads. Check flags to avoid releasing
+        // uninitialized resources.
+        for (int n = 0; n < md->nb_enc_threads; n++)
+        {
+            f265_enc_thread *t = md->enc_threads[n];
+
+            if (t->handle_flag)
+            {
+                // Update its status.
+                f265_mutex_lock(&es->mutex);
+                t->status = F265_THREAD_EXIT;
+                f265_mutex_unlock(&es->mutex);
+
+                // Wake up worker thread so it can exit properly.
+                f265_cond_signal(&t->worker_cv);
+
+                // Wait for proper exit.
+                f265_thread_join(t->thread_handle);
+            }
+
+            // Close private communication channel.
+            if (t->cv_flag) f265_cond_deinit(&t->worker_cv);
+        }
+
+        // Close shared communication channel.
+        if (es->cv_flag) f265_cond_deinit(&es->master_cv);
+
+        // Release shared mutex.
+        if (es->mutex_flag) f265_mutex_deinit(&es->mutex);
+    }
+    #endif
 }
 
 void fenc_analyze_params(f265_enc_params *params)
@@ -937,7 +1113,7 @@ int fenc_process_enc_req(f265_enc *e, f265_enc_req *req)
         }
 
         // Encode the frame, if any.
-        res = fenc_encode_frame_mt_none(e, next_frame);
+        res = fenc_encode_frame(e, next_frame);
 
     } while (0);
 
@@ -1158,14 +1334,184 @@ static int fenc_encode_frame_mt_none(f265_enc *e, f265_frame *f)
     // Frame re-encoding loop.
     while (1)
     {
-        // FIXME: assuming one section.
-        int res = fenc_encode_section(t, 0);
-        if (res == F265_RET_ABORT) return res;
-        res = fenc_check_enc_result(t, f);
+        for (int section_idx = 0; section_idx < f->fmap->nb_sections; section_idx++)
+            if (fenc_encode_section(t, section_idx) == F265_RET_ABORT) return F265_RET_ABORT;
+
+        int res = fenc_check_enc_result(t, f);
         if (res != F265_RET_RETRY) return res;
     }
 }
 
+// Encode tiles until none left. Release the encoding thread afterwards.
+static void* fenc_encode_parallel_tile(void *args)
+{
+    f265_enc_thread *t = (f265_enc_thread*)args;
%%% s/0/NULL => some compilers might warn about that.
+    f265_enc_thread *next = 0;
+    f265_enc *e = t->enc;
+    f265_main_data *md = &e->md;
+    f265_enc_sync *es = &t->enc->es;
+    f265_mutex *mutex = &es->mutex;
+
+    // Check for colleague.
+    if (t - md->enc_threads[0] < md->nb_enc_threads-1) next = t+1;
+
+    // Worker loop.
+    int exit_flag = 0;
+    while (!exit_flag)
+    {
%%% Judging from a quick search it's usually spelled "spurious wakeup".
+        // Wait for work. Prevent spurious wake up.
+        int status;
+        f265_mutex_lock(mutex);
+        // Test the status before sleeping. A signal sent while this thread is
+        // not waiting yet is not queued, so sleeping first could miss the job
+        // (or the exit request) and block forever.
+        while ((status = t->status) == F265_THREAD_IDLE)
+        {
+            f265_cond_wait(&t->worker_cv, mutex);
+        }
+
+        // Wake up colleague.
+        if (status == F265_THREAD_BUSY && next)
+        {
%%% The fact that the main thread is already working during this initialization scares me a little.
%%% But it should work... eventually we can revisit that logic.
%%% The thread should probably initialize its own data to avoid unneeded CL transfers.
%%% Add a note for this please?
+            // Get colleague thread ready for work.
+            fenc_init_enc_thread(e, t->src_frame, next, t->qp[0]);
+            next->status = F265_THREAD_BUSY;
+
+            // Wake up call.
+            f265_cond_signal(&next->worker_cv);
+        }
+
+        // Stop hogging the mutex. Take a breather.
%%% Heh.
+        f265_mutex_unlock(mutex);
+
+        // Tile encoding loop.
+        do
+        {
%%% Unlock-lock back-to-back. That could be optimized by inverting the lock logic eventually.
%%% Add a note please?
+            // Fetch next available tile.
+            f265_mutex_lock(mutex);
+            int section_idx = es->next_section_idx++;
+            exit_flag = es->abort_flag | es->error_flag | (status == F265_THREAD_EXIT);
+            f265_mutex_unlock(mutex);
+
+            // Stop work loop entirely if any thread aborted, an error occurred,
+            // or exit was requested.
+            if (exit_flag) break;
+
+            // Pause when all other tiles have been, or are being processed.
+            if (section_idx >= t->src_frame->fmap->nb_sections)
+            {
+                // Update status.
+                f265_mutex_lock(mutex);
+                status = t->status = F265_THREAD_IDLE;
+                f265_mutex_unlock(mutex);
+
+                // Notify master thread.
+                f265_cond_signal(&es->master_cv);
%%% Extra white line.
+
+            }
+
+            // Process tile. Signal any problems to coworkers.
+            else
+            {
+                int res = fenc_encode_section(t, section_idx);
+                if (res == F265_RET_ABORT)
+                {
+                    f265_mutex_lock(mutex);
+                    es->abort_flag = 1;
+                    f265_mutex_unlock(mutex);
+                }
+            }
+        } while (status == F265_THREAD_BUSY);
%%% Ditto.
+
+    }
+
+    // Clean exit. Force EXIT status in case this was caused by an abort signal
+    // or an error during encoding.
+    f265_mutex_lock(mutex);
+    t->status = F265_THREAD_EXIT;
+    f265_mutex_unlock(mutex);
+
+    // Tell the master thread about it.
+    f265_cond_signal(&es->master_cv);
+
+    return NULL;
+}
+
+// Have the main thread use the worker threads. The main thread then makes sure
+// the result is acceptable. The input frame cannot be null. Returns VALID,
+// ABORT or ERROR.
+static int fenc_encode_frame_mt_tile(f265_enc *e, f265_frame *f)
+{
+    f265_main_data *md = &e->md;
+    f265_enc_sync *es = &e->es;
+    f265_mutex *mutex = &es->mutex;
+
+    // Set the DPB.
+    fenc_set_frame_dpb(e, f);
+
+    // Link the encoding object.
+    fenc_link_enc_obj(e, f);
+
+    // Commit to a single frame.
+    md->nb_enc_frames++;
+
+    // Frame re-encoding loop.
+    while (1)
+    {
%%% Ditto "wakeup".
+        // Avoid spurious wake up locking problems.
+        f265_mutex_lock(mutex);
+
+        // Init the first worker thread.
+        f265_enc_thread *t = md->enc_threads[0];
+        fenc_init_enc_thread(e, f, t, -1);
+
%%% its.
+        // Update it's status to BUSY.
+        t->status = F265_THREAD_BUSY;
+
+        // Update job listing.
+        es->next_section_idx = 0;
+
+        f265_mutex_unlock(mutex);
+
+        // Wake it up. It will wake up the others.
+        f265_cond_signal(&t->worker_cv);
+
+        // Wait for all the threads to finish. Test the statuses before sleeping:
+        // the workers may be done already, and their signal is not queued.
+        int busy_flag;
+        f265_mutex_lock(mutex);
+        while (1)
+        {
+            busy_flag = 0;
+            for (int n = 0; n < md->nb_enc_threads; n++)
+                busy_flag |= md->enc_threads[n]->status == F265_THREAD_BUSY;
+            if (!busy_flag) break;
+            f265_cond_wait(&es->master_cv, mutex);
+        }
+        f265_mutex_unlock(mutex);
+
+        // The main thread validates the results.
+        if (es->error_flag) return F265_RET_ERROR;
+        else if (es->abort_flag) return F265_RET_ABORT;
+        else
+        {
+            int res = fenc_check_enc_result(md->enc_threads[0], f);
+            if (res != F265_RET_RETRY) return res;
+        }
+    }
+}
+
+// Dispatch the frame encoding to the routine matching the multithreading mode.
+static int fenc_encode_frame(f265_enc *e, f265_frame *f)
+{
+    // FIXME. Consider WPP-based, frame-based and GOP-based modes.
+    int tile_mt_flag = e->gd.mt_mode == F265_MT_ENC_TILE;
+
+    // Every mode other than tile-based multithreading encodes sequentially.
+    if (tile_mt_flag) return fenc_encode_frame_mt_tile(e, f);
+    return fenc_encode_frame_mt_none(e, f);
+}
+
 // Encode the current section. Return VALID or ABORT.
 static int fenc_encode_section(f265_enc_thread *t, int section_idx)
 {
@@ -1606,7 +1952,8 @@ static void fenc_load_ctb(f265_enc_thread *t)
         t->rec_planes[4+i] = f->rec_planes[4+i] + (t->ctb_off[1]*stride>>sy) + (t->ctb_off[0]>>sx);
 
     // Set the intra availability.
-    t->intra_avail_cutoff[0] = F265_MIN(gd->pix_dim[0] - t->ctb_off[0], gd->ctb_size<<1);
+    int tile_cutoff = gd->ctb_size<<((t->ctb_neighbours>>2)&1);
+    t->intra_avail_cutoff[0] = F265_MIN(gd->pix_dim[0] - t->ctb_off[0], tile_cutoff);
     t->intra_avail_cutoff[1] = F265_MIN(gd->pix_dim[1] - t->ctb_off[1], gd->ctb_size);
 
     // Mark CBs unavailable by default. Optimize this eventually.
@@ -1940,24 +2287,14 @@ static double fenc_calc_lambda_chroma(int chroma_qp, int luma_qp, double luma_la
     return luma_lambda / weight;
 }
 
-// Assign the frame to the next idle encoding thread and prepare encoding.
-// Return the thread.
-static f265_enc_thread* fenc_set_enc_thread(f265_enc *e, f265_frame *f)
+// Transfer contextual information to the thread prior to encoding.
+static void fenc_init_enc_thread(f265_enc *e, f265_frame *f, f265_enc_thread *t, int qp)
 {
     f265_main_data *md = &e->md;
 
-    // Set the DPB.
-    fenc_set_frame_dpb(e, f);
-
-    // Link the encoding object.
-    fenc_link_enc_obj(e, f);
-
     // Get the previous frame.
     f265_frame *prev = md->enc_frames[0];
 
-    // We have one more encoding thread active.
-    f265_enc_thread *t = md->enc_threads[md->nb_enc_frames++];
-
     // Assign the frame.
     t->src_frame = f;
     f->frame_type = f->la_frame_type;
@@ -1969,8 +2306,8 @@ static f265_enc_thread* fenc_set_enc_thread(f265_enc *e, f265_frame *f)
     int deblock_flag = t->mflags&F265_PF_DEBLOCK && (t->mflags&F265_PF_DEBLOCK_ALL || f->gen_flags&F265_FF_REF);
     F265_SET_FLAG(f->gen_flags, F265_FF_DEBLOCK, deblock_flag);
 
-    // Set the frame QP.
-    f->qp = fenc_rc_frame_start(t, prev);
+    // Set the frame QP. Only use the rate control when the QP is unknown.
+    f->qp = qp == -1 ? fenc_rc_frame_start(t, prev) : qp;
 
     // Set the QP.
     t->qp[0] = f->qp;
@@ -1986,6 +2323,25 @@ static f265_enc_thread* fenc_set_enc_thread(f265_enc *e, f265_frame *f)
 
     // Build the inter configuration.
     fenc_set_frame_inter(t, f);
+}
+
+// Assign the frame to the next idle encoding thread and prepare encoding.
+// Return the thread.
+static f265_enc_thread* fenc_set_enc_thread(f265_enc *e, f265_frame *f)
+{
+    f265_main_data *md = &e->md;
+
+    // Set the DPB.
+    fenc_set_frame_dpb(e, f);
+
+    // Link the encoding object.
+    fenc_link_enc_obj(e, f);
+
+    // We have one more encoding thread active.
+    f265_enc_thread *t = md->enc_threads[md->nb_enc_frames++];
+
+    // Set up context information.
+    fenc_init_enc_thread(e, f, t, -1);
 
     return t;
 }
diff --git a/f265/enc.h b/f265/enc.h
index 409e2b1..0edb5a5 100644
--- a/f265/enc.h
+++ b/f265/enc.h
@@ -1951,6 +1951,24 @@ struct f265_enc_thread
     // is the reference list. The second index is the reference index plus 1.
     // The value is a unique identifier for the referenced frame, if any.
     int8_t deblock_frame_id_map[2][17];
+
+
+    // Threading.
+    #ifdef F265_HAVE_MT
+
+    // Actual execution thread.
+    f265_thread_handle thread_handle;
+
+    // Private communication channel between the master and this thread.
+    f265_cond_var worker_cv;
+
+    // Thread status (F265_THREAD_)BUSY, IDLE, EXIT.
+    int16_t status;
+
+    // State variables for proper cleanup.
+    int8_t handle_flag;
+    int8_t cv_flag;
+    #endif
 };
 
 // Invariant data shared by all threads in the encoder instance.
@@ -2045,6 +2063,9 @@ typedef struct f265_gen_data
     // Log 2 of the parallel merge level.
     int8_t parallel_merge_level;
 
+    // Tile layout.
+    int8_t tiles[2];
+
     // == Direct parameter import end ==
 
     // Initial luma QP.
@@ -2261,6 +2282,32 @@ typedef struct f265_lookahead
 
 } f265_lookahead;
 
+// Data shared between the main thread and the encoding threads.
+typedef struct f265_enc_sync
+{
+    #ifdef F265_HAVE_MT
+    // Mutex guarding the thread statuses, the job index and the flags below.
+    f265_mutex mutex;
+
+    // Condition variable signalled by the encoding threads to wake up the master
+    // thread.
+    f265_cond_var master_cv;
+
+    // Index of the next section to be processed. It may exceed the number of
+    // sections, so threads have to validate the value they fetch.
+    int next_section_idx;
+
+    // Communication flags between threads.
+    int8_t abort_flag;
+    int8_t error_flag;
+
+    // State variables for proper cleanup (1 if the object was initialized).
+    int8_t mutex_flag;
+    int8_t cv_flag;
+    #endif
+
+} f265_enc_sync;
+
 // Encoder instance.
 struct f265_enc
 {
@@ -2272,6 +2319,9 @@ struct f265_enc
 
     // Lookahead data.
     F265_ALIGN64 f265_lookahead la;
+
+    // Threading data.
+    F265_ALIGN64 f265_enc_sync es;
 };
 
 // Large tables initialized when the library is loaded. FIXME: change name, this
diff --git a/f265/f265.h b/f265/f265.h
index c80ef91..ab85419 100644
--- a/f265/f265.h
+++ b/f265/f265.h
@@ -277,6 +277,11 @@ typedef struct f265_enc_params
     // flags are hardcoded and not documented on purpose.
     uint64_t algo;
 
+    // Tiles.
+
+    // Layout as (columns,rows) pair.
+    int8_t tiles[2];
+
 } f265_enc_params;
 
 // Input frame data.
diff --git a/f265/parse.c b/f265/parse.c
index 7387820..f40be02 100644
--- a/f265/parse.c
+++ b/f265/parse.c
@@ -425,6 +425,23 @@ static void handle_param_algo(f265_parse_ctx *ctx, f265_enc_params *p, f265_pars
             p->algo |= (1<<(nb_flags-i-1));
 }
 
+static void handle_param_tiles(f265_parse_ctx *ctx, f265_enc_params *p, f265_parse_arg *a, int32_t nb_args)
+{
+    p->tiles[0] = a[0].i; // Columns.
+    p->tiles[1] = a[1].i; // Rows.
+}
+
+static void handle_param_mt_mode(f265_parse_ctx *ctx, f265_enc_params *p, f265_parse_arg *a, int32_t nb_args)
+{
+    p->mt_mode = a[0].i; // 0: no multithreading, 1: tile-based multithreading.
+}
+
+static void handle_param_nb_workers(f265_parse_ctx *ctx, f265_enc_params *p, f265_parse_arg *a, int32_t nb_args)
+{
+    p->nb_workers[0] = a[0].i; // Additional encoding threads.
+    p->nb_workers[1] = a[1].i; // Lookahead threads.
+}
+
 // Parameter dispatch table.
 static const f265_parse_entry f265_enc_params_table[] =
 {
@@ -475,6 +492,9 @@ static const f265_parse_entry f265_enc_params_table[] =
     { "lt-conv-min", handle_param_lt_conv_min, 1, 0 },
     { "lt-conv-exp", handle_param_lt_conv_exp, 1, 1 },
     { "algo", handle_param_algo, 1, 2 },
+    { "tiles", handle_param_tiles, 2, 0 },
+    { "mt-mode", handle_param_mt_mode, 1, 0 },
+    { "nb-workers", handle_param_nb_workers, 2, 0 },
 };
 
 void f265_parse_params(f265_enc_params *params, const char *param_string, char **error_handle)

Reply via email to