Hi,
the patch looks good to me. I flagged some minor stuff but I don't think
that further review is required to commit the patch.
Thanks,
Laurent
%%% Change file format to UNIX before committing.
diff --git a/doc/params.txt b/doc/params.txt
index ea08232..26ed3b3 100644
--- a/doc/params.txt
+++ b/doc/params.txt
@@ -291,6 +291,27 @@ Minimum number of frames used to compute the ABR rate control convergence window
Exponent used to compute the ABR rate control convergence window.
+* tiles=COLS,ROWS. Default 1,1.
+
+Number of tile columns and rows, created using uniform spacing. At most 10
+columns and 11 rows are accepted; these limits assume level 5 or greater (up to
+level 5.2). The values actually used are MIN(COLS, picture width in CTBs) and
+MIN(ROWS, picture height in CTBs).
+
+
+* mt-mode=X. Default 0.
+
+Activate multithreading. Currently, only no multithreading (mt-mode=0) and tile-
+based multithreading (mt-mode=1) are supported.
+
+
+* nb-workers=X,Y. Default 0,0.
+
+When multithreading is activated, X indicates the number of additional threads
+dedicated to encoding, and Y indicates the number of threads used by the
+lookahead.
+
+
-------------------------------------------------------------------------------
Configurations and regression tests.
diff --git a/f265/bdi.c b/f265/bdi.c
index 2ca8d99..a7ca71d 100644
--- a/f265/bdi.c
+++ b/f265/bdi.c
@@ -180,6 +180,7 @@ void f265_set_default_params(f265_enc_params *p)
p->me_algo[1] = p->me_algo[2] = 1;
p->me_iter[0] = 16;
p->me_iter[1] = p->me_iter[2] = 1;
+ p->tiles[0] = p->tiles[1] = 1;
}
void f265_normalize_params(f265_enc_params *p)
@@ -188,7 +189,9 @@ void f265_normalize_params(f265_enc_params *p)
#define CLF(V) (V) = !!(V)
#define CLP(V) (V) = F265_MAX(0, (V))
for (int i = 0; i < 2; i++) CL(p->clip_dim[i], 1, 16384);
- CL(p->mt_mode, 0, 2);
+ // FIXME. Only tile multi-threading is supported. Update this when other
+ // modes are supported (WPP-based, frame-based, GOP-based, etc.).
+ CL(p->mt_mode, 0, 1);
CL(p->chroma_format, 0, 3);
for (int i = 0; i < 4; i++) CL(p->bit_depth[i], 8, 14);
CL(p->cb_range[0], 3, 6);
@@ -235,6 +238,10 @@ void f265_normalize_params(f265_enc_params *p)
CLP(p->frame_rate_num);
p->frame_rate_den = F265_MAX(p->frame_rate_den, 1);
CL(p->qp, 0, 51);
+ // Assume level is 5 or greater. FIXME. See table A1 in HEVC spec.
+ CL(p->tiles[0], 1, 10);
+ CL(p->tiles[1], 1, 11);
+ CLP(p->nb_workers[0]);
#undef CL
#undef CLF
#undef CLP
@@ -242,8 +249,11 @@ void f265_normalize_params(f265_enc_params *p)
// Make sure there is room for B frames in the lookahead.
p->la_decision_delay = F265_MAX(p->la_decision_delay, p->nb_b_frames);
- // Normalize the number of threads. FIXME.
- p->nb_workers[0] = p->nb_workers[1] = 0;
+ // Normalize the number of lookahead threads. FIXME.
+ p->nb_workers[1] = 0;
+
+ // Prevent WPP and Tiles at the same time. Favour Tiles over WPP.
+ if (p->tiles[0] > 1 || p->tiles[1] > 1) p->wpp_flag = 0;
}
// Dispatch by the parameters/encoder bit depth. We rely on the bit depth being
diff --git a/f265/bdi.h b/f265/bdi.h
index c638a2a..ec478f4 100644
--- a/f265/bdi.h
+++ b/f265/bdi.h
@@ -279,6 +279,9 @@
// Use wavefront parallel processing.
#define F265_PF_WPP (1<<18)
+// Use tiles.
+#define F265_PF_TILES (1<<19)
+
// Number of CABAC contexts.
#define F265_NB_CABAC_CTX 154
diff --git a/f265/bs.c b/f265/bs.c
index 82ab920..0d1993e 100644
--- a/f265/bs.c
+++ b/f265/bs.c
@@ -255,6 +255,7 @@ void fenc_write_pps(f265_vlc_bs *vbs, f265_enc *enc)
{
f265_gen_data *gd = &enc->gd;
int deblock_flag = F265_GET_FLAG(gd->eflags, F265_PF_DEBLOCK);
+ int tiles_enabled = F265_GET_FLAG(gd->eflags, F265_PF_TILES);
FENC_ANNOUNCE_VLC("PPS");
FENC_PUT_UE_V(vbs, 0, "pps_pic_parameter_set_id");
@@ -279,8 +280,19 @@ void fenc_write_pps(f265_vlc_bs *vbs, f265_enc *enc)
FENC_PUT_FLAG(vbs, 0, "weighted_pred_flag");
FENC_PUT_FLAG(vbs, 0, "weighted_bipred_flag");
FENC_PUT_FLAG(vbs, F265_GET_FLAG(gd->eflags, F265_PF_TRANSQUANT_BYPASS), "transquant_bypass_enable_flag");
- FENC_PUT_FLAG(vbs, 0, "tiles_enabled_flag");
+ FENC_PUT_FLAG(vbs, tiles_enabled, "tiles_enabled_flag");
FENC_PUT_FLAG(vbs, F265_GET_FLAG(gd->eflags, F265_PF_WPP), "entropy_coding_sync_enabled_flag");
+ if (tiles_enabled)
+ {
+ FENC_PUT_UE_V(vbs, gd->tiles[0]-1, "num_tile_columns_minus1");
+ FENC_PUT_UE_V(vbs, gd->tiles[1]-1, "num_tile_rows_minus1");
+
+ // FIXME. Change this if non-uniform spacing is supported.
+ FENC_PUT_FLAG(vbs, 1, "uniform_spacing_flag");
+
+ // FIXME. Change if tile boundaries are considered during deblocking.
+ FENC_PUT_FLAG(vbs, 1, "loop_filter_across_tiles_enabled_flag");
+ }
FENC_PUT_FLAG(vbs, 1, "loop_filter_across_slices_enabled_flag");
FENC_PUT_FLAG(vbs, !deblock_flag, "deblocking_filter_control_present_flag");
@@ -303,13 +315,20 @@ void fenc_write_slice_header(f265_vlc_bs *vbs, f265_enc *enc, f265_frame *f, int
uint32_t seg_flags, int ctb_xy, int nal_type)
{
f265_gen_data *gd = &enc->gd;
+ int first_flag = ctb_xy == 0;
int deblock_flag = F265_GET_FLAG(gd->eflags, F265_PF_DEBLOCK);
int tmv_flag = F265_GET_FLAG(enc->gd.eflags, F265_PF_TMV);
FENC_ANNOUNCE_VLC("Slice header");
- FENC_PUT_FLAG(vbs, 1, "first_slice_segment_in_pic_flag");
+ FENC_PUT_FLAG(vbs, first_flag, "first_slice_segment_in_pic_flag");
if (nal_type >= 16 && nal_type <= 23) FENC_PUT_FLAG(vbs, 0, "no_output_of_prior_pics_flag");
FENC_PUT_UE_V(vbs, 0, "slice_pic_parameter_set_id");
+ if (!first_flag)
+ {
+ // FIXME. Handle dependent segments.
+ // The address uses Ceil(Log2(nb_ctb)) bits. nb_ctb is at least 2 here if ctb_xy is a valid non-zero address.
+ FENC_PUT_BITS(vbs, ctb_xy, 32-__builtin_clz(gd->nb_ctb-1), "slice_segment_address");
+ }
FENC_PUT_UE_V(vbs, 2-f->frame_type, "slice_type");
if (!(f->gen_flags&F265_FF_IDR))
@@ -383,7 +402,7 @@ void fenc_write_slice_header(f265_vlc_bs *vbs, f265_enc *enc, f265_frame *f, int
if (deblock_flag) FENC_PUT_FLAG(vbs, 1, "slice_loop_filter_across_slices_enabled_flag");
- if (gd->eflags&F265_PF_WPP)
+ if (gd->eflags&(F265_PF_WPP|F265_PF_TILES))
{
FENC_PUT_UE_V(vbs, nb_chunks-1, "num_entry_point_offsets");
diff --git a/f265/enc.c b/f265/enc.c
index 67a3cbb..4d7c726 100644
--- a/f265/enc.c
+++ b/f265/enc.c
@@ -10,9 +10,11 @@
f265_large_tables f265_lt;
-static int fenc_encode_frame_mt_none(f265_enc *e, f265_frame *f);
+static void* fenc_encode_parallel_tile(void *args);
+static int fenc_encode_frame(f265_enc *e, f265_frame *f);
static int fenc_encode_section(f265_enc_thread *t, int section_idx);
static void fenc_encode_ctb(f265_enc_thread *t);
+static void fenc_init_enc_thread(f265_enc *e, f265_frame *f, f265_enc_thread *t, int qp);
static f265_enc_thread* fenc_set_enc_thread(f265_enc *e, f265_frame *f);
static int fenc_check_enc_result(f265_enc_thread *t, f265_frame *next_frame);
@@ -142,10 +144,6 @@ static void fenc_analyze_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, uint8
// Maximum number of slice headers in the frame. FIXME.
int64_t nb_slice_headers = 1;
- // number of sections in the frame. FIXME.
- d->nb_sections = 1;
-
-
// Chroma scaling factors (X shift, Y shift, YUV size numerator, YUV size
// denominator shift).
int csf_table[4][4] = { {0,0,1,0}, {1,1,3,1}, {1,0,2,0}, {0,0,3,0} };
@@ -166,6 +164,15 @@ static void fenc_analyze_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, uint8
}
d->nb_ctb = d->ctb_dim[0]*d->ctb_dim[1];
+ // Tiles and number of sections in the frame. A tile spans at least one CTB, so clamp to the CTB dimensions.
+ {
+ int tile_cols = F265_MIN(d->ctb_dim[0], p->tiles[0]);
+ int tile_rows = F265_MIN(d->ctb_dim[1], p->tiles[1]);
+ d->nb_sections = p->mt_mode == F265_MT_ENC_TILE ? tile_cols * tile_rows : 1;
+ p->tiles[0] = tile_cols;
+ p->tiles[1] = tile_rows;
+ }
+
// Frame 4x4/8x8 block dimensions.
int64_t b4_dim[2] = { d->pix_dim[0]>>2, d->pix_dim[1]>>2 };
int64_t nb_b4 = b4_dim[0]*b4_dim[1];
@@ -446,14 +453,19 @@ static void f265_compute_intra_avail_map(f265_gen_data *gd)
}
// Set the CTB segmentation flags.
-static void fenc_set_ctb_seg_flags(f265_enc *enc, f265_frame_ctb *ctb, int ctb_x, int ctb_y, int ctb_xy)
+static void fenc_set_ctb_seg_flags(f265_enc *enc, f265_frame_ctb *ctb, int ctb_x, int ctb_y,
+ int start_flag, int end_flag)
{
f265_gen_data *gd = &enc->gd;
ctb->seg_flags = 0;
- // Frame start/end.
- if (ctb_xy == 0) ctb->seg_flags |= F265_SC_CABAC_INIT;
- if (ctb_xy == gd->nb_ctb - 1) ctb->seg_flags |= F265_SC_SEGMENT_END|F265_SC_CHUNK_END;
+ // Chunk start/end.
+ if (start_flag) ctb->seg_flags |= F265_SC_CABAC_INIT;
+ if (end_flag) ctb->seg_flags |= F265_SC_CHUNK_END;
+
+ // Frame/Section end.
+ if ((ctb_x==gd->ctb_dim[0]-1 && ctb_y==gd->ctb_dim[1]-1) || (end_flag && gd->mt_mode==F265_MT_ENC_TILE))
+ ctb->seg_flags |= F265_SC_SEGMENT_END;
// WPP.
if (gd->eflags&F265_PF_WPP)
@@ -476,6 +488,131 @@ static void fenc_set_ctb_seg_flags(f265_enc *enc, f265_frame_ctb *ctb, int ctb_x
}
}
+// Set the motion estimation (me_bounds) and motion compensation (mc_bounds) limits of a CTB from its position.
+static void fenc_set_motion_bounds(f265_frame_ctb *ctb, int ctb_size, int pix_dim[2])
+{
+ // The quarterpel filter refers to 3 pixels left/above of the
+ // current pixel and 4 pixels right/below.
+ //
+ // Example: frame with 64+64+32=160 pixels, ctb_size=64.
+ // Let F265_LUMA_PLANE_PADDING=72, F265_SEARCH_OOB=0
+ // so that the MC and ME bounds are symmetrical.
+ // Padded pixels: left [-72..-1], right [160..231].
+ //
+ // CTB 1 has 64 pixels left, 160-64=96 pixels right.
+ // ME left: 64 - (64+72-4) = -68. Pixels [-68..-4].
+ // ME right: 64 + (96+72-4-64) = 164. Pixels [164..227].
+ //
+ // ME left * ME right
+ // | |<==>| | * | |<==>| |
+ // | <------+----*---------+-> |
+ // |4 64 4| 64 * 64 | 32 |4 64 4|
+ // | | |<==>| | |
+ // ^ ^ Block ^ ^
+ // -72 0 160 231
+
+ // Number of pixels in the frame left/above (l, t) and right/below (r, b) of the CTB origin.
+ int l = ctb->pos[0]*ctb_size, t = ctb->pos[1]*ctb_size, r = pix_dim[0]-l, b = pix_dim[1]-t;
+
+ // Motion estimation bounds.
+ int range = F265_MAX_MV_FPEL - F265_SEARCH_OOB;
+ int min_border = F265_LUMA_PLANE_PADDING - 4 - F265_SEARCH_OOB;
+ int max_border = min_border - 64;
+ ctb->me_bounds[0] = -(F265_MIN(range, l + min_border)<<2);
+ ctb->me_bounds[1] = -(F265_MIN(range, t + min_border)<<2);
+ ctb->me_bounds[2] = (F265_MIN(range, r + max_border)<<2);
+ ctb->me_bounds[3] = (F265_MIN(range, b + max_border)<<2);
+
+ // Motion compensation bounds. Shift before negating: left-shifting a negative value is undefined in C.
+ ctb->mc_bounds[0] = -((l + 64 + 3)<<2);
+ ctb->mc_bounds[1] = -((t + 64 + 3)<<2);
+ ctb->mc_bounds[2] = (r + 2)<<2;
+ ctb->mc_bounds[3] = (b + 2)<<2;
+}
+
+// Fill the CTB map and the section map of the frame map using uniformly spaced tiles. CTBs are stored in tile order.
+static void fenc_init_uniform_layout(f265_enc *enc, f265_frame_map *fmap)
+{
+ f265_gen_data *gd = &enc->gd;
+ f265_frame_ctb *ctb = fmap->ctb_map;
+
+ // Frame size, in pixels and in CTBs.
+ int pix_dim[2] = { gd->pix_dim[0], gd->pix_dim[1] };
+ int ctb_dim[2] = { gd->ctb_dim[0], gd->ctb_dim[1] };
+
+ // Tile layout (columns, rows).
+ int tiles_dim[2] = { gd->tiles[0], gd->tiles[1] };
+
+ // CTB coordinates of the top-left corner of the current tile.
+ int ctb_pos[2] = { 0, 0 };
+
+ // Largest coding unit size.
+ int ctb_size = gd->ctb_size;
+
+ // Section map. When there are several sections, each tile is a section.
+ int section_idx = 0;
+ int ctb_count = 0;
+ int sections_flag = fmap->nb_sections > 1;
+
+ // Otherwise, assume a single section covering the whole frame, with multiple entry points.
+ if (!sections_flag)
+ {
+ fmap->section_map[section_idx][0] = 0;
+ fmap->section_map[section_idx][1] = gd->nb_ctb;
+ }
+
+ for (int i = 0; i < tiles_dim[1]; i++)
+ {
+ int tile_height = ((i+1)*ctb_dim[1])/tiles_dim[1] - (i*ctb_dim[1])/tiles_dim[1];
+
+ // Reset to the left-hand side column.
+ ctb_pos[0] = 0;
+
+ for (int j = 0; j < tiles_dim[0]; j++)
+ {
+ int tile_width = ((j+1)*ctb_dim[0])/tiles_dim[0] - (j*ctb_dim[0])/tiles_dim[0];
+
+ for (int y = 0; y < tile_height; y++)
+ {
+ int ctb_y = ctb_pos[1] + y;
+ for (int x = 0; x < tile_width; x++, ctb++)
+ {
+ // Position in the frame, in CTBs.
+ int ctb_x = ctb_pos[0] + x;
+ ctb->pos[0] = ctb_x;
+ ctb->pos[1] = ctb_y;
+
+ // Neighbours (D, B, C, A). Only CTBs inside the current tile are marked as present.
+ ctb->neighbours = ((y>0&&x>0)<<0) | ((y>0)<<1) | ((y>0&&x<tile_width-1)<<2) | ((x>0)<<3);
+
+ // Segmentation flags. The start/end flags identify the first/last CTB of the tile.
+ int start_flag = x==0 && y==0;
+ int end_flag = x==(tile_width-1) && y==(tile_height-1);
+ fenc_set_ctb_seg_flags(enc, ctb, ctb_x, ctb_y, start_flag, end_flag);
+
+ // Motion bounds.
+ fenc_set_motion_bounds(ctb, ctb_size, pix_dim);
+ }
+ }
+
+ if (sections_flag)
+ {
+ int section_size = tile_height*tile_width;
+ fmap->section_map[section_idx][0] = ctb_count;
+ fmap->section_map[section_idx][1] = section_size;
+
+ // Keep track of the tile sizes. That's how we compute the
+ // offsets that mark the start of each section in the map.
+ ctb_count += section_size;
+ section_idx++;
+ }
+
+ ctb_pos[0] += tile_width;
+ }
+ ctb_pos[1] += tile_height;
+ }
+}
+
// Initialize the encoder memory.
static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **error_handle)
{
@@ -506,6 +643,7 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
gd->pcm_range[i] = p->pcm_range[i];
gd->tb_range[i] = p->tb_range[i];
gd->tb_depth[i] = p->tb_depth[i];
+ gd->tiles[i] = p->tiles[i];
}
gd->qg_log = p->qg_log;
gd->nb_refs = p->nb_refs;
@@ -553,6 +691,7 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
SET_PF(F265_PF_ALLOW_CTB_RECODE, p->allow_recode_flag[1]);
SET_PF(F265_PF_RDOQ, p->rdoq_flag);
SET_PF(F265_PF_WPP, p->wpp_flag);
+ SET_PF(F265_PF_TILES, (p->tiles[0]>1)|(p->tiles[1]>1));
#undef SET_PF
// Chroma QP table. Taken from the spec. Assumes BD 8.
@@ -577,69 +716,10 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
uint8_t *o = (uint8_t*)fmap; o += sizeof(f265_frame_map);
fmap->section_map = (int(*)[2])o; o += d->nb_sections*4*2;
fmap->ctb_map = (f265_frame_ctb*)o; o += d->nb_ctb*sizeof(f265_frame_ctb);
- int pix_dim[2] = { gd->pix_dim[0], gd->pix_dim[1] };
- int ctb_size = gd->ctb_size;
-
- // FIXME: we assume one tile/slice per thread.
fmap->nb_sections = d->nb_sections;
- fmap->section_map[0][0] = 0;
- fmap->section_map[0][1] = gd->nb_ctb;
- for (int y = 0; y < gd->ctb_dim[1]; y++)
- {
- for (int x = 0; x < gd->ctb_dim[0]; x++)
- {
- int ctb_xy = y*gd->ctb_dim[0] + x;
- f265_frame_ctb *ctb = fmap->ctb_map + ctb_xy;
-
- // Position.
- ctb->pos[0] = x;
- ctb->pos[1] = y;
-
- // Neighbours (D, B, C, A).
- ctb->neighbours = ((y>0&&x>0)<<0) | ((y>0)<<1) | ((y>0&&x<gd->ctb_dim[0]-1)<<2) | ((x>0)<<3);
-
- // Set the CTB segmentation flags.
- fenc_set_ctb_seg_flags(enc, ctb, x, y, ctb_xy);
-
- // The quarterpel filter refers to 3 pixels left/above of the
- // current pixel and 4 pixels right/below.
- //
- // Example: frame with 64+64+32=160 pixels, ctb_size=64.
- // Let F265_LUMA_PLANE_PADDING=72, F265_SEARCH_OOB=0
- // so that the MC and ME bounds are symmetrical.
- // Padded pixels: left [-72..-1], right [160..231].
- //
- // CTB 1 has 64 pixels left, 160-64=96 pixels right.
- // ME left: 64 - (64+72-4) = -68. Pixels [-68..-4].
- // ME right: 64 + (96+72-4-64) = 164. Pixels [164..227].
- //
- // ME left * ME right
- // | |<==>| | * | |<==>| |
- // | <------+----*---------+-> |
- // |4 64 4| 64 * 64 | 32 |4 64 4|
- // | | |<==>| | |
- // ^ ^ Block ^ ^
- // -72 0 160 131
-
- // Number of pixels in the frame around the CTB.
- int l = x*ctb_size, t = y*ctb_size, r = pix_dim[0]-l, b = pix_dim[1]-t;
-
- // Motion estimation bounds.
- int range = F265_MAX_MV_FPEL - F265_SEARCH_OOB;
- int min_border = F265_LUMA_PLANE_PADDING - 4 - F265_SEARCH_OOB;
- int max_border = min_border - 64;
- ctb->me_bounds[0] = -(F265_MIN(range, l + min_border)<<2);
- ctb->me_bounds[1] = -(F265_MIN(range, t + min_border)<<2);
- ctb->me_bounds[2] = (F265_MIN(range, r + max_border)<<2);
- ctb->me_bounds[3] = (F265_MIN(range, b + max_border)<<2);
-
- // Motion compensation bounds.
- ctb->mc_bounds[0] = -(l + 64 + 3)<<2;
- ctb->mc_bounds[1] = -(t + 64 + 3)<<2;
- ctb->mc_bounds[2] = (r + 2)<<2;
- ctb->mc_bounds[3] = (b + 2)<<2;
- }
- }
+
+ // FIXME: we only support uniform spacing.
+ fenc_init_uniform_layout(enc, fmap);
}
// Main data.
@@ -699,6 +779,34 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
la->nb_committed = 0;
la->nb_b_frames = p->nb_b_frames;
+ // Multithreading.
+ f265_enc_sync *es = &enc->es;
+ #ifdef F265_HAVE_MT
+ if (gd->mt_mode)
+ {
+ // The encoder synchronization structure is zeroed during startup. Zero
+ // values are expected when pthread functions run smoothly. Any other
+ // value means an error occurred. For simplicity, we negate the returned
+ // value so that 1 represents a success and 0, an error.
+
+ // Initialize public communication channel with the master thread.
+ es->cv_flag = !f265_cond_init(&es->master_cv);
+ if (!es->cv_flag)
+ {
+ *error_handle = "could not init conditional variable";
+ return;
+ }
+
+ // Initialize shared mutex for master and worker threads.
+ es->mutex_flag = !f265_mutex_init(&es->mutex);
+ if (!es->mutex_flag)
+ {
+ *error_handle = "could not init mutex";
+ return;
+ }
+ }
+ #endif
+
// Encoding thread objects.
for (int i = 0; i < d->nb_enc_threads; i++)
{
@@ -728,6 +836,36 @@ static void fenc_init_enc_mem(f265_enc_params *p, f265_enc_mem_data *d, char **e
// considered unsplit.
t->cb[F265_UNAVAIL_CB_IDX].lg_bs = 6;
t->cb[F265_UNAVAIL_CB_IDX].enc_idx = 127;
+
+ // When using multithreading, start a worker that will immediately go to
+ // sleep. Wake it up later on.
+ #ifdef F265_HAVE_MT
+ if (gd->mt_mode)
+ {
+ // The f265_enc_thread instances are zeroed during startup. For
+ // proper cleanup, pthread return values are negated so that 1 means
+ // success and 0, failure. These values are used when releasing the
+ // encoding resources to avoid unnecessary calls to pthread destroy
+ // functions.
+
+ // Init private communication channel.
+ t->cv_flag = !f265_cond_init(&t->worker_cv);
+ if (!t->cv_flag)
+ {
+ *error_handle = "error creating conditional variable";
+ return;
+ }
+
+ // Start the next thread.
+ t->status = F265_THREAD_IDLE;
%%% Extra space after '! '.
+ t->handle_flag = ! f265_thread_create(&t->thread_handle, fenc_encode_parallel_tile, t);
+ if (!t->handle_flag)
+ {
+ *error_handle = "error creating thread";
+ return;
+ }
+ }
+ #endif
}
// Frame objects.
@@ -825,6 +963,44 @@ void fenc_deinit_enc(f265_enc *enc)
// To prevent valgrind errors.
if (van_an_dump_file) fclose(van_an_dump_file);
#endif
+
+ #ifdef F265_HAVE_MT
+ if (gd->mt_mode)
+ {
+ f265_main_data *md = &enc->md;
+ f265_enc_sync *es = &enc->es;
+
+ // Terminate running threads. Check flags to avoid releasing
+ // uninitialized resources.
+ for (int n = 0; n < md->nb_enc_threads; n++)
+ {
+ f265_enc_thread *t = md->enc_threads[n];
+
+ if (t->handle_flag)
+ {
+ // Update its status.
+ f265_mutex_lock(&es->mutex);
+ t->status = F265_THREAD_EXIT;
+ f265_mutex_unlock(&es->mutex);
+
+ // Wake up worker thread so it can exit properly.
+ f265_cond_signal(&t->worker_cv);
+
+ // Wait for proper exit.
+ f265_thread_join(t->thread_handle);
+ }
+
+ // Close private communication channel.
+ if (t->cv_flag) f265_cond_deinit(&t->worker_cv);
+ }
+
+ // Close shared communication channel.
+ if (es->cv_flag) f265_cond_deinit(&es->master_cv);
+
+ // Release shared mutex.
+ if (es->mutex_flag) f265_mutex_deinit(&es->mutex);
+ }
+ #endif
}
void fenc_analyze_params(f265_enc_params *params)
@@ -937,7 +1113,7 @@ int fenc_process_enc_req(f265_enc *e, f265_enc_req *req)
}
// Encode the frame, if any.
- res = fenc_encode_frame_mt_none(e, next_frame);
+ res = fenc_encode_frame(e, next_frame);
} while (0);
@@ -1158,14 +1334,184 @@ static int fenc_encode_frame_mt_none(f265_enc *e, f265_frame *f)
// Frame re-encoding loop.
while (1)
{
- // FIXME: assuming one section.
- int res = fenc_encode_section(t, 0);
- if (res == F265_RET_ABORT) return res;
- res = fenc_check_enc_result(t, f);
+ for (int section_idx = 0; section_idx < f->fmap->nb_sections; section_idx++)
+ if (fenc_encode_section(t, section_idx) == F265_RET_ABORT) return F265_RET_ABORT;
+
+ int res = fenc_check_enc_result(t, f);
if (res != F265_RET_RETRY) return res;
}
}
+// Worker thread entry point (args is the worker's f265_enc_thread). Encode tiles until none left, then sleep again.
+static void* fenc_encode_parallel_tile(void *args)
+{
+ f265_enc_thread *t = (f265_enc_thread*)args;
%%% s/0/NULL => some compilers might warn about that.
+ f265_enc_thread *next = 0;
+ f265_enc *e = t->enc;
+ f265_main_data *md = &e->md;
+ f265_enc_sync *es = &t->enc->es;
+ f265_mutex *mutex = &es->mutex;
+
+ // Check for colleague. NOTE(review): t+1 assumes the thread objects are contiguous in memory - confirm.
+ if (t - md->enc_threads[0] < md->nb_enc_threads-1) next = t+1;
+
+ // Worker loop.
+ int exit_flag = 0;
+ while (!exit_flag)
+ {
%%% Judging from a quick search it's usually spelled "spurious wakeup".
+ // Wait for work. Prevent spurious wake up.
+ int status;
+ f265_mutex_lock(mutex);
+
+ // Test the status before sleeping: a wake up call sent before the wait starts is not remembered.
+ while ((status = t->status) == F265_THREAD_IDLE)
+ {
+ // When woken up, loop to make sure the thread is no longer set to IDLE.
+ f265_cond_wait(&t->worker_cv, mutex);
+ }
+
+ // Wake up colleague.
+ if (status == F265_THREAD_BUSY && next)
+ {
%%% The fact that the main thread is already working during this initialization scares me a little.
%%% But it should work... eventually we can revisit that logic.
%%% The thread should probably initialize its own data to avoid unneeded CL transfers.
%%% Add a note for this please?
+ // Get colleague thread ready for work.
+ fenc_init_enc_thread(e, t->src_frame, next, t->qp[0]);
+ next->status = F265_THREAD_BUSY;
+
+ // Wake up call.
+ f265_cond_signal(&next->worker_cv);
+ }
+
+ // Stop hogging the mutex. Take a breather.
%%% Heh.
+ f265_mutex_unlock(mutex);
+
+ // Tile encoding loop.
+ do
+ {
%%% Unlock-lock back-to-back. That could be optimized by inverting the lock logic eventually.
%%% Add a note please?
+ // Fetch next available tile.
+ f265_mutex_lock(mutex);
+ int section_idx = es->next_section_idx++;
+ exit_flag = es->abort_flag | es->error_flag | (status == F265_THREAD_EXIT);
+ f265_mutex_unlock(mutex);
+
+ // Stop work loop entirely if any thread aborted, an error occurred,
+ // or exit was requested.
+ if (exit_flag) break;
+
+ // Pause when all other tiles have been, or are being processed.
+ if (section_idx >= t->src_frame->fmap->nb_sections)
+ {
+ // Update status.
+ f265_mutex_lock(mutex);
+ status = t->status = F265_THREAD_IDLE;
+ f265_mutex_unlock(mutex);
+
+ // Notify master thread.
+ f265_cond_signal(&es->master_cv);
%%% Extra white line.
+
+ }
+
+ // Process tile. Signal any problems to coworkers.
+ else
+ {
+ int res = fenc_encode_section(t, section_idx);
+ if (res == F265_RET_ABORT)
+ {
+ f265_mutex_lock(mutex);
+ es->abort_flag = 1;
+ f265_mutex_unlock(mutex);
+ }
+ }
+ } while (status == F265_THREAD_BUSY);
%%% Ditto.
+
+ }
+
+ // Clean exit. Force EXIT status in case this was caused by an abort signal
+ // or an error during encoding.
+ f265_mutex_lock(mutex);
+ t->status = F265_THREAD_EXIT;
+ f265_mutex_unlock(mutex);
+
+ // Tell the master thread about it.
+ f265_cond_signal(&es->master_cv);
+
+ return NULL;
+}
+
+// Have the main thread use the worker threads. The main thread then makes sure
+// the result is acceptable. The input frame cannot be null. Returns VALID,
+// ABORT or ERROR.
+static int fenc_encode_frame_mt_tile(f265_enc *e, f265_frame *f)
+{
+ f265_main_data *md = &e->md;
+ f265_enc_sync *es = &e->es;
+ f265_mutex *mutex = &es->mutex;
+
+ // Set the DPB.
+ fenc_set_frame_dpb(e, f);
+
+ // Link the encoding object.
+ fenc_link_enc_obj(e, f);
+
+ // Commit to a single frame.
+ md->nb_enc_frames++;
+
+ // Frame re-encoding loop.
+ while (1)
+ {
%%% Ditto "wakeup".
+ // Avoid spurious wake up locking problems.
+ f265_mutex_lock(mutex);
+
+ // Init the first worker thread.
+ f265_enc_thread *t = md->enc_threads[0];
+ fenc_init_enc_thread(e, f, t, -1);
+
%%% its.
+ // Update it's status to BUSY.
+ t->status = F265_THREAD_BUSY;
+
+ // Update job listing.
+ es->next_section_idx = 0;
+
+ f265_mutex_unlock(mutex);
+
+ // Wake it up. It will wake up the others.
+ f265_cond_signal(&t->worker_cv);
+
+ // Wait for all the threads to finish. Check the statuses before sleeping: the workers may already be done.
+ int busy_flag;
+ f265_mutex_lock(mutex);
+ while (1)
+ {
+ busy_flag = 0;
+ for (int n = 0; n < md->nb_enc_threads; n++)
+ busy_flag |= md->enc_threads[n]->status == F265_THREAD_BUSY;
+ if (!busy_flag) break;
+
+ f265_cond_wait(&es->master_cv, mutex);
+ }
+ f265_mutex_unlock(mutex);
+
+ // The main thread validates the results.
+ if (es->error_flag) return F265_RET_ERROR;
+ else if (es->abort_flag) return F265_RET_ABORT;
+ else
+ {
+ int res = fenc_check_enc_result(md->enc_threads[0], f);
+ if (res != F265_RET_RETRY) return res;
+ }
+ }
+}
+
+// Forwarding function based on multi-threading mode.
+static int fenc_encode_frame(f265_enc *e, f265_frame *f)
+{
+ int res;
+ f265_gen_data *gd = &e->gd;
+ // FIXME. Consider WPP-based, frame-based and GOP-based modes.
+ if (gd->mt_mode == F265_MT_ENC_TILE) res = fenc_encode_frame_mt_tile(e, f);
+ else res = fenc_encode_frame_mt_none(e, f);
+ return res;
+}
+
// Encode the current section. Return VALID or ABORT.
static int fenc_encode_section(f265_enc_thread *t, int section_idx)
{
@@ -1606,7 +1952,8 @@ static void fenc_load_ctb(f265_enc_thread *t)
t->rec_planes[4+i] = f->rec_planes[4+i] + (t->ctb_off[1]*stride>>sy) + (t->ctb_off[0]>>sx);
// Set the intra availability.
- t->intra_avail_cutoff[0] = F265_MIN(gd->pix_dim[0] - t->ctb_off[0], gd->ctb_size<<1);
+ int tile_cutoff = gd->ctb_size<<((t->ctb_neighbours>>2)&1);
+ t->intra_avail_cutoff[0] = F265_MIN(gd->pix_dim[0] - t->ctb_off[0], tile_cutoff);
t->intra_avail_cutoff[1] = F265_MIN(gd->pix_dim[1] - t->ctb_off[1], gd->ctb_size);
// Mark CBs unavailable by default. Optimize this eventually.
@@ -1940,24 +2287,14 @@ static double fenc_calc_lambda_chroma(int chroma_qp, int luma_qp, double luma_la
return luma_lambda / weight;
}
-// Assign the frame to the next idle encoding thread and prepare encoding.
-// Return the thread.
-static f265_enc_thread* fenc_set_enc_thread(f265_enc *e, f265_frame *f)
+// Transfer contextual information to the thread prior to encoding.
+static void fenc_init_enc_thread(f265_enc *e, f265_frame *f, f265_enc_thread *t, int qp)
{
f265_main_data *md = &e->md;
- // Set the DPB.
- fenc_set_frame_dpb(e, f);
-
- // Link the encoding object.
- fenc_link_enc_obj(e, f);
-
// Get the previous frame.
f265_frame *prev = md->enc_frames[0];
- // We have one more encoding thread active.
- f265_enc_thread *t = md->enc_threads[md->nb_enc_frames++];
-
// Assign the frame.
t->src_frame = f;
f->frame_type = f->la_frame_type;
@@ -1969,8 +2306,8 @@ static f265_enc_thread* fenc_set_enc_thread(f265_enc *e, f265_frame *f)
int deblock_flag = t->mflags&F265_PF_DEBLOCK && (t->mflags&F265_PF_DEBLOCK_ALL || f->gen_flags&F265_FF_REF);
F265_SET_FLAG(f->gen_flags, F265_FF_DEBLOCK, deblock_flag);
- // Set the frame QP.
- f->qp = fenc_rc_frame_start(t, prev);
+ // Set the frame QP. Only use the rate control when the QP is unknown.
+ f->qp = qp == -1 ? fenc_rc_frame_start(t, prev) : qp;
// Set the QP.
t->qp[0] = f->qp;
@@ -1986,6 +2323,25 @@ static f265_enc_thread* fenc_set_enc_thread(f265_enc *e, f265_frame *f)
// Build the inter configuration.
fenc_set_frame_inter(t, f);
+}
+
+// Assign the frame to the next idle encoding thread and prepare encoding.
+// Return the thread.
+static f265_enc_thread* fenc_set_enc_thread(f265_enc *e, f265_frame *f)
+{
+ f265_main_data *md = &e->md;
+
+ // Set the DPB.
+ fenc_set_frame_dpb(e, f);
+
+ // Link the encoding object.
+ fenc_link_enc_obj(e, f);
+
+ // We have one more encoding thread active.
+ f265_enc_thread *t = md->enc_threads[md->nb_enc_frames++];
+
+ // Set up context information.
+ fenc_init_enc_thread(e, f, t, -1);
return t;
}
diff --git a/f265/enc.h b/f265/enc.h
index 409e2b1..0edb5a5 100644
--- a/f265/enc.h
+++ b/f265/enc.h
@@ -1951,6 +1951,24 @@ struct f265_enc_thread
// is the reference list. The second index is the reference index plus 1.
// The value is a unique identifier for the referenced frame, if any.
int8_t deblock_frame_id_map[2][17];
+
+
+ // Threading.
+ #ifdef F265_HAVE_MT
+
+ // Actual execution thread.
+ f265_thread_handle thread_handle;
+
+ // Private communication channel between the master and this thread.
+ f265_cond_var worker_cv;
+
+ // Thread status (F265_THREAD_)BUSY, IDLE, EXIT.
+ int16_t status;
+
+ // State variables for proper cleanup.
+ int8_t handle_flag;
+ int8_t cv_flag;
+ #endif
};
// Invariant data shared by all threads in the encoder instance.
@@ -2045,6 +2063,9 @@ typedef struct f265_gen_data
// Log 2 of the parallel merge level.
int8_t parallel_merge_level;
+ // Tile layout.
+ int8_t tiles[2];
+
// == Direct parameter import end ==
// Initial luma QP.
@@ -2261,6 +2282,32 @@ typedef struct f265_lookahead
} f265_lookahead;
+// Data shared between the main thread and the encoding threads.
+typedef struct f265_enc_sync
+{
+ #ifdef F265_HAVE_MT
+ // Lock mechanism to query available job and communication flags.
+ f265_mutex mutex;
+
+ // Shared communication channel between all encoding threads and the master
+ // thread.
+ f265_cond_var master_cv;
+
+ // Index of the next section to be processed.
+ // Threads have to validate that the value exists.
+ int next_section_idx;
+
+ // Communication flags between threads.
+ int8_t abort_flag;
+ int8_t error_flag;
+
+ // State variables for proper cleanup.
+ int8_t mutex_flag;
+ int8_t cv_flag;
+ #endif
+
+} f265_enc_sync;
+
// Encoder instance.
struct f265_enc
{
@@ -2272,6 +2319,9 @@ struct f265_enc
// Lookahead data.
F265_ALIGN64 f265_lookahead la;
+
+ // Threading data.
+ F265_ALIGN64 f265_enc_sync es;
};
// Large tables initialized when the library is loaded. FIXME: change name, this
diff --git a/f265/f265.h b/f265/f265.h
index c80ef91..ab85419 100644
--- a/f265/f265.h
+++ b/f265/f265.h
@@ -277,6 +277,11 @@ typedef struct f265_enc_params
// flags are hardcoded and not documented on purpose.
uint64_t algo;
+ // Tiles.
+
+ // Layout as (columns,rows) pair.
+ int8_t tiles[2];
+
} f265_enc_params;
// Input frame data.
diff --git a/f265/parse.c b/f265/parse.c
index 7387820..f40be02 100644
--- a/f265/parse.c
+++ b/f265/parse.c
@@ -425,6 +425,23 @@ static void handle_param_algo(f265_parse_ctx *ctx, f265_enc_params *p, f265_pars
p->algo |= (1<<(nb_flags-i-1));
}
+static void handle_param_tiles(f265_parse_ctx *ctx, f265_enc_params *p, f265_parse_arg *a, int32_t nb_args)
+{
+ p->tiles[0] = a[0].i;
+ p->tiles[1] = a[1].i;
+}
+
+static void handle_param_mt_mode(f265_parse_ctx *ctx, f265_enc_params *p, f265_parse_arg *a, int32_t nb_args)
+{
+ p->mt_mode = a[0].i;
+}
+
+static void handle_param_nb_workers(f265_parse_ctx *ctx, f265_enc_params *p, f265_parse_arg *a, int32_t nb_args)
+{
+ p->nb_workers[0] = a[0].i;
+ p->nb_workers[1] = a[1].i;
+}
+
// Parameter dispatch table.
static const f265_parse_entry f265_enc_params_table[] =
{
@@ -475,6 +492,9 @@ static const f265_parse_entry f265_enc_params_table[] =
{ "lt-conv-min", handle_param_lt_conv_min, 1, 0 },
{ "lt-conv-exp", handle_param_lt_conv_exp, 1, 1 },
{ "algo", handle_param_algo, 1, 2 },
+ { "tiles", handle_param_tiles, 2, 0 },
+ { "mt-mode", handle_param_mt_mode, 1, 0 },
+ { "nb-workers", handle_param_nb_workers, 2, 0 },
};
void f265_parse_params(f265_enc_params *params, const char *param_string, char **error_handle)