Yes, I'm happy with this. I'll submit my pthread_slice.c patch here for discussion - it seems silly that every codec has to adjust its work to the thread count when that could happen once in the codebase.
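Roughly what I have in mind, as a sketch only (the helper name ff_slice_thread_count() and the rows_per_slice parameter are invented here for discussion, they are not the actual patch): a small helper next to the slice-threading code that clamps the slice count once, so codecs like hapdec only have to describe their work units.

#include "libavutil/common.h"
#include "avcodec.h"

/* Hypothetical helper, not the real patch: pick a slice count that never
 * exceeds the configured thread count or the number of available work
 * units, so individual codecs no longer repeat this calculation. */
static int ff_slice_thread_count(AVCodecContext *avctx,
                                 int total_rows, int rows_per_slice)
{
    int max_slices = FFMAX(total_rows / rows_per_slice, 1);
    return FFMIN(avctx->thread_count, max_slices);
}

hap_decode() would then reduce to something like
slices = ff_slice_thread_count(avctx, avctx->coded_height, TEXTURE_BLOCK_H);
before the execute2() call.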
On 20 July 2015 at 18:47, Vittorio Giovara <[email protected]> wrote:
> On Fri, Jul 17, 2015 at 2:07 AM, Luca Barbato <[email protected]> wrote:
>> Enjoy some cache locality and use less threads.
>> About 5x speedup (from 60ms to 12ms to decode a 4k frame).
>> ---
>>  libavcodec/hap.h    |  2 ++
>>  libavcodec/hapdec.c | 33 +++++++++++++++++++++++++--------
>>  2 files changed, 27 insertions(+), 8 deletions(-)
>>
>> diff --git a/libavcodec/hap.h b/libavcodec/hap.h
>> index 1250a6f..75299fd 100644
>> --- a/libavcodec/hap.h
>> +++ b/libavcodec/hap.h
>> @@ -46,6 +46,8 @@ typedef struct HapContext {
>>      uint8_t *snappied;     /* Buffer interacting with snappy */
>>      size_t max_snappy;     /* Maximum compressed size for snappy buffer */
>>
>> +    int slice_size;        /* Optimal slice size */
>> +
>>      /* Pointer to the selected compress or decompress function */
>>      int (*tex_fun)(uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
>>  } HapContext;
>> diff --git a/libavcodec/hapdec.c b/libavcodec/hapdec.c
>> index 72db9f4..5133a51 100644
>> --- a/libavcodec/hapdec.c
>> +++ b/libavcodec/hapdec.c
>> @@ -137,16 +137,30 @@ static int setup_texture(AVCodecContext *avctx, size_t length)
>>  }
>>
>>  static int decompress_texture_thread(AVCodecContext *avctx, void *arg,
>> -                                     int block_nb, int thread_nb)
>> +                                     int slice, int thread_nb)
>>  {
>>      HapContext *ctx = avctx->priv_data;
>>      AVFrame *frame = arg;
>> -    int x = (TEXTURE_BLOCK_W * block_nb) % avctx->coded_width;
>> -    int y = TEXTURE_BLOCK_H * (TEXTURE_BLOCK_W * block_nb / avctx->coded_width);
>> -    uint8_t *p = frame->data[0] + x * 4 + y * frame->linesize[0];
>> -    const uint8_t *d = ctx->tex_data + block_nb * ctx->tex_rat;
>> +    const uint8_t *d = ctx->tex_data;
>> +    int w_block = avctx->coded_width / TEXTURE_BLOCK_W;
>> +    int x, y;
>> +    int start_slice, end_slice;
>> +
>> +    start_slice = slice * ctx->slice_size;
>> +    end_slice   = FFMIN(start_slice + ctx->slice_size, avctx->coded_height);
>> +
>> +    start_slice /= TEXTURE_BLOCK_H;
>> +    end_slice   /= TEXTURE_BLOCK_H;
>> +
>> +    for (y = start_slice; y < end_slice; y++) {
>> +        uint8_t *p = frame->data[0] + y * frame->linesize[0] * TEXTURE_BLOCK_H;
>> +        int off = y * w_block;
>> +        for (x = 0; x < w_block; x++) {
>> +            ctx->tex_fun(p + x * 16, frame->linesize[0],
>> +                         d + (off + x) * ctx->tex_rat);
>> +        }
>> +    }
>>
>> -    ctx->tex_fun(p, frame->linesize[0], d);
>>      return 0;
>>  }
>>
>> @@ -156,7 +170,10 @@ static int hap_decode(AVCodecContext *avctx, void *data,
>>      HapContext *ctx = avctx->priv_data;
>>      ThreadFrame tframe;
>>      int ret, length;
>> -    int blocks = avctx->coded_width * avctx->coded_height / (TEXTURE_BLOCK_W * TEXTURE_BLOCK_H);
>> +    int slices = FFMIN(avctx->thread_count,
>> +                       avctx->coded_height / TEXTURE_BLOCK_H);
>> +
>> +    ctx->slice_size = avctx->coded_height / slices;
>>
>>      bytestream2_init(&ctx->gbc, avpkt->data, avpkt->size);
>>
>> @@ -180,7 +197,7 @@ static int hap_decode(AVCodecContext *avctx, void *data,
>>      ff_thread_finish_setup(avctx);
>>
>>      /* Use the decompress function on the texture, one block per thread */
>> -    avctx->execute2(avctx, decompress_texture_thread, tframe.f, NULL, blocks);
>> +    avctx->execute2(avctx, decompress_texture_thread, tframe.f, NULL, slices);
>>
>>      /* Frame is ready to be output */
>>      tframe.f->pict_type = AV_PICTURE_TYPE_I;
>
> I don't see anything wrong with this approach.
> Tom, have you got any comments?
> --
> Vittorio
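For reference, here is how the slice maths in the patch above works out with some assumed numbers (a 4096x2160 frame, TEXTURE_BLOCK_H = 4 and thread_count = 8; the figures are mine, not from the patch):

    slices      = FFMIN(8, 2160 / 4)       = 8
    slice_size  = 2160 / 8                 = 270 pixel rows per slice

    last slice (slice == 7):
    start_slice = 7 * 270                  = 1890
    end_slice   = FFMIN(1890 + 270, 2160)  = 2160
    block rows  = [1890 / 4, 2160 / 4)     = [472, 540)

So all 540 block rows are covered even though 270 is not a multiple of TEXTURE_BLOCK_H, because adjacent slices truncate their shared boundary the same way.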
