On Fri, Jul 17, 2015 at 2:07 AM, Luca Barbato <[email protected]> wrote:
> Enjoy some cache locality and use fewer threads.
> About 5x speedup (from 60ms to 12ms to decode a 4k frame).
> ---
>  libavcodec/hap.h    |  2 ++
>  libavcodec/hapdec.c | 33 +++++++++++++++++++++++++--------
>  2 files changed, 27 insertions(+), 8 deletions(-)
>
> diff --git a/libavcodec/hap.h b/libavcodec/hap.h
> index 1250a6f..75299fd 100644
> --- a/libavcodec/hap.h
> +++ b/libavcodec/hap.h
> @@ -46,6 +46,8 @@ typedef struct HapContext {
>      uint8_t *snappied;       /* Buffer interacting with snappy */
>      size_t max_snappy;       /* Maximum compressed size for snappy buffer */
>
> +    int slice_size;          /* Optimal slice size */
> +
>      /* Pointer to the selected compress or decompress function */
>      int (*tex_fun)(uint8_t *dst, ptrdiff_t stride, const uint8_t *block);
>  } HapContext;
> diff --git a/libavcodec/hapdec.c b/libavcodec/hapdec.c
> index 72db9f4..5133a51 100644
> --- a/libavcodec/hapdec.c
> +++ b/libavcodec/hapdec.c
> @@ -137,16 +137,30 @@ static int setup_texture(AVCodecContext *avctx, size_t length)
>  }
>
>  static int decompress_texture_thread(AVCodecContext *avctx, void *arg,
> -                                     int block_nb, int thread_nb)
> +                                     int slice, int thread_nb)
>  {
>      HapContext *ctx = avctx->priv_data;
>      AVFrame *frame = arg;
> -    int x = (TEXTURE_BLOCK_W * block_nb) % avctx->coded_width;
> -    int y = TEXTURE_BLOCK_H * (TEXTURE_BLOCK_W * block_nb / avctx->coded_width);
> -    uint8_t *p = frame->data[0] + x * 4 + y * frame->linesize[0];
> -    const uint8_t *d = ctx->tex_data + block_nb * ctx->tex_rat;
> +    const uint8_t *d = ctx->tex_data;
> +    int w_block = avctx->coded_width / TEXTURE_BLOCK_W;
> +    int x, y;
> +    int start_slice, end_slice;
> +
> +    start_slice = slice * ctx->slice_size;
> +    end_slice   = FFMIN(start_slice + ctx->slice_size, avctx->coded_height);
> +
> +    start_slice /= TEXTURE_BLOCK_H;
> +    end_slice   /= TEXTURE_BLOCK_H;
> +
> +    for (y = start_slice; y < end_slice; y++) {
> +        uint8_t *p = frame->data[0] + y * frame->linesize[0] * TEXTURE_BLOCK_H;
> +        int off  = y * w_block;
> +        for (x = 0; x < w_block; x++) {
> +            ctx->tex_fun(p + x * 16, frame->linesize[0],
> +                         d + (off + x) * ctx->tex_rat);
> +        }
> +    }
>
> -    ctx->tex_fun(p, frame->linesize[0], d);
>      return 0;
>  }
>
> @@ -156,7 +170,10 @@ static int hap_decode(AVCodecContext *avctx, void *data,
>      HapContext *ctx = avctx->priv_data;
>      ThreadFrame tframe;
>      int ret, length;
> -    int blocks = avctx->coded_width * avctx->coded_height / (TEXTURE_BLOCK_W * TEXTURE_BLOCK_H);
> +    int slices = FFMIN(avctx->thread_count,
> +                       avctx->coded_height / TEXTURE_BLOCK_H);
> +
> +    ctx->slice_size = avctx->coded_height / slices;
>
>      bytestream2_init(&ctx->gbc, avpkt->data, avpkt->size);
>
> @@ -180,7 +197,7 @@ static int hap_decode(AVCodecContext *avctx, void *data,
>      ff_thread_finish_setup(avctx);
>
>      /* Use the decompress function on the texture, one block per thread */
> -    avctx->execute2(avctx, decompress_texture_thread, tframe.f, NULL, blocks);
> +    avctx->execute2(avctx, decompress_texture_thread, tframe.f, NULL, slices);
>
>      /* Frame is ready to be output */
>      tframe.f->pict_type = AV_PICTURE_TYPE_I;

I don't see anything wrong with this approach.
Tom, have you got any comments?
-- 
Vittorio
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to