[FFmpeg-devel] [PATCH] Parallelize vf_lut

2019-02-28 Thread Britt Cyr
This will use ff_filter_get_nb_threads(ctx) threads, which was 4x
faster when I tested it on a 4K video.
---
 libavfilter/vf_lut.c | 106 ---
 1 file changed, 70 insertions(+), 36 deletions(-)

diff --git a/libavfilter/vf_lut.c b/libavfilter/vf_lut.c
index c815ddc194..9e5527e4a1 100644
--- a/libavfilter/vf_lut.c
+++ b/libavfilter/vf_lut.c
@@ -72,6 +72,12 @@ typedef struct LutContext {
 int negate_alpha; /* only used by negate */
 } LutContext;
 
+typedef struct ThreadData {
+AVFrame *in;
+AVFrame *out;
+AVFilterLink *link;
+} ThreadData;
+
 #define Y 0
 #define U 1
 #define V 2
@@ -337,26 +343,13 @@ static int config_props(AVFilterLink *inlink)
 return 0;
 }
 
-static int filter_frame(AVFilterLink *inlink, AVFrame *in)
-{
-AVFilterContext *ctx = inlink->dst;
+static int lookup_slice(AVFilterContext *ctx, void *arg, int jobnr, int 
nb_jobs) {
 LutContext *s = ctx->priv;
-AVFilterLink *outlink = ctx->outputs[0];
-AVFrame *out;
-int i, j, plane, direct = 0;
-
-if (av_frame_is_writable(in)) {
-direct = 1;
-out = in;
-} else {
-out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
-if (!out) {
-av_frame_free(&in);
-return AVERROR(ENOMEM);
-}
-av_frame_copy_props(out, in);
-}
-
+int i, j, plane = 0;
+const ThreadData *td = arg;
+const AVFrame *in  = td->in;
+AVFrame *out = td->out;
+const AVFilterLink *inlink = td->link;
 if (s->is_rgb && s->is_16bit && !s->is_planar) {
 /* packed, 16-bit */
 uint16_t *inrow, *outrow, *inrow0, *outrow0;
@@ -366,11 +359,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 const int in_linesize  =  in->linesize[0] / 2;
 const int out_linesize = out->linesize[0] / 2;
 const int step = s->step;
+const int row_min = jobnr / nb_jobs * h;
+const int row_max = (jobnr + 1) / nb_jobs * h;
 
 inrow0  = (uint16_t*) in ->data[0];
 outrow0 = (uint16_t*) out->data[0];
 
-for (i = 0; i < h; i ++) {
+for (i = row_min; i < row_max; i ++) {
 inrow  = inrow0;
 outrow = outrow0;
 for (j = 0; j < w; j++) {
@@ -403,11 +398,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 const int in_linesize  =  in->linesize[0];
 const int out_linesize = out->linesize[0];
 const int step = s->step;
+const int row_min = jobnr / nb_jobs * h;
+const int row_max = (jobnr + 1) / nb_jobs * h;
 
 inrow0  = in ->data[0];
 outrow0 = out->data[0];
 
-for (i = 0; i < h; i ++) {
+for (i = row_min; i < row_max; i ++) {
 inrow  = inrow0;
 outrow = outrow0;
 for (j = 0; j < w; j++) {
@@ -435,11 +432,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 const uint16_t *tab = s->lut[plane];
 const int in_linesize  =  in->linesize[plane] / 2;
 const int out_linesize = out->linesize[plane] / 2;
+const int row_min = jobnr / nb_jobs * h;
+const int row_max = (jobnr + 1) / nb_jobs * h;
 
 inrow  = (uint16_t *)in ->data[plane];
 outrow = (uint16_t *)out->data[plane];
 
-for (i = 0; i < h; i++) {
+for (i = row_min; i < row_max; i++) {
 for (j = 0; j < w; j++) {
 #if HAVE_BIGENDIAN
 outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
@@ -463,11 +462,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 const uint16_t *tab = s->lut[plane];
 const int in_linesize  =  in->linesize[plane];
 const int out_linesize = out->linesize[plane];
+const int row_min = jobnr / nb_jobs * h;
+const int row_max = (jobnr + 1) / nb_jobs * h;
 
 inrow  = in ->data[plane];
 outrow = out->data[plane];
 
-for (i = 0; i < h; i++) {
+for (i = row_min; i < row_max; i++) {
 for (j = 0; j < w; j++)
 outrow[j] = tab[inrow[j]];
 inrow  += in_linesize;
@@ -476,9 +477,42 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 }
 }
 
-if (!direct)
+return 0;
+}
+
+static AVFrame *apply_lut(AVFilterLink *inlink, AVFrame *in) {
+AVFilterContext *ctx = inlink->dst;
+AVFilterLink *outlink = ctx->outputs[0];
+AVFrame *out;
+ThreadData td;
+
+if (av_frame_is_writable(in)) {
+out = in;
+} else {
+out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+if (!out) {
+av_frame_free(&in);
+return NULL;
+}
+av_frame_copy_props(out, in);
+}
+td.in  = in;
+td.out = out;
+td.link = inlink;
+ctx->internal->execute(ctx, lookup_slice, &td, NULL, FFMIN(outlink->h, 
ff_filter_get_nb_threa

Re: [FFmpeg-devel] [PATCH] Parallelize vf_lut

2019-02-27 Thread Michael Niedermayer
On Mon, Feb 25, 2019 at 03:25:30PM -0500, Britt Cyr wrote:
> ---
>  libavfilter/vf_lut.c | 106 ---
>  1 file changed, 70 insertions(+), 36 deletions(-)
> 
> diff --git a/libavfilter/vf_lut.c b/libavfilter/vf_lut.c
> index c815ddc194..14386938be 100644
> --- a/libavfilter/vf_lut.c
> +++ b/libavfilter/vf_lut.c
> @@ -72,6 +72,12 @@ typedef struct LutContext {
>  int negate_alpha; /* only used by negate */
>  } LutContext;
>  
> +typedef struct ThreadData {
> +  AVFrame *in;
> +  AVFrame *out;
> +  AVFilterLink *link;
> +} ThreadData;

indentation depth is inconsistent


[...]
> @@ -366,11 +359,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame 
> *in)
>  const int in_linesize  =  in->linesize[0] / 2;
>  const int out_linesize = out->linesize[0] / 2;
>  const int step = s->step;
> +const int row_min = jobnr / nb_jobs * h;
> +const int row_max = (jobnr + 1) / nb_jobs * h;
>  
>  inrow0  = (uint16_t*) in ->data[0];
>  outrow0 = (uint16_t*) out->data[0];
>  
> -for (i = 0; i < h; i ++) {
> +for (i = row_min; i < row_max; i ++) {
>  inrow  = inrow0;
>  outrow = outrow0;
>  for (j = 0; j < w; j++) {
> @@ -403,11 +398,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame 
> *in)
>  const int in_linesize  =  in->linesize[0];
>  const int out_linesize = out->linesize[0];
>  const int step = s->step;
> +const int row_min = jobnr / nb_jobs * h;
> +const int row_max = (jobnr + 1) / nb_jobs * h;
>  
>  inrow0  = in ->data[0];
>  outrow0 = out->data[0];
>  
> -for (i = 0; i < h; i ++) {
> +for (i = row_min; i < row_max; i ++) {
>  inrow  = inrow0;
>  outrow = outrow0;
>  for (j = 0; j < w; j++) {
> @@ -435,11 +432,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame 
> *in)
>  const uint16_t *tab = s->lut[plane];
>  const int in_linesize  =  in->linesize[plane] / 2;
>  const int out_linesize = out->linesize[plane] / 2;
> +const int row_min = jobnr / nb_jobs * h;
> +const int row_max = (jobnr + 1) / nb_jobs * h;
>  
>  inrow  = (uint16_t *)in ->data[plane];
>  outrow = (uint16_t *)out->data[plane];
>  
> -for (i = 0; i < h; i++) {
> +for (i = row_min; i < row_max; i++) {
>  for (j = 0; j < w; j++) {
>  #if HAVE_BIGENDIAN
>  outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
> @@ -463,11 +462,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame 
> *in)
>  const uint16_t *tab = s->lut[plane];
>  const int in_linesize  =  in->linesize[plane];
>  const int out_linesize = out->linesize[plane];
> +const int row_min = jobnr / nb_jobs * h;
> +const int row_max = (jobnr + 1) / nb_jobs * h;
>  
>  inrow  = in ->data[plane];
>  outrow = out->data[plane];
>  
> -for (i = 0; i < h; i++) {
> +for (i = row_min; i < row_max; i++) {
>  for (j = 0; j < w; j++)
>  outrow[j] = tab[inrow[j]];
>  inrow  += in_linesize;

unrelated to your patch, I just spotted this as it makes it obvious:
replicating this code 4 times is a bit ugly


> @@ -476,9 +477,42 @@ static int filter_frame(AVFilterLink *inlink, AVFrame 
> *in)
>  }
>  }
>  
> -if (!direct)
> +return 0;
> +}
> +
> +static AVFrame *apply_lut(AVFilterLink *inlink, AVFrame *in) {
> +AVFilterContext *ctx = inlink->dst;
> +AVFilterLink *outlink = ctx->outputs[0];
> +AVFrame *out;
> +ThreadData td;
> +
> +if (av_frame_is_writable(in)) {
> +out = in;
> +} else {
> +out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> +if (!out) {
> +av_frame_free(&in);
> +return NULL;
> +}
> +av_frame_copy_props(out, in);
> +}
> +td.in  = in;
> +td.out = out;
> +td.link = inlink;

> +ctx->internal->execute(ctx, lookup_slice, &td, NULL, FFMIN(outlink->h, 
> 1));

How many tasks does this run in parallel, and how much faster is it?

thanks

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

It's not that you shouldn't use gotos, but rather that you should write
readable code, and code with gotos often, but not always, is less readable


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] Parallelize vf_lut

2019-02-25 Thread Britt Cyr
---
 libavfilter/vf_lut.c | 106 ---
 1 file changed, 70 insertions(+), 36 deletions(-)

diff --git a/libavfilter/vf_lut.c b/libavfilter/vf_lut.c
index c815ddc194..14386938be 100644
--- a/libavfilter/vf_lut.c
+++ b/libavfilter/vf_lut.c
@@ -72,6 +72,12 @@ typedef struct LutContext {
 int negate_alpha; /* only used by negate */
 } LutContext;
 
+typedef struct ThreadData {
+  AVFrame *in;
+  AVFrame *out;
+  AVFilterLink *link;
+} ThreadData;
+
 #define Y 0
 #define U 1
 #define V 2
@@ -337,26 +343,13 @@ static int config_props(AVFilterLink *inlink)
 return 0;
 }
 
-static int filter_frame(AVFilterLink *inlink, AVFrame *in)
-{
-AVFilterContext *ctx = inlink->dst;
+static int lookup_slice(AVFilterContext *ctx, void *arg, int jobnr, int 
nb_jobs) {
 LutContext *s = ctx->priv;
-AVFilterLink *outlink = ctx->outputs[0];
-AVFrame *out;
-int i, j, plane, direct = 0;
-
-if (av_frame_is_writable(in)) {
-direct = 1;
-out = in;
-} else {
-out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
-if (!out) {
-av_frame_free(&in);
-return AVERROR(ENOMEM);
-}
-av_frame_copy_props(out, in);
-}
-
+int i, j, plane = 0;
+const ThreadData *td = arg;
+const AVFrame *in  = td->in;
+AVFrame *out = td->out;
+const AVFilterLink *inlink = td->link;
 if (s->is_rgb && s->is_16bit && !s->is_planar) {
 /* packed, 16-bit */
 uint16_t *inrow, *outrow, *inrow0, *outrow0;
@@ -366,11 +359,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 const int in_linesize  =  in->linesize[0] / 2;
 const int out_linesize = out->linesize[0] / 2;
 const int step = s->step;
+const int row_min = jobnr / nb_jobs * h;
+const int row_max = (jobnr + 1) / nb_jobs * h;
 
 inrow0  = (uint16_t*) in ->data[0];
 outrow0 = (uint16_t*) out->data[0];
 
-for (i = 0; i < h; i ++) {
+for (i = row_min; i < row_max; i ++) {
 inrow  = inrow0;
 outrow = outrow0;
 for (j = 0; j < w; j++) {
@@ -403,11 +398,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 const int in_linesize  =  in->linesize[0];
 const int out_linesize = out->linesize[0];
 const int step = s->step;
+const int row_min = jobnr / nb_jobs * h;
+const int row_max = (jobnr + 1) / nb_jobs * h;
 
 inrow0  = in ->data[0];
 outrow0 = out->data[0];
 
-for (i = 0; i < h; i ++) {
+for (i = row_min; i < row_max; i ++) {
 inrow  = inrow0;
 outrow = outrow0;
 for (j = 0; j < w; j++) {
@@ -435,11 +432,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 const uint16_t *tab = s->lut[plane];
 const int in_linesize  =  in->linesize[plane] / 2;
 const int out_linesize = out->linesize[plane] / 2;
+const int row_min = jobnr / nb_jobs * h;
+const int row_max = (jobnr + 1) / nb_jobs * h;
 
 inrow  = (uint16_t *)in ->data[plane];
 outrow = (uint16_t *)out->data[plane];
 
-for (i = 0; i < h; i++) {
+for (i = row_min; i < row_max; i++) {
 for (j = 0; j < w; j++) {
 #if HAVE_BIGENDIAN
 outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
@@ -463,11 +462,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 const uint16_t *tab = s->lut[plane];
 const int in_linesize  =  in->linesize[plane];
 const int out_linesize = out->linesize[plane];
+const int row_min = jobnr / nb_jobs * h;
+const int row_max = (jobnr + 1) / nb_jobs * h;
 
 inrow  = in ->data[plane];
 outrow = out->data[plane];
 
-for (i = 0; i < h; i++) {
+for (i = row_min; i < row_max; i++) {
 for (j = 0; j < w; j++)
 outrow[j] = tab[inrow[j]];
 inrow  += in_linesize;
@@ -476,9 +477,42 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 }
 }
 
-if (!direct)
+return 0;
+}
+
+static AVFrame *apply_lut(AVFilterLink *inlink, AVFrame *in) {
+AVFilterContext *ctx = inlink->dst;
+AVFilterLink *outlink = ctx->outputs[0];
+AVFrame *out;
+ThreadData td;
+
+if (av_frame_is_writable(in)) {
+out = in;
+} else {
+out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+if (!out) {
+av_frame_free(&in);
+return NULL;
+}
+av_frame_copy_props(out, in);
+}
+td.in  = in;
+td.out = out;
+td.link = inlink;
+ctx->internal->execute(ctx, lookup_slice, &td, NULL, FFMIN(outlink->h, 1));
+
+if (out != in)
 av_frame_free(&in);
 
+return out;
+}
+
+static int filter_frame(AVFilterLink *inlink, AVFrame *in