[FFmpeg-devel] [PATCH] Parallelize vf_lut
This will use ff_filter_get_nb_threads(ctx) threads which was 4x faster when I was testing on a 4K video --- libavfilter/vf_lut.c | 106 --- 1 file changed, 70 insertions(+), 36 deletions(-) diff --git a/libavfilter/vf_lut.c b/libavfilter/vf_lut.c index c815ddc194..9e5527e4a1 100644 --- a/libavfilter/vf_lut.c +++ b/libavfilter/vf_lut.c @@ -72,6 +72,12 @@ typedef struct LutContext { int negate_alpha; /* only used by negate */ } LutContext; +typedef struct ThreadData { +AVFrame *in; +AVFrame *out; +AVFilterLink *link; +} ThreadData; + #define Y 0 #define U 1 #define V 2 @@ -337,26 +343,13 @@ static int config_props(AVFilterLink *inlink) return 0; } -static int filter_frame(AVFilterLink *inlink, AVFrame *in) -{ -AVFilterContext *ctx = inlink->dst; +static int lookup_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { LutContext *s = ctx->priv; -AVFilterLink *outlink = ctx->outputs[0]; -AVFrame *out; -int i, j, plane, direct = 0; - -if (av_frame_is_writable(in)) { -direct = 1; -out = in; -} else { -out = ff_get_video_buffer(outlink, outlink->w, outlink->h); -if (!out) { -av_frame_free(&in); -return AVERROR(ENOMEM); -} -av_frame_copy_props(out, in); -} - +int i, j, plane = 0; +const ThreadData *td = arg; +const AVFrame *in = td->in; +AVFrame *out = td->out; +const AVFilterLink *inlink = td->link; if (s->is_rgb && s->is_16bit && !s->is_planar) { /* packed, 16-bit */ uint16_t *inrow, *outrow, *inrow0, *outrow0; @@ -366,11 +359,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) const int in_linesize = in->linesize[0] / 2; const int out_linesize = out->linesize[0] / 2; const int step = s->step; +const int row_min = jobnr / nb_jobs * h; +const int row_max = (jobnr + 1) / nb_jobs * h; inrow0 = (uint16_t*) in ->data[0]; outrow0 = (uint16_t*) out->data[0]; -for (i = 0; i < h; i ++) { +for (i = row_min; i < row_max; i ++) { inrow = inrow0; outrow = outrow0; for (j = 0; j < w; j++) { @@ -403,11 +398,13 @@ static int filter_frame(AVFilterLink *inlink, 
AVFrame *in) const int in_linesize = in->linesize[0]; const int out_linesize = out->linesize[0]; const int step = s->step; +const int row_min = jobnr / nb_jobs * h; +const int row_max = (jobnr + 1) / nb_jobs * h; inrow0 = in ->data[0]; outrow0 = out->data[0]; -for (i = 0; i < h; i ++) { +for (i = row_min; i < row_max; i ++) { inrow = inrow0; outrow = outrow0; for (j = 0; j < w; j++) { @@ -435,11 +432,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) const uint16_t *tab = s->lut[plane]; const int in_linesize = in->linesize[plane] / 2; const int out_linesize = out->linesize[plane] / 2; +const int row_min = jobnr / nb_jobs * h; +const int row_max = (jobnr + 1) / nb_jobs * h; inrow = (uint16_t *)in ->data[plane]; outrow = (uint16_t *)out->data[plane]; -for (i = 0; i < h; i++) { +for (i = row_min; i < row_max; i++) { for (j = 0; j < w; j++) { #if HAVE_BIGENDIAN outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]); @@ -463,11 +462,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) const uint16_t *tab = s->lut[plane]; const int in_linesize = in->linesize[plane]; const int out_linesize = out->linesize[plane]; +const int row_min = jobnr / nb_jobs * h; +const int row_max = (jobnr + 1) / nb_jobs * h; inrow = in ->data[plane]; outrow = out->data[plane]; -for (i = 0; i < h; i++) { +for (i = row_min; i < row_max; i++) { for (j = 0; j < w; j++) outrow[j] = tab[inrow[j]]; inrow += in_linesize; @@ -476,9 +477,42 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) } } -if (!direct) +return 0; +} + +static AVFrame *apply_lut(AVFilterLink *inlink, AVFrame *in) { +AVFilterContext *ctx = inlink->dst; +AVFilterLink *outlink = ctx->outputs[0]; +AVFrame *out; +ThreadData td; + +if (av_frame_is_writable(in)) { +out = in; +} else { +out = ff_get_video_buffer(outlink, outlink->w, outlink->h); +if (!out) { +av_frame_free(&in); +return NULL; +} +av_frame_copy_props(out, in); +} +td.in = in; +td.out = out; +td.link = inlink; +ctx->internal->execute(ctx, 
lookup_slice, &td, NULL, FFMIN(outlink->h, ff_filter_get_nb_threa
Re: [FFmpeg-devel] [PATCH] Parallelize vf_lut
On Mon, Feb 25, 2019 at 03:25:30PM -0500, Britt Cyr wrote: > --- > libavfilter/vf_lut.c | 106 --- > 1 file changed, 70 insertions(+), 36 deletions(-) > > diff --git a/libavfilter/vf_lut.c b/libavfilter/vf_lut.c > index c815ddc194..14386938be 100644 > --- a/libavfilter/vf_lut.c > +++ b/libavfilter/vf_lut.c > @@ -72,6 +72,12 @@ typedef struct LutContext { > int negate_alpha; /* only used by negate */ > } LutContext; > > +typedef struct ThreadData { > + AVFrame *in; > + AVFrame *out; > + AVFilterLink *link; > +} ThreadData; indention depth is inconsistent [...] > @@ -366,11 +359,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame > *in) > const int in_linesize = in->linesize[0] / 2; > const int out_linesize = out->linesize[0] / 2; > const int step = s->step; > +const int row_min = jobnr / nb_jobs * h; > +const int row_max = (jobnr + 1) / nb_jobs * h; > > inrow0 = (uint16_t*) in ->data[0]; > outrow0 = (uint16_t*) out->data[0]; > > -for (i = 0; i < h; i ++) { > +for (i = row_min; i < row_max; i ++) { > inrow = inrow0; > outrow = outrow0; > for (j = 0; j < w; j++) { > @@ -403,11 -398,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame > *in) > const int in_linesize = in->linesize[0]; > const int out_linesize = out->linesize[0]; > const int step = s->step; > +const int row_min = jobnr / nb_jobs * h; > +const int row_max = (jobnr + 1) / nb_jobs * h; > > inrow0 = in ->data[0]; > outrow0 = out->data[0]; > > -for (i = 0; i < h; i ++) { > +for (i = row_min; i < row_max; i ++) { > inrow = inrow0; > outrow = outrow0; > for (j = 0; j < w; j++) { > @@ -435,11 -432,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame > *in) > const uint16_t *tab = s->lut[plane]; > const int in_linesize = in->linesize[plane] / 2; > const int out_linesize = out->linesize[plane] / 2; > +const int row_min = jobnr / nb_jobs * h; > +const int row_max = (jobnr + 1) / nb_jobs * h; > > inrow = (uint16_t *)in ->data[plane]; > outrow = (uint16_t *)out->data[plane]; > > -for (i = 0; i < 
h; i++) { > +for (i = row_min; i < row_max; i++) { > for (j = 0; j < w; j++) { > #if HAVE_BIGENDIAN > outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]); > @@ -463,11 +462,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame > *in) > const uint16_t *tab = s->lut[plane]; > const int in_linesize = in->linesize[plane]; > const int out_linesize = out->linesize[plane]; > +const int row_min = jobnr / nb_jobs * h; > +const int row_max = (jobnr + 1) / nb_jobs * h; > > inrow = in ->data[plane]; > outrow = out->data[plane]; > > -for (i = 0; i < h; i++) { > +for (i = row_min; i < row_max; i++) { > for (j = 0; j < w; j++) > outrow[j] = tab[inrow[j]]; > inrow += in_linesize; unrelated to your patch, I just spotted this as it makes it obvious replicating this code 4 times is a bit ugly > @@ -476,9 +477,42 @@ static int filter_frame(AVFilterLink *inlink, AVFrame > *in) > } > } > > -if (!direct) > +return 0; > +} > + > +static AVFrame *apply_lut(AVFilterLink *inlink, AVFrame *in) { > +AVFilterContext *ctx = inlink->dst; > +AVFilterLink *outlink = ctx->outputs[0]; > +AVFrame *out; > +ThreadData td; > + > +if (av_frame_is_writable(in)) { > +out = in; > +} else { > +out = ff_get_video_buffer(outlink, outlink->w, outlink->h); > +if (!out) { > +av_frame_free(&in); > +return NULL; > +} > +av_frame_copy_props(out, in); > +} > +td.in = in; > +td.out = out; > +td.link = inlink; > +ctx->internal->execute(ctx, lookup_slice, &td, NULL, FFMIN(outlink->h, > 1)); how many tasks does this run in parallel and how much faster is it ? thanks [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB Its not that you shouldnt use gotos but rather that you should write readable code and code with gotos often but not always is less readable signature.asc Description: PGP signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] Parallelize vf_lut
--- libavfilter/vf_lut.c | 106 --- 1 file changed, 70 insertions(+), 36 deletions(-) diff --git a/libavfilter/vf_lut.c b/libavfilter/vf_lut.c index c815ddc194..14386938be 100644 --- a/libavfilter/vf_lut.c +++ b/libavfilter/vf_lut.c @@ -72,6 +72,12 @@ typedef struct LutContext { int negate_alpha; /* only used by negate */ } LutContext; +typedef struct ThreadData { + AVFrame *in; + AVFrame *out; + AVFilterLink *link; +} ThreadData; + #define Y 0 #define U 1 #define V 2 @@ -337,26 +343,13 @@ static int config_props(AVFilterLink *inlink) return 0; } -static int filter_frame(AVFilterLink *inlink, AVFrame *in) -{ -AVFilterContext *ctx = inlink->dst; +static int lookup_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { LutContext *s = ctx->priv; -AVFilterLink *outlink = ctx->outputs[0]; -AVFrame *out; -int i, j, plane, direct = 0; - -if (av_frame_is_writable(in)) { -direct = 1; -out = in; -} else { -out = ff_get_video_buffer(outlink, outlink->w, outlink->h); -if (!out) { -av_frame_free(&in); -return AVERROR(ENOMEM); -} -av_frame_copy_props(out, in); -} - +int i, j, plane = 0; +const ThreadData *td = arg; +const AVFrame *in = td->in; +AVFrame *out = td->out; +const AVFilterLink *inlink = td->link; if (s->is_rgb && s->is_16bit && !s->is_planar) { /* packed, 16-bit */ uint16_t *inrow, *outrow, *inrow0, *outrow0; @@ -366,11 +359,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) const int in_linesize = in->linesize[0] / 2; const int out_linesize = out->linesize[0] / 2; const int step = s->step; +const int row_min = jobnr / nb_jobs * h; +const int row_max = (jobnr + 1) / nb_jobs * h; inrow0 = (uint16_t*) in ->data[0]; outrow0 = (uint16_t*) out->data[0]; -for (i = 0; i < h; i ++) { +for (i = row_min; i < row_max; i ++) { inrow = inrow0; outrow = outrow0; for (j = 0; j < w; j++) { @@ -403,11 +398,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) const int in_linesize = in->linesize[0]; const int out_linesize = out->linesize[0]; const 
int step = s->step; +const int row_min = jobnr / nb_jobs * h; +const int row_max = (jobnr + 1) / nb_jobs * h; inrow0 = in ->data[0]; outrow0 = out->data[0]; -for (i = 0; i < h; i ++) { +for (i = row_min; i < row_max; i ++) { inrow = inrow0; outrow = outrow0; for (j = 0; j < w; j++) { @@ -435,11 +432,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) const uint16_t *tab = s->lut[plane]; const int in_linesize = in->linesize[plane] / 2; const int out_linesize = out->linesize[plane] / 2; +const int row_min = jobnr / nb_jobs * h; +const int row_max = (jobnr + 1) / nb_jobs * h; inrow = (uint16_t *)in ->data[plane]; outrow = (uint16_t *)out->data[plane]; -for (i = 0; i < h; i++) { +for (i = row_min; i < row_max; i++) { for (j = 0; j < w; j++) { #if HAVE_BIGENDIAN outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]); @@ -463,11 +462,13 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) const uint16_t *tab = s->lut[plane]; const int in_linesize = in->linesize[plane]; const int out_linesize = out->linesize[plane]; +const int row_min = jobnr / nb_jobs * h; +const int row_max = (jobnr + 1) / nb_jobs * h; inrow = in ->data[plane]; outrow = out->data[plane]; -for (i = 0; i < h; i++) { +for (i = row_min; i < row_max; i++) { for (j = 0; j < w; j++) outrow[j] = tab[inrow[j]]; inrow += in_linesize; @@ -476,9 +477,42 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) } } -if (!direct) +return 0; +} + +static AVFrame *apply_lut(AVFilterLink *inlink, AVFrame *in) { +AVFilterContext *ctx = inlink->dst; +AVFilterLink *outlink = ctx->outputs[0]; +AVFrame *out; +ThreadData td; + +if (av_frame_is_writable(in)) { +out = in; +} else { +out = ff_get_video_buffer(outlink, outlink->w, outlink->h); +if (!out) { +av_frame_free(&in); +return NULL; +} +av_frame_copy_props(out, in); +} +td.in = in; +td.out = out; +td.link = inlink; +ctx->internal->execute(ctx, lookup_slice, &td, NULL, FFMIN(outlink->h, 1)); + +if (out != in) av_frame_free(&in); +return out; +} + 
+static int filter_frame(AVFilterLink *inlink, AVFrame *in