Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.
> >> sizeof(AVFrame) is not part of the ABI. You need to allocate it > >> somewhere. > >> > > Please tell more? > > See the documentation for AVFrame in libavutil/frame.h Use av_frame_alloc() > > Carl Eugen Thanks Carl ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.
2019-02-13 8:52 GMT+01:00, Wang, Shaofei : >> > +AVFrame input_frm; >> >> sizeof(AVFrame) is not part of the ABI. You need to allocate it >> somewhere. >> > Please tell more? See the documentation for AVFrame in libavutil/frame.h Use av_frame_alloc() Carl Eugen ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.
> -Original Message- > From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On Behalf Of > Mark Thompson > Sent: Tuesday, February 12, 2019 8:18 AM It should be the UTC time at which the email was received > To: ffmpeg-devel@ffmpeg.org > Subject: Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 > decode + N filter graphs and adaptive bitrate. > > On 11/02/2019 22:41, Shaofei Wang wrote: And the time above, at which I sent the previous email, is also a correct UTC time > Please avoid sending messages from the future - the list received this about > thirteen hours before its supposed send time (received "Mon, 11 Feb 2019 > 11:42:09 +0200", sent "Mon, 11 Feb 2019 17:41:04 -0500"). > Probably the sending machine or some intermediate has an incorrect time or > time zone. It may be the reason. > Some numbers for more use-cases and platforms (with different architectures > and core counts) would be a good idea if you intend to enable this by default. It would be better to have data from more platforms. Actually, the previous version provided an option for the user to choose a "faster" path. In this patch the code path is simplified. > Presumably it's a bit slower on less powerful machines with fewer cores when > it makes many threads, but by how much? Is that acceptable? Is it on resource-limited machines that we should disable HAVE_THREADS? 
> > diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c index > > 544f1a1..67b1a2a 100644 > > --- a/fftools/ffmpeg.c > > +++ b/fftools/ffmpeg.c > > @@ -1419,13 +1419,18 @@ static void > finish_output_stream(OutputStream *ost) > > * > > * @return 0 for success, <0 for severe errors > > */ > > -static int reap_filters(int flush) > > +static int reap_filters(int flush, InputFilter * ifilter) > > { > > AVFrame *filtered_frame = NULL; > > int i; > > > > -/* Reap all buffers present in the buffer sinks */ > > +/* Reap all buffers present in the buffer sinks or just reap specified > > + * input filter buffer */ > > for (i = 0; i < nb_output_streams; i++) { > > +if (ifilter) { > > +if (ifilter != output_streams[i]->filter->graph->inputs[0]) > > +continue; > > +} > > No mixed declarations and code. OK. > > OutputStream *ost = output_streams[i]; > > OutputFile*of = output_files[ost->file_index]; > > AVFilterContext *filter; > > How carefully has this been audited to make sure that there are no data races? > The calls to init_output_stream() and do_video_out() both do /a lot/, and in > particular they interact with the InputStream which might be shared with > other threads (and indeed is in all your examples above). Base on the code path of multithread, it won't have duplicated path to call init_output_stream() and do_video_out(), since there's no output stream share multiple filter graphs. And this concern should be hightlight, will investigate more in the code. > > @@ -2179,7 +2184,8 @@ static int ifilter_send_frame(InputFilter *ifilter, > AVFrame *frame) > > } > > } > > > > -ret = reap_filters(1); > > +ret = HAVE_THREADS ? 
reap_filters(1, ifilter) : > > + reap_filters(1, NULL); > > + > > if (ret < 0 && ret != AVERROR_EOF) { > > av_log(NULL, AV_LOG_ERROR, "Error while filtering: %s\n", > av_err2str(ret)); > > return ret; > > @@ -2208,6 +2214,14 @@ static int ifilter_send_eof(InputFilter > > *ifilter, int64_t pts) > > > > ifilter->eof = 1; > > > > +#if HAVE_THREADS > > +ifilter->waited_frm = NULL; > > +pthread_mutex_lock(>process_mutex); > > +ifilter->t_end = 1; > > +pthread_cond_signal(>process_cond); > > +pthread_mutex_unlock(>process_mutex); > > +pthread_join(ifilter->f_thread, NULL); #endif > > if (ifilter->filter) { > > ret = av_buffersrc_close(ifilter->filter, pts, > AV_BUFFERSRC_FLAG_PUSH); > > if (ret < 0) > > @@ -2252,12 +2266,95 @@ static int decode(AVCodecContext *avctx, > AVFrame *frame, int *got_frame, AVPacke > > return 0; > > } > > > > +#if HAVE_THREADS > > +static void *filter_pipeline(void *arg) { > > +InputFilter *fl = arg; > > +AVFrame *frm; > > +int ret; > > +while(1) { > > +pthread_mutex_lock(>process_mutex); > > +while (fl->waited_frm == NULL && !fl->t_end) > > +pthread_cond_wait(>process_cond, >process_mutex); > > +pthread_mutex_unlock(>process_mutex); > > + > > +if (fl->t_end) break; > > + > > +frm = fl->waited_frm; > > +ret = ifilter_send_frame(fl, frm); > > +if (ret < 0) { > > +av_log(NULL, AV_LOG_ERROR, > > + "Failed to inject frame into filter network: %s\n", > av_err2str(ret)); > > +} else { > > +ret = reap_filters(0, fl); > > +} > > +fl->t_error = ret; > > + > > +pthread_mutex_lock(>finish_mutex); > > +fl->waited_frm = NULL; > > +
Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.
On 11/02/2019 22:41, Shaofei Wang wrote: Please avoid sending messages from the future - the list received this about thirteen hours before its supposed send time (received "Mon, 11 Feb 2019 11:42:09 +0200", sent "Mon, 11 Feb 2019 17:41:04 -0500"). Probably the sending machine or some intermediate has an incorrect time or time zone. > It enabled multiple filter graph concurrency, which bring above about > 4%~20% improvement in some 1:N scenarios by CPU or GPU acceleration > > Below are some test cases and comparison as reference. > (Hardware platform: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz) > (Software: Intel iHD driver - 16.9.00100, CentOS 7) > > For 1:N transcode by GPU acceleration with vaapi: > ./ffmpeg -vaapi_device /dev/dri/renderD128 -hwaccel vaapi \ > -hwaccel_output_format vaapi \ > -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ > -vf "scale_vaapi=1280:720" -c:v h264_vaapi -f null /dev/null \ > -vf "scale_vaapi=720:480" -c:v h264_vaapi -f null /dev/null > > test results: > 2 encoders 5 encoders 10 encoders > Improved 6.1%6.9% 5.5% > > For 1:N transcode by GPU acceleration with QSV: > ./ffmpeg -hwaccel qsv -c:v h264_qsv \ > -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ > -vf "scale_qsv=1280:720:format=nv12" -c:v h264_qsv -f null /dev/null \ > -vf "scale_qsv=720:480:format=nv12" -c:v h264_qsv -f null /dev/null > > test results: > 2 encoders 5 encoders 10 encoders > Improved 6% 4% 15% > > For Intel GPU acceleration case, 1 decode to N scaling, by QSV: > ./ffmpeg -hwaccel qsv -c:v h264_qsv \ > -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ > -vf "scale_qsv=1280:720:format=nv12,hwdownload" -pix_fmt nv12 -f null > /dev/null \ > -vf "scale_qsv=720:480:format=nv12,hwdownload" -pix_fmt nv12 -f null > /dev/null > > test results: > 2 scale 5 scale 10 scale > Improved 12% 21%21% > > For CPU only 1 decode to N scaling: > ./ffmpeg -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ > -vf "scale=1280:720" -pix_fmt nv12 -f null /dev/null \ > -vf "scale=720:480" -pix_fmt nv12 -f 
null /dev/null > > test results: > 2 scale 5 scale 10 scale > Improved 25%107% 148% > Some numbers for more use-cases and platforms (with different architectures and core counts) would be a good idea if you intend to enable this by default. Presumably it's a bit slower on less powerful machines with fewer cores when it makes many threads, but by how much? Is that acceptable? > Signed-off-by: Wang, Shaofei > Reviewed-by: Zhao, Jun > --- > fftools/ffmpeg.c| 112 > +--- > fftools/ffmpeg.h| 14 ++ > fftools/ffmpeg_filter.c | 4 ++ > 3 files changed, 124 insertions(+), 6 deletions(-) > > diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c > index 544f1a1..67b1a2a 100644 > --- a/fftools/ffmpeg.c > +++ b/fftools/ffmpeg.c > @@ -1419,13 +1419,18 @@ static void finish_output_stream(OutputStream *ost) > * > * @return 0 for success, <0 for severe errors > */ > -static int reap_filters(int flush) > +static int reap_filters(int flush, InputFilter * ifilter) > { > AVFrame *filtered_frame = NULL; > int i; > > -/* Reap all buffers present in the buffer sinks */ > +/* Reap all buffers present in the buffer sinks or just reap specified > + * input filter buffer */ > for (i = 0; i < nb_output_streams; i++) { > +if (ifilter) { > +if (ifilter != output_streams[i]->filter->graph->inputs[0]) > +continue; > +} No mixed declarations and code. > OutputStream *ost = output_streams[i]; > OutputFile*of = output_files[ost->file_index]; > AVFilterContext *filter; How carefully has this been audited to make sure that there are no data races? The calls to init_output_stream() and do_video_out() both do /a lot/, and in particular they interact with the InputStream which might be shared with other threads (and indeed is in all your examples above). > @@ -2179,7 +2184,8 @@ static int ifilter_send_frame(InputFilter *ifilter, > AVFrame *frame) > } > } > > -ret = reap_filters(1); > +ret = HAVE_THREADS ? 
reap_filters(1, ifilter) : reap_filters(1, > NULL); > + > if (ret < 0 && ret != AVERROR_EOF) { > av_log(NULL, AV_LOG_ERROR, "Error while filtering: %s\n", > av_err2str(ret)); > return ret; > @@ -2208,6 +2214,14 @@ static int ifilter_send_eof(InputFilter *ifilter, > int64_t pts) > > ifilter->eof = 1; > > +#if HAVE_THREADS > +ifilter->waited_frm = NULL; > +pthread_mutex_lock(>process_mutex); > +ifilter->t_end = 1; > +pthread_cond_signal(>process_cond); > +pthread_mutex_unlock(>process_mutex); > +pthread_join(ifilter->f_thread, NULL); > +#endif >
Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.
On Mon, Feb 11, 2019 at 05:41:04PM -0500, Shaofei Wang wrote: > It enabled multiple filter graph concurrency, which bring above about > 4%~20% improvement in some 1:N scenarios by CPU or GPU acceleration > > Below are some test cases and comparison as reference. > (Hardware platform: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz) > (Software: Intel iHD driver - 16.9.00100, CentOS 7) > > For 1:N transcode by GPU acceleration with vaapi: > ./ffmpeg -vaapi_device /dev/dri/renderD128 -hwaccel vaapi \ > -hwaccel_output_format vaapi \ > -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ > -vf "scale_vaapi=1280:720" -c:v h264_vaapi -f null /dev/null \ > -vf "scale_vaapi=720:480" -c:v h264_vaapi -f null /dev/null > > test results: > 2 encoders 5 encoders 10 encoders > Improved 6.1%6.9% 5.5% > > For 1:N transcode by GPU acceleration with QSV: > ./ffmpeg -hwaccel qsv -c:v h264_qsv \ > -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ > -vf "scale_qsv=1280:720:format=nv12" -c:v h264_qsv -f null /dev/null \ > -vf "scale_qsv=720:480:format=nv12" -c:v h264_qsv -f null /dev/null > > test results: > 2 encoders 5 encoders 10 encoders > Improved 6% 4% 15% > > For Intel GPU acceleration case, 1 decode to N scaling, by QSV: > ./ffmpeg -hwaccel qsv -c:v h264_qsv \ > -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ > -vf "scale_qsv=1280:720:format=nv12,hwdownload" -pix_fmt nv12 -f null > /dev/null \ > -vf "scale_qsv=720:480:format=nv12,hwdownload" -pix_fmt nv12 -f null > /dev/null > > test results: > 2 scale 5 scale 10 scale > Improved 12% 21%21% > > For CPU only 1 decode to N scaling: > ./ffmpeg -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ > -vf "scale=1280:720" -pix_fmt nv12 -f null /dev/null \ > -vf "scale=720:480" -pix_fmt nv12 -f null /dev/null > > test results: > 2 scale 5 scale 10 scale > Improved 25%107% 148% > > Signed-off-by: Wang, Shaofei > Reviewed-by: Zhao, Jun > --- > fftools/ffmpeg.c| 112 > +--- > fftools/ffmpeg.h| 14 ++ > fftools/ffmpeg_filter.c | 4 ++ > 3 files changed, 124 
insertions(+), 6 deletions(-) breaks fate make: *** [fate-lavf-mxf_d10] Error 1 make: *** [fate-filter-tremolo] Error 1 make: *** [fate-filter-chorus] Error 1 make: *** [tests/data/hls-list-append.m3u8] Error 1 make: *** [fate-filter-atrim-mixed] Error 1 make: *** [fate-filter-atrim-time] Error 1 make: *** [tests/data/live_last_endlist.m3u8] Error 1 make: *** [fate-filter-volume] Error 1 make: *** [fate-filter-join] Error 1 make: *** [fate-lavf-mxf] Error 1 make: *** [fate-swr-resample-s16p-44100-8000] Error 1 make: *** [fate-swr-resample-s16p-44100-2626] Error 1 ... [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB "Nothing to hide" only works if the folks in power share the values of you and everyone you know entirely and always will -- Tom Scott signature.asc Description: PGP signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.
Cleaned up the code and removed the "-abr_pipeline" option; the performance-improved code path is now used by default, but only if HAVE_THREADS is enabled. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.
It enabled multiple filter graph concurrency, which bring above about 4%~20% improvement in some 1:N scenarios by CPU or GPU acceleration Below are some test cases and comparison as reference. (Hardware platform: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz) (Software: Intel iHD driver - 16.9.00100, CentOS 7) For 1:N transcode by GPU acceleration with vaapi: ./ffmpeg -vaapi_device /dev/dri/renderD128 -hwaccel vaapi \ -hwaccel_output_format vaapi \ -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ -vf "scale_vaapi=1280:720" -c:v h264_vaapi -f null /dev/null \ -vf "scale_vaapi=720:480" -c:v h264_vaapi -f null /dev/null test results: 2 encoders 5 encoders 10 encoders Improved 6.1%6.9% 5.5% For 1:N transcode by GPU acceleration with QSV: ./ffmpeg -hwaccel qsv -c:v h264_qsv \ -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ -vf "scale_qsv=1280:720:format=nv12" -c:v h264_qsv -f null /dev/null \ -vf "scale_qsv=720:480:format=nv12" -c:v h264_qsv -f null /dev/null test results: 2 encoders 5 encoders 10 encoders Improved 6% 4% 15% For Intel GPU acceleration case, 1 decode to N scaling, by QSV: ./ffmpeg -hwaccel qsv -c:v h264_qsv \ -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ -vf "scale_qsv=1280:720:format=nv12,hwdownload" -pix_fmt nv12 -f null /dev/null \ -vf "scale_qsv=720:480:format=nv12,hwdownload" -pix_fmt nv12 -f null /dev/null test results: 2 scale 5 scale 10 scale Improved 12% 21%21% For CPU only 1 decode to N scaling: ./ffmpeg -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \ -vf "scale=1280:720" -pix_fmt nv12 -f null /dev/null \ -vf "scale=720:480" -pix_fmt nv12 -f null /dev/null test results: 2 scale 5 scale 10 scale Improved 25%107% 148% Signed-off-by: Wang, Shaofei Reviewed-by: Zhao, Jun --- fftools/ffmpeg.c| 112 +--- fftools/ffmpeg.h| 14 ++ fftools/ffmpeg_filter.c | 4 ++ 3 files changed, 124 insertions(+), 6 deletions(-) diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c index 544f1a1..67b1a2a 100644 --- a/fftools/ffmpeg.c +++ b/fftools/ffmpeg.c @@ -1419,13 +1419,18 @@ static 
void finish_output_stream(OutputStream *ost) * * @return 0 for success, <0 for severe errors */ -static int reap_filters(int flush) +static int reap_filters(int flush, InputFilter * ifilter) { AVFrame *filtered_frame = NULL; int i; -/* Reap all buffers present in the buffer sinks */ +/* Reap all buffers present in the buffer sinks or just reap specified + * input filter buffer */ for (i = 0; i < nb_output_streams; i++) { +if (ifilter) { +if (ifilter != output_streams[i]->filter->graph->inputs[0]) +continue; +} OutputStream *ost = output_streams[i]; OutputFile*of = output_files[ost->file_index]; AVFilterContext *filter; @@ -2179,7 +2184,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) } } -ret = reap_filters(1); +ret = HAVE_THREADS ? reap_filters(1, ifilter) : reap_filters(1, NULL); + if (ret < 0 && ret != AVERROR_EOF) { av_log(NULL, AV_LOG_ERROR, "Error while filtering: %s\n", av_err2str(ret)); return ret; @@ -2208,6 +2214,14 @@ static int ifilter_send_eof(InputFilter *ifilter, int64_t pts) ifilter->eof = 1; +#if HAVE_THREADS +ifilter->waited_frm = NULL; +pthread_mutex_lock(>process_mutex); +ifilter->t_end = 1; +pthread_cond_signal(>process_cond); +pthread_mutex_unlock(>process_mutex); +pthread_join(ifilter->f_thread, NULL); +#endif if (ifilter->filter) { ret = av_buffersrc_close(ifilter->filter, pts, AV_BUFFERSRC_FLAG_PUSH); if (ret < 0) @@ -2252,12 +2266,95 @@ static int decode(AVCodecContext *avctx, AVFrame *frame, int *got_frame, AVPacke return 0; } +#if HAVE_THREADS +static void *filter_pipeline(void *arg) +{ +InputFilter *fl = arg; +AVFrame *frm; +int ret; +while(1) { +pthread_mutex_lock(>process_mutex); +while (fl->waited_frm == NULL && !fl->t_end) +pthread_cond_wait(>process_cond, >process_mutex); +pthread_mutex_unlock(>process_mutex); + +if (fl->t_end) break; + +frm = fl->waited_frm; +ret = ifilter_send_frame(fl, frm); +if (ret < 0) { +av_log(NULL, AV_LOG_ERROR, + "Failed to inject frame into filter network: %s\n", av_err2str(ret)); 
+} else { +ret = reap_filters(0, fl); +} +fl->t_error = ret; + +pthread_mutex_lock(&fl->finish_mutex); +fl->waited_frm = NULL; +pthread_cond_signal(&fl->finish_cond); +pthread_mutex_unlock(&fl->finish_mutex); + +if (ret < 0) +break; +} +