Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.

2019-02-13 Thread Wang, Shaofei
> >> sizeof(AVFrame) is not part of the ABI.  You need to allocate it
> >> somewhere.
> >>
> > Could you please explain more?
> 
> See the documentation for AVFrame in libavutil/frame.h. Use av_frame_alloc().
> 
> Carl Eugen

Thanks Carl


Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.

2019-02-13 Thread Carl Eugen Hoyos
2019-02-13 8:52 GMT+01:00, Wang, Shaofei :

>> > +AVFrame input_frm;
>>
>> sizeof(AVFrame) is not part of the ABI.  You need to allocate it
>> somewhere.
>>
> Could you please explain more?

See the documentation for AVFrame in libavutil/frame.h
Use av_frame_alloc()
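
I.e. instead of embedding the frame by value, something roughly like this
(untested sketch):

    AVFrame *input_frm = av_frame_alloc();
    if (!input_frm)
        return AVERROR(ENOMEM);

    /* ... feed input_frm to the filter graph ... */

    av_frame_free(&input_frm); /* frees the frame and sets the pointer to NULL */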

Carl Eugen


Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.

2019-02-12 Thread Wang, Shaofei
> -Original Message-
> From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On Behalf Of
> Mark Thompson
> Sent: Tuesday, February 12, 2019 8:18 AM
That should be the UTC time at which I received the email.

> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1
> decode + N filter graphs and adaptive bitrate.
> 
> On 11/02/2019 22:41, Shaofei Wang wrote:
And the time above, at which I sent the previous email, is also a correct UTC time.
 
> Please avoid sending messages from the future - the list received this about
> thirteen hours before its supposed send time (received "Mon, 11 Feb 2019
> 11:42:09 +0200", sent "Mon, 11 Feb 2019 17:41:04 -0500").

> Probably the sending machine or some intermediate has an incorrect time or
> time zone.
That may be the reason.

> Some numbers for more use-cases and platforms (with different architectures
> and core counts) would be a good idea if you intend to enable this by default.
Agreed, it would be better to have data from more platforms.
The previous version actually provided an option for the user to choose the
"faster" path; this patch simplifies things to a single code path instead.

> Presumably it's a bit slower on less powerful machines with fewer cores when
> it makes many threads, but by how much?  Is that acceptable?
Do you mean resource-limited machines, where we should disable HAVE_THREADS?

> > diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c index
> > 544f1a1..67b1a2a 100644
> > --- a/fftools/ffmpeg.c
> > +++ b/fftools/ffmpeg.c
> > @@ -1419,13 +1419,18 @@ static void
> finish_output_stream(OutputStream *ost)
> >   *
> >   * @return  0 for success, <0 for severe errors
> >   */
> > -static int reap_filters(int flush)
> > +static int reap_filters(int flush, InputFilter * ifilter)
> >  {
> >  AVFrame *filtered_frame = NULL;
> >  int i;
> >
> > -/* Reap all buffers present in the buffer sinks */
> > +/* Reap all buffers present in the buffer sinks or just reap specified
> > + * input filter buffer */
> >  for (i = 0; i < nb_output_streams; i++) {
> > +if (ifilter) {
> > +if (ifilter != output_streams[i]->filter->graph->inputs[0])
> > +continue;
> > +}
> 
> No mixed declarations and code.
OK. 
> >  OutputStream *ost = output_streams[i];
> >  OutputFile*of = output_files[ost->file_index];
> >  AVFilterContext *filter;
> 
> How carefully has this been audited to make sure that there are no data races?
> The calls to init_output_stream() and do_video_out() both do /a lot/, and in
> particular they interact with the InputStream which might be shared with
> other threads (and indeed is in all your examples above).
Based on the multithreaded code path, there is no duplicated path calling
init_output_stream() and do_video_out(), since no output stream is shared by
multiple filter graphs. But this concern should be highlighted; I will
investigate the code further.
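
One lightweight aid for that audit (purely illustrative, not part of the patch)
would be to tag shared structures with the thread that currently owns them and
assert on access, e.g.:

    #include <assert.h>
    #include <pthread.h>

    typedef struct ThreadOwner {
        pthread_t owner;
        int       set;
    } ThreadOwner;

    /* call when a thread takes over a structure (e.g. an InputStream) */
    static void owner_claim(ThreadOwner *o)
    {
        o->owner = pthread_self();
        o->set   = 1;
    }

    /* call at every access point; fires when another thread sneaks in */
    static void owner_check(const ThreadOwner *o)
    {
        assert(o->set && pthread_equal(o->owner, pthread_self()));
    }

Running the relevant fate tests with -fsanitize=thread should also help
surface any remaining races.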

> > @@ -2179,7 +2184,8 @@ static int ifilter_send_frame(InputFilter *ifilter,
> AVFrame *frame)
> >  }
> >  }
> >
> > -ret = reap_filters(1);
> > +ret = HAVE_THREADS ? reap_filters(1, ifilter) :
> > + reap_filters(1, NULL);
> > +
> >  if (ret < 0 && ret != AVERROR_EOF) {
> >  av_log(NULL, AV_LOG_ERROR, "Error while filtering: %s\n",
> av_err2str(ret));
> >  return ret;
> > @@ -2208,6 +2214,14 @@ static int ifilter_send_eof(InputFilter
> > *ifilter, int64_t pts)
> >
> >  ifilter->eof = 1;
> >
> > +#if HAVE_THREADS
> > +ifilter->waited_frm = NULL;
> > +pthread_mutex_lock(&ifilter->process_mutex);
> > +ifilter->t_end = 1;
> > +pthread_cond_signal(&ifilter->process_cond);
> > +pthread_mutex_unlock(&ifilter->process_mutex);
> > +pthread_join(ifilter->f_thread, NULL);
> > +#endif
> >  if (ifilter->filter) {
> >  ret = av_buffersrc_close(ifilter->filter, pts,
> AV_BUFFERSRC_FLAG_PUSH);
> >  if (ret < 0)
> > @@ -2252,12 +2266,95 @@ static int decode(AVCodecContext *avctx,
> AVFrame *frame, int *got_frame, AVPacke
> >  return 0;
> >  }
> >
> > +#if HAVE_THREADS
> > +static void *filter_pipeline(void *arg)
> > +{
> > +InputFilter *fl = arg;
> > +AVFrame *frm;
> > +int ret;
> > +while(1) {
> > +pthread_mutex_lock(&fl->process_mutex);
> > +while (fl->waited_frm == NULL && !fl->t_end)
> > +pthread_cond_wait(&fl->process_cond, &fl->process_mutex);
> > +pthread_mutex_unlock(&fl->process_mutex);
> > +
> > +if (fl->t_end) break;
> > +
> > +frm = fl->waited_frm;
> > +ret = ifilter_send_frame(fl, frm);
> > +if (ret < 0) {
> > +av_log(NULL, AV_LOG_ERROR,
> > +   "Failed to inject frame into filter network: %s\n",
> av_err2str(ret));
> > +} else {
> > +ret = reap_filters(0, fl);
> > +}
> > +fl->t_error = ret;
> > +
> > +pthread_mutex_lock(&fl->finish_mutex);
> > +fl->waited_frm = NULL;
> > +

Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.

2019-02-11 Thread Mark Thompson
On 11/02/2019 22:41, Shaofei Wang wrote:

Please avoid sending messages from the future - the list received this about 
thirteen hours before its supposed send time (received "Mon, 11 Feb 2019 
11:42:09 +0200", sent "Mon, 11 Feb 2019 17:41:04 -0500").

Probably the sending machine or some intermediate has an incorrect time or time 
zone.

> It enables multiple filter graph concurrency, which brings roughly a 4%~20%
> improvement in some 1:N scenarios with CPU or GPU acceleration.
> 
> Below are some test cases and comparison as reference.
> (Hardware platform: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz)
> (Software: Intel iHD driver - 16.9.00100, CentOS 7)
> 
> For 1:N transcode by GPU acceleration with vaapi:
> ./ffmpeg -vaapi_device /dev/dri/renderD128 -hwaccel vaapi \
> -hwaccel_output_format vaapi \
> -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> -vf "scale_vaapi=1280:720" -c:v h264_vaapi -f null /dev/null \
> -vf "scale_vaapi=720:480" -c:v h264_vaapi -f null /dev/null
> 
> test results:
>              2 encoders   5 encoders   10 encoders
> Improved     6.1%         6.9%         5.5%
> 
> For 1:N transcode by GPU acceleration with QSV:
> ./ffmpeg -hwaccel qsv -c:v h264_qsv \
> -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> -vf "scale_qsv=1280:720:format=nv12" -c:v h264_qsv -f null /dev/null \
> -vf "scale_qsv=720:480:format=nv12" -c:v h264_qsv -f null /dev/null
> 
> test results:
>              2 encoders   5 encoders   10 encoders
> Improved     6%           4%           15%
> 
> For Intel GPU acceleration case, 1 decode to N scaling, by QSV:
> ./ffmpeg -hwaccel qsv -c:v h264_qsv \
> -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> -vf "scale_qsv=1280:720:format=nv12,hwdownload" -pix_fmt nv12 -f null 
> /dev/null \
> -vf "scale_qsv=720:480:format=nv12,hwdownload" -pix_fmt nv12 -f null 
> /dev/null
> 
> test results:
>              2 scale   5 scale   10 scale
> Improved     12%       21%       21%
> 
> For CPU only 1 decode to N scaling:
> ./ffmpeg -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> -vf "scale=1280:720" -pix_fmt nv12 -f null /dev/null \
> -vf "scale=720:480" -pix_fmt nv12 -f null /dev/null
> 
> test results:
>              2 scale   5 scale   10 scale
> Improved     25%       107%      148%
> 

Some numbers for more use-cases and platforms (with different architectures and 
core counts) would be a good idea if you intend to enable this by default.

Presumably it's a bit slower on less powerful machines with fewer cores when it 
makes many threads, but by how much?  Is that acceptable?

> Signed-off-by: Wang, Shaofei 
> Reviewed-by: Zhao, Jun 
> ---
>  fftools/ffmpeg.c| 112 
> +---
>  fftools/ffmpeg.h|  14 ++
>  fftools/ffmpeg_filter.c |   4 ++
>  3 files changed, 124 insertions(+), 6 deletions(-)
> 
> diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
> index 544f1a1..67b1a2a 100644
> --- a/fftools/ffmpeg.c
> +++ b/fftools/ffmpeg.c
> @@ -1419,13 +1419,18 @@ static void finish_output_stream(OutputStream *ost)
>   *
>   * @return  0 for success, <0 for severe errors
>   */
> -static int reap_filters(int flush)
> +static int reap_filters(int flush, InputFilter * ifilter)
>  {
>  AVFrame *filtered_frame = NULL;
>  int i;
>  
> -/* Reap all buffers present in the buffer sinks */
> +/* Reap all buffers present in the buffer sinks or just reap specified
> + * input filter buffer */
>  for (i = 0; i < nb_output_streams; i++) {
> +if (ifilter) {
> +if (ifilter != output_streams[i]->filter->graph->inputs[0])
> +continue;
> +}

No mixed declarations and code.
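
I.e. keep the block's declarations first and do the per-ifilter skip after
them. A hypothetical reordering of the hunk above, just to illustrate the
style rule (not the exact final code):

    for (i = 0; i < nb_output_streams; i++) {
        OutputStream *ost = output_streams[i];
        OutputFile    *of = output_files[ost->file_index];
        AVFilterContext *filter;

        /* reap only the graph fed by the given input filter, if one was set */
        if (ifilter && ifilter != ost->filter->graph->inputs[0])
            continue;

        /* ... rest of the loop body unchanged ... */
    }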

>  OutputStream *ost = output_streams[i];
>  OutputFile*of = output_files[ost->file_index];
>  AVFilterContext *filter;

How carefully has this been audited to make sure that there are no data races?  
The calls to init_output_stream() and do_video_out() both do /a lot/, and in 
particular they interact with the InputStream which might be shared with other 
threads (and indeed is in all your examples above).

> @@ -2179,7 +2184,8 @@ static int ifilter_send_frame(InputFilter *ifilter, 
> AVFrame *frame)
>  }
>  }
>  
> -ret = reap_filters(1);
> +ret = HAVE_THREADS ? reap_filters(1, ifilter) : reap_filters(1, 
> NULL);
> +
>  if (ret < 0 && ret != AVERROR_EOF) {
>  av_log(NULL, AV_LOG_ERROR, "Error while filtering: %s\n", 
> av_err2str(ret));
>  return ret;
> @@ -2208,6 +2214,14 @@ static int ifilter_send_eof(InputFilter *ifilter, 
> int64_t pts)
>  
>  ifilter->eof = 1;
>  
> +#if HAVE_THREADS
> +ifilter->waited_frm = NULL;
> +pthread_mutex_lock(&ifilter->process_mutex);
> +ifilter->t_end = 1;
> +pthread_cond_signal(&ifilter->process_cond);
> +pthread_mutex_unlock(&ifilter->process_mutex);
> +pthread_join(ifilter->f_thread, NULL);
> +#endif
>  

Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.

2019-02-11 Thread Michael Niedermayer
On Mon, Feb 11, 2019 at 05:41:04PM -0500, Shaofei Wang wrote:
> It enables multiple filter graph concurrency, which brings roughly a 4%~20%
> improvement in some 1:N scenarios with CPU or GPU acceleration.
> 
> Below are some test cases and comparison as reference.
> (Hardware platform: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz)
> (Software: Intel iHD driver - 16.9.00100, CentOS 7)
> 
> For 1:N transcode by GPU acceleration with vaapi:
> ./ffmpeg -vaapi_device /dev/dri/renderD128 -hwaccel vaapi \
> -hwaccel_output_format vaapi \
> -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> -vf "scale_vaapi=1280:720" -c:v h264_vaapi -f null /dev/null \
> -vf "scale_vaapi=720:480" -c:v h264_vaapi -f null /dev/null
> 
> test results:
>              2 encoders   5 encoders   10 encoders
> Improved     6.1%         6.9%         5.5%
> 
> For 1:N transcode by GPU acceleration with QSV:
> ./ffmpeg -hwaccel qsv -c:v h264_qsv \
> -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> -vf "scale_qsv=1280:720:format=nv12" -c:v h264_qsv -f null /dev/null \
> -vf "scale_qsv=720:480:format=nv12" -c:v h264_qsv -f null /dev/null
> 
> test results:
>              2 encoders   5 encoders   10 encoders
> Improved     6%           4%           15%
> 
> For Intel GPU acceleration case, 1 decode to N scaling, by QSV:
> ./ffmpeg -hwaccel qsv -c:v h264_qsv \
> -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> -vf "scale_qsv=1280:720:format=nv12,hwdownload" -pix_fmt nv12 -f null 
> /dev/null \
> -vf "scale_qsv=720:480:format=nv12,hwdownload" -pix_fmt nv12 -f null 
> /dev/null
> 
> test results:
>              2 scale   5 scale   10 scale
> Improved     12%       21%       21%
> 
> For CPU only 1 decode to N scaling:
> ./ffmpeg -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
> -vf "scale=1280:720" -pix_fmt nv12 -f null /dev/null \
> -vf "scale=720:480" -pix_fmt nv12 -f null /dev/null
> 
> test results:
>              2 scale   5 scale   10 scale
> Improved     25%       107%      148%
> 
> Signed-off-by: Wang, Shaofei 
> Reviewed-by: Zhao, Jun 
> ---
>  fftools/ffmpeg.c| 112 
> +---
>  fftools/ffmpeg.h|  14 ++
>  fftools/ffmpeg_filter.c |   4 ++
>  3 files changed, 124 insertions(+), 6 deletions(-)

breaks fate
make: *** [fate-lavf-mxf_d10] Error 1
make: *** [fate-filter-tremolo] Error 1
make: *** [fate-filter-chorus] Error 1
make: *** [tests/data/hls-list-append.m3u8] Error 1
make: *** [fate-filter-atrim-mixed] Error 1
make: *** [fate-filter-atrim-time] Error 1
make: *** [tests/data/live_last_endlist.m3u8] Error 1
make: *** [fate-filter-volume] Error 1
make: *** [fate-filter-join] Error 1
make: *** [fate-lavf-mxf] Error 1
make: *** [fate-swr-resample-s16p-44100-8000] Error 1
make: *** [fate-swr-resample-s16p-44100-2626] Error 1
...

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

"Nothing to hide" only works if the folks in power share the values of
you and everyone you know entirely and always will -- Tom Scott





Re: [FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.

2019-02-11 Thread Wang, Shaofei
Cleaned up the code and removed the "-abr_pipeline" option; the performance-improved
code path is now used by default, but only when HAVE_THREADS is enabled.


[FFmpeg-devel] [PATCH v4] Improved the performance of 1 decode + N filter graphs and adaptive bitrate.

2019-02-11 Thread Shaofei Wang
It enables multiple filter graph concurrency, which brings roughly a 4%~20%
improvement in some 1:N scenarios with CPU or GPU acceleration.
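
The per-InputFilter hand-off it builds on can be sketched in isolation
roughly like this (simplified illustration, not the actual patch code):

    #include <pthread.h>

    /* one slot per input filter: the decode thread publishes a frame, the
     * filter worker consumes it, so each graph runs on its own thread;
     * initialise with pthread_mutex_init()/pthread_cond_init() before use */
    typedef struct FrameSlot {
        pthread_mutex_t lock;
        pthread_cond_t  filled;  /* signalled when a frame is published */
        pthread_cond_t  drained; /* signalled when the worker is done   */
        void           *frame;   /* stands in for AVFrame* here         */
        int             eof;
    } FrameSlot;

    /* producer side: called once per decoded frame */
    static void slot_publish(FrameSlot *s, void *frame)
    {
        pthread_mutex_lock(&s->lock);
        s->frame = frame;
        pthread_cond_signal(&s->filled);
        while (s->frame)               /* wait until the worker took it */
            pthread_cond_wait(&s->drained, &s->lock);
        pthread_mutex_unlock(&s->lock);
    }

    /* producer side: called at EOF, mirrors ifilter_send_eof() */
    static void slot_close(FrameSlot *s)
    {
        pthread_mutex_lock(&s->lock);
        s->eof = 1;
        pthread_cond_signal(&s->filled);
        pthread_mutex_unlock(&s->lock);
    }

    /* body of the per-filter worker thread */
    static void *slot_worker(void *arg)
    {
        FrameSlot *s = arg;
        for (;;) {
            void *frame;
            pthread_mutex_lock(&s->lock);
            while (!s->frame && !s->eof)
                pthread_cond_wait(&s->filled, &s->lock);
            if (s->eof && !s->frame) {
                pthread_mutex_unlock(&s->lock);
                break;
            }
            frame = s->frame;
            pthread_mutex_unlock(&s->lock);

            /* ... send 'frame' into the filter graph and reap its sinks ... */
            (void)frame;

            pthread_mutex_lock(&s->lock);
            s->frame = NULL;
            pthread_cond_signal(&s->drained);
            pthread_mutex_unlock(&s->lock);
        }
        return NULL;
    }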

Below are some test cases and comparison as reference.
(Hardware platform: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz)
(Software: Intel iHD driver - 16.9.00100, CentOS 7)

For 1:N transcode by GPU acceleration with vaapi:
./ffmpeg -vaapi_device /dev/dri/renderD128 -hwaccel vaapi \
-hwaccel_output_format vaapi \
-i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
-vf "scale_vaapi=1280:720" -c:v h264_vaapi -f null /dev/null \
-vf "scale_vaapi=720:480" -c:v h264_vaapi -f null /dev/null

test results:
             2 encoders   5 encoders   10 encoders
Improved     6.1%         6.9%         5.5%

For 1:N transcode by GPU acceleration with QSV:
./ffmpeg -hwaccel qsv -c:v h264_qsv \
-i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
-vf "scale_qsv=1280:720:format=nv12" -c:v h264_qsv -f null /dev/null \
-vf "scale_qsv=720:480:format=nv12" -c:v h264_qsv -f null /dev/null

test results:
             2 encoders   5 encoders   10 encoders
Improved     6%           4%           15%

For Intel GPU acceleration case, 1 decode to N scaling, by QSV:
./ffmpeg -hwaccel qsv -c:v h264_qsv \
-i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
-vf "scale_qsv=1280:720:format=nv12,hwdownload" -pix_fmt nv12 -f null 
/dev/null \
-vf "scale_qsv=720:480:format=nv12,hwdownload" -pix_fmt nv12 -f null 
/dev/null

test results:
             2 scale   5 scale   10 scale
Improved     12%       21%       21%

For CPU only 1 decode to N scaling:
./ffmpeg -i ~/Videos/1920x1080p_30.00_x264_qp28.h264 \
-vf "scale=1280:720" -pix_fmt nv12 -f null /dev/null \
-vf "scale=720:480" -pix_fmt nv12 -f null /dev/null

test results:
             2 scale   5 scale   10 scale
Improved     25%       107%      148%

Signed-off-by: Wang, Shaofei 
Reviewed-by: Zhao, Jun 
---
 fftools/ffmpeg.c| 112 +---
 fftools/ffmpeg.h|  14 ++
 fftools/ffmpeg_filter.c |   4 ++
 3 files changed, 124 insertions(+), 6 deletions(-)

diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
index 544f1a1..67b1a2a 100644
--- a/fftools/ffmpeg.c
+++ b/fftools/ffmpeg.c
@@ -1419,13 +1419,18 @@ static void finish_output_stream(OutputStream *ost)
  *
  * @return  0 for success, <0 for severe errors
  */
-static int reap_filters(int flush)
+static int reap_filters(int flush, InputFilter * ifilter)
 {
 AVFrame *filtered_frame = NULL;
 int i;
 
-/* Reap all buffers present in the buffer sinks */
+/* Reap all buffers present in the buffer sinks or just reap specified
+ * input filter buffer */
 for (i = 0; i < nb_output_streams; i++) {
+if (ifilter) {
+if (ifilter != output_streams[i]->filter->graph->inputs[0])
+continue;
+}
 OutputStream *ost = output_streams[i];
 OutputFile*of = output_files[ost->file_index];
 AVFilterContext *filter;
@@ -2179,7 +2184,8 @@ static int ifilter_send_frame(InputFilter *ifilter, 
AVFrame *frame)
 }
 }
 
-ret = reap_filters(1);
+ret = HAVE_THREADS ? reap_filters(1, ifilter) : reap_filters(1, NULL);
+
 if (ret < 0 && ret != AVERROR_EOF) {
 av_log(NULL, AV_LOG_ERROR, "Error while filtering: %s\n", 
av_err2str(ret));
 return ret;
@@ -2208,6 +2214,14 @@ static int ifilter_send_eof(InputFilter *ifilter, 
int64_t pts)
 
 ifilter->eof = 1;
 
+#if HAVE_THREADS
+ifilter->waited_frm = NULL;
+pthread_mutex_lock(&ifilter->process_mutex);
+ifilter->t_end = 1;
+pthread_cond_signal(&ifilter->process_cond);
+pthread_mutex_unlock(&ifilter->process_mutex);
+pthread_join(ifilter->f_thread, NULL);
+#endif
 if (ifilter->filter) {
 ret = av_buffersrc_close(ifilter->filter, pts, AV_BUFFERSRC_FLAG_PUSH);
 if (ret < 0)
@@ -2252,12 +2266,95 @@ static int decode(AVCodecContext *avctx, AVFrame 
*frame, int *got_frame, AVPacke
 return 0;
 }
 
+#if HAVE_THREADS
+static void *filter_pipeline(void *arg)
+{
+InputFilter *fl = arg;
+AVFrame *frm;
+int ret;
+while(1) {
+pthread_mutex_lock(&fl->process_mutex);
+while (fl->waited_frm == NULL && !fl->t_end)
+pthread_cond_wait(&fl->process_cond, &fl->process_mutex);
+pthread_mutex_unlock(&fl->process_mutex);
+
+if (fl->t_end) break;
+
+frm = fl->waited_frm;
+ret = ifilter_send_frame(fl, frm);
+if (ret < 0) {
+av_log(NULL, AV_LOG_ERROR,
+   "Failed to inject frame into filter network: %s\n", 
av_err2str(ret));
+} else {
+ret = reap_filters(0, fl);
+}
+fl->t_error = ret;
+
+pthread_mutex_lock(&fl->finish_mutex);
+fl->waited_frm = NULL;
+pthread_cond_signal(&fl->finish_cond);
+pthread_mutex_unlock(&fl->finish_mutex);
+
+if (ret < 0)
+break;
+}
+