Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-22 Thread Limin Wang
On Sat, Sep 21, 2019 at 11:04:27PM +0200, Michael Niedermayer wrote:
> On Sat, Sep 21, 2019 at 07:45:58AM +0800, Limin Wang wrote:
> > On Fri, Sep 20, 2019 at 07:57:10PM +0200, Michael Niedermayer wrote:
> > > On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> > > > From: Limin Wang 
> > > > 
> > > > The multithread is avoid one core cpu is full with other filter like 
> > > > scale etc.
> > > > About the performance, the gain is very small, below is my testing for
> > > > performance.
> > > > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 
> > > > frame
> > > > only.
> > > > 
> > > > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo 
> > > > -frames 10
> > > > ~/Movies/1.v210
> > > > 
> > > > master:
> > > > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> > > > -benchmark
> > > > -f null -
> > > > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > > > speed=1.69x
> > > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > > > muxing
> > > > overhead: unknown
> > > > bench: utime=10.082s stime=13.784s rtime=23.889s
> > > > bench: maxrss=147836928kB
> > > > 
> > > > patch applied:
> > > > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 
> > > > 100 -i
> > > > ~/Movies/1.v210 -benchmark -f null -
> > > > 
> > > > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > > > speed=2.22x
> > > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > > > muxing
> > > > overhead: unknown
> > > > bench: utime=11.407s stime=17.258s rtime=18.279s
> > > > bench: maxrss=442884096kB
> > > > 
> > > > Signed-off-by: Limin Wang 
> > > > ---
> > > >  libavcodec/v210dec.c | 135 
> > > > +--
> > > >  libavcodec/v210dec.h |   1 +
> > > >  2 files changed, 88 insertions(+), 48 deletions(-)
> > > > 
> > > > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > > > index 6ce18aa..2cdb99e 100644
> > > > --- a/libavcodec/v210dec.c
> > > > +++ b/libavcodec/v210dec.c
> > > > @@ -28,6 +28,7 @@
> > > >  #include "libavutil/internal.h"
> > > >  #include "libavutil/mem.h"
> > > >  #include "libavutil/intreadwrite.h"
> > > > +#include "thread.h"
> > > >  
> > > >  #define READ_PIXELS(a, b, c) \
> > > >  do { \
> > > > @@ -37,6 +38,13 @@
> > > >  *c++ = (val >> 20) & 0x3FF;  \
> > > >  } while (0)
> > > >  
> > > > +#define MAX_SLICES 32
> > > > +typedef struct ThreadData {
> > > > +AVFrame *frame;
> > > > +uint8_t *buf;
> > > > +int stride;
> > > > +} ThreadData;
> > > > +
> > > >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, 
> > > > uint16_t *u, uint16_t *v, int width)
> > > >  {
> > > >  uint32_t val;
> > > > @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext 
> > > > *avctx)
> > > >  s->aligned_input = 0;
> > > >  ff_v210dec_init(s);
> > > >  
> > > > +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
> > > 
> > > why is there a MAX_SLICES ?
> > 
> > It's limit the slice thread count, if it's not OK, I can use 
> > MAX_AUTO_THREADS for max.
> 
> why is a limit needed here ?
> where does avctx->thread_count get a bad value ?
> 
> This feels a bit arbitrary to limit it to 32 (or any number)
> will that be still correct in 10 years ? if not then this is
> not a good way to limit it
Michael, I have fixed this and updated the patch; please review it.
May I also remove similar thread-count restrictions from other modules that
support threading?
They are misleading as well when used as a reference for multi-threaded code.

> 
> thx
> 
> [...]
> 
> -- 
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> The misfortune of the wise is better than the prosperity of the fool.
> -- Epicurus



> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-21 Thread Limin Wang
On Sat, Sep 21, 2019 at 11:04:27PM +0200, Michael Niedermayer wrote:
> On Sat, Sep 21, 2019 at 07:45:58AM +0800, Limin Wang wrote:
> > On Fri, Sep 20, 2019 at 07:57:10PM +0200, Michael Niedermayer wrote:
> > > On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> > > > From: Limin Wang 
> > > > 
> > > > The multithread is avoid one core cpu is full with other filter like 
> > > > scale etc.
> > > > About the performance, the gain is very small, below is my testing for
> > > > performance.
> > > > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 
> > > > frame
> > > > only.
> > > > 
> > > > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo 
> > > > -frames 10
> > > > ~/Movies/1.v210
> > > > 
> > > > master:
> > > > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> > > > -benchmark
> > > > -f null -
> > > > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > > > speed=1.69x
> > > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > > > muxing
> > > > overhead: unknown
> > > > bench: utime=10.082s stime=13.784s rtime=23.889s
> > > > bench: maxrss=147836928kB
> > > > 
> > > > patch applied:
> > > > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 
> > > > 100 -i
> > > > ~/Movies/1.v210 -benchmark -f null -
> > > > 
> > > > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > > > speed=2.22x
> > > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > > > muxing
> > > > overhead: unknown
> > > > bench: utime=11.407s stime=17.258s rtime=18.279s
> > > > bench: maxrss=442884096kB
> > > > 
> > > > Signed-off-by: Limin Wang 
> > > > ---
> > > >  libavcodec/v210dec.c | 135 
> > > > +--
> > > >  libavcodec/v210dec.h |   1 +
> > > >  2 files changed, 88 insertions(+), 48 deletions(-)
> > > > 
> > > > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > > > index 6ce18aa..2cdb99e 100644
> > > > --- a/libavcodec/v210dec.c
> > > > +++ b/libavcodec/v210dec.c
> > > > @@ -28,6 +28,7 @@
> > > >  #include "libavutil/internal.h"
> > > >  #include "libavutil/mem.h"
> > > >  #include "libavutil/intreadwrite.h"
> > > > +#include "thread.h"
> > > >  
> > > >  #define READ_PIXELS(a, b, c) \
> > > >  do { \
> > > > @@ -37,6 +38,13 @@
> > > >  *c++ = (val >> 20) & 0x3FF;  \
> > > >  } while (0)
> > > >  
> > > > +#define MAX_SLICES 32
> > > > +typedef struct ThreadData {
> > > > +AVFrame *frame;
> > > > +uint8_t *buf;
> > > > +int stride;
> > > > +} ThreadData;
> > > > +
> > > >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, 
> > > > uint16_t *u, uint16_t *v, int width)
> > > >  {
> > > >  uint32_t val;
> > > > @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext 
> > > > *avctx)
> > > >  s->aligned_input = 0;
> > > >  ff_v210dec_init(s);
> > > >  
> > > > +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
> > > 
> > > why is there a MAX_SLICES ?
> > 
> > It's limit the slice thread count, if it's not OK, I can use 
> > MAX_AUTO_THREADS for max.
> 
> why is a limit needed here ?
> where does avctx->thread_count get a bad value ?

No other limit is needed; thread_count only needs to be greater than 0. In my
testing, threads are chosen automatically even when using -threads 0, so it's
OK to remove it.

> 
> This feels a bit arbitrary to limit it to 32 (or any number)
> will that be still correct in 10 years ? if not then this is
> not a good way to limit it
> 
> thx
> 
> [...]
> 
> -- 
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> The misfortune of the wise is better than the prosperity of the fool.
> -- Epicurus



> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-21 Thread Michael Niedermayer
On Sat, Sep 21, 2019 at 07:45:58AM +0800, Limin Wang wrote:
> On Fri, Sep 20, 2019 at 07:57:10PM +0200, Michael Niedermayer wrote:
> > On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> > > From: Limin Wang 
> > > 
> > > The multithread is avoid one core cpu is full with other filter like 
> > > scale etc.
> > > About the performance, the gain is very small, below is my testing for
> > > performance.
> > > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 
> > > frame
> > > only.
> > > 
> > > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo 
> > > -frames 10
> > > ~/Movies/1.v210
> > > 
> > > master:
> > > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> > > -benchmark
> > > -f null -
> > > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > > speed=1.69x
> > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > > muxing
> > > overhead: unknown
> > > bench: utime=10.082s stime=13.784s rtime=23.889s
> > > bench: maxrss=147836928kB
> > > 
> > > patch applied:
> > > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 
> > > 100 -i
> > > ~/Movies/1.v210 -benchmark -f null -
> > > 
> > > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > > speed=2.22x
> > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > > muxing
> > > overhead: unknown
> > > bench: utime=11.407s stime=17.258s rtime=18.279s
> > > bench: maxrss=442884096kB
> > > 
> > > Signed-off-by: Limin Wang 
> > > ---
> > >  libavcodec/v210dec.c | 135 
> > > +--
> > >  libavcodec/v210dec.h |   1 +
> > >  2 files changed, 88 insertions(+), 48 deletions(-)
> > > 
> > > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > > index 6ce18aa..2cdb99e 100644
> > > --- a/libavcodec/v210dec.c
> > > +++ b/libavcodec/v210dec.c
> > > @@ -28,6 +28,7 @@
> > >  #include "libavutil/internal.h"
> > >  #include "libavutil/mem.h"
> > >  #include "libavutil/intreadwrite.h"
> > > +#include "thread.h"
> > >  
> > >  #define READ_PIXELS(a, b, c) \
> > >  do { \
> > > @@ -37,6 +38,13 @@
> > >  *c++ = (val >> 20) & 0x3FF;  \
> > >  } while (0)
> > >  
> > > +#define MAX_SLICES 32
> > > +typedef struct ThreadData {
> > > +AVFrame *frame;
> > > +uint8_t *buf;
> > > +int stride;
> > > +} ThreadData;
> > > +
> > >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, 
> > > uint16_t *u, uint16_t *v, int width)
> > >  {
> > >  uint32_t val;
> > > @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext *avctx)
> > >  s->aligned_input = 0;
> > >  ff_v210dec_init(s);
> > >  
> > > +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
> > 
> > why is there a MAX_SLICES ?
> 
> It's limit the slice thread count, if it's not OK, I can use MAX_AUTO_THREADS 
> for max.

why is a limit needed here ?
where does avctx->thread_count get a bad value ?

This feels a bit arbitrary to limit it to 32 (or any number)
will that be still correct in 10 years ? if not then this is
not a good way to limit it

thx

[...]

-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

The misfortune of the wise is better than the prosperity of the fool.
-- Epicurus


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-20 Thread Limin Wang
On Fri, Sep 20, 2019 at 07:57:10PM +0200, Michael Niedermayer wrote:
> On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> > From: Limin Wang 
> > 
> > The multithread is avoid one core cpu is full with other filter like scale 
> > etc.
> > About the performance, the gain is very small, below is my testing for
> > performance.
> > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 
> > frame
> > only.
> > 
> > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo 
> > -frames 10
> > ~/Movies/1.v210
> > 
> > master:
> > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> > -benchmark
> > -f null -
> > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > speed=1.69x
> > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > muxing
> > overhead: unknown
> > bench: utime=10.082s stime=13.784s rtime=23.889s
> > bench: maxrss=147836928kB
> > 
> > patch applied:
> > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 
> > -i
> > ~/Movies/1.v210 -benchmark -f null -
> > 
> > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > speed=2.22x
> > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > muxing
> > overhead: unknown
> > bench: utime=11.407s stime=17.258s rtime=18.279s
> > bench: maxrss=442884096kB
> > 
> > Signed-off-by: Limin Wang 
> > ---
> >  libavcodec/v210dec.c | 135 
> > +--
> >  libavcodec/v210dec.h |   1 +
> >  2 files changed, 88 insertions(+), 48 deletions(-)
> > 
> > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > index 6ce18aa..2cdb99e 100644
> > --- a/libavcodec/v210dec.c
> > +++ b/libavcodec/v210dec.c
> > @@ -28,6 +28,7 @@
> >  #include "libavutil/internal.h"
> >  #include "libavutil/mem.h"
> >  #include "libavutil/intreadwrite.h"
> > +#include "thread.h"
> >  
> >  #define READ_PIXELS(a, b, c) \
> >  do { \
> > @@ -37,6 +38,13 @@
> >  *c++ = (val >> 20) & 0x3FF;  \
> >  } while (0)
> >  
> > +#define MAX_SLICES 32
> > +typedef struct ThreadData {
> > +AVFrame *frame;
> > +uint8_t *buf;
> > +int stride;
> > +} ThreadData;
> > +
> >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, 
> > uint16_t *u, uint16_t *v, int width)
> >  {
> >  uint32_t val;
> > @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext *avctx)
> >  s->aligned_input = 0;
> >  ff_v210dec_init(s);
> >  
> > +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
> 
> why is there a MAX_SLICES ?

It limits the slice thread count. If that's not OK, I can use MAX_AUTO_THREADS
as the maximum.

> 
> [...]
> > @@ -193,6 +228,10 @@ AVCodec ff_v210_decoder = {
> >  .priv_data_size = sizeof(V210DecContext),
> >  .init   = decode_init,
> >  .decode = decode_frame,
> > -.capabilities   = AV_CODEC_CAP_DR1,
> > +.capabilities   = AV_CODEC_CAP_DR1 |
> > +  AV_CODEC_CAP_SLICE_THREADS |
> > +  AV_CODEC_CAP_FRAME_THREADS,
> >  .priv_class = _class,
> 
> > +.caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
> > +  FF_CODEC_CAP_INIT_CLEANUP,
> 
> This appears unrelated and if so should be in a seperate patch
OK, will split it.

> 
> [...]
> -- 
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> You can kill me, but you cannot change the truth.



> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-20 Thread Michael Niedermayer
On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> From: Limin Wang 
> 
> The multithread is avoid one core cpu is full with other filter like scale 
> etc.
> About the performance, the gain is very small, below is my testing for
> performance.
> In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> only.
> 
> ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 
> 10
> ~/Movies/1.v210
> 
> master:
> ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> -benchmark
> -f null -
> frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=10.082s stime=13.784s rtime=23.889s
> bench: maxrss=147836928kB
> 
> patch applied:
> ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> ~/Movies/1.v210 -benchmark -f null -
> 
> frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=11.407s stime=17.258s rtime=18.279s
> bench: maxrss=442884096kB
> 
> Signed-off-by: Limin Wang 
> ---
>  libavcodec/v210dec.c | 135 
> +--
>  libavcodec/v210dec.h |   1 +
>  2 files changed, 88 insertions(+), 48 deletions(-)
> 
> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> index 6ce18aa..2cdb99e 100644
> --- a/libavcodec/v210dec.c
> +++ b/libavcodec/v210dec.c
> @@ -28,6 +28,7 @@
>  #include "libavutil/internal.h"
>  #include "libavutil/mem.h"
>  #include "libavutil/intreadwrite.h"
> +#include "thread.h"
>  
>  #define READ_PIXELS(a, b, c) \
>  do { \
> @@ -37,6 +38,13 @@
>  *c++ = (val >> 20) & 0x3FF;  \
>  } while (0)
>  
> +#define MAX_SLICES 32
> +typedef struct ThreadData {
> +AVFrame *frame;
> +uint8_t *buf;
> +int stride;
> +} ThreadData;
> +
>  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t 
> *u, uint16_t *v, int width)
>  {
>  uint32_t val;
> @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext *avctx)
>  s->aligned_input = 0;
>  ff_v210dec_init(s);
>  
> +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);

why is there a MAX_SLICES ?


[...]
> @@ -193,6 +228,10 @@ AVCodec ff_v210_decoder = {
>  .priv_data_size = sizeof(V210DecContext),
>  .init   = decode_init,
>  .decode = decode_frame,
> -.capabilities   = AV_CODEC_CAP_DR1,
> +.capabilities   = AV_CODEC_CAP_DR1 |
> +  AV_CODEC_CAP_SLICE_THREADS |
> +  AV_CODEC_CAP_FRAME_THREADS,
>  .priv_class = _class,

> +.caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
> +  FF_CODEC_CAP_INIT_CLEANUP,

This appears unrelated and if so should be in a separate patch

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

You can kill me, but you cannot change the truth.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-19 Thread Limin Wang

Another friendly ping.

On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> From: Limin Wang 
> 
> The multithread is avoid one core cpu is full with other filter like scale 
> etc.
> About the performance, the gain is very small, below is my testing for
> performance.
> In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> only.
> 
> ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 
> 10
> ~/Movies/1.v210
> 
> master:
> ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> -benchmark
> -f null -
> frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=10.082s stime=13.784s rtime=23.889s
> bench: maxrss=147836928kB
> 
> patch applied:
> ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> ~/Movies/1.v210 -benchmark -f null -
> 
> frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=11.407s stime=17.258s rtime=18.279s
> bench: maxrss=442884096kB
> 
> Signed-off-by: Limin Wang 
> ---
>  libavcodec/v210dec.c | 135 
> +--
>  libavcodec/v210dec.h |   1 +
>  2 files changed, 88 insertions(+), 48 deletions(-)
> 
> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> index 6ce18aa..2cdb99e 100644
> --- a/libavcodec/v210dec.c
> +++ b/libavcodec/v210dec.c
> @@ -28,6 +28,7 @@
>  #include "libavutil/internal.h"
>  #include "libavutil/mem.h"
>  #include "libavutil/intreadwrite.h"
> +#include "thread.h"
>  
>  #define READ_PIXELS(a, b, c) \
>  do { \
> @@ -37,6 +38,13 @@
>  *c++ = (val >> 20) & 0x3FF;  \
>  } while (0)
>  
> +#define MAX_SLICES 32
> +typedef struct ThreadData {
> +AVFrame *frame;
> +uint8_t *buf;
> +int stride;
> +} ThreadData;
> +
>  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t 
> *u, uint16_t *v, int width)
>  {
>  uint32_t val;
> @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext *avctx)
>  s->aligned_input = 0;
>  ff_v210dec_init(s);
>  
> +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
>  return 0;
>  }
>  
> -static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
> -AVPacket *avpkt)
> +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, 
> int nb_jobs)
>  {
>  V210DecContext *s = avctx->priv_data;
> -
> -int h, w, ret, stride, aligned_input;
> -AVFrame *pic = data;
> -const uint8_t *psrc = avpkt->data;
> +int h, w;
> +ThreadData *td = arg;
> +AVFrame *frame = td->frame;
> +int stride = td->stride;
> +int slice_h = avctx->height / s->slice_count;
> +int slice_m = avctx->height % s->slice_count;
> +int slice_start = jobnr * slice_h;
> +int slice_end = slice_start + slice_h;
> +const uint8_t *psrc = td->buf + stride * slice_start;
>  uint16_t *y, *u, *v;
>  
> -if (s->custom_stride )
> -stride = s->custom_stride;
> -else {
> -int aligned_width = ((avctx->width + 47) / 48) * 48;
> -stride = aligned_width * 8 / 3;
> -}
> -
> -if (avpkt->size < stride * avctx->height) {
> -if avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == 
> avpkt->size) {
> -stride = avpkt->size / avctx->height;
> -if (!s->stride_warning_shown)
> -av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small 
> padding (64 byte) detected\n");
> -s->stride_warning_shown = 1;
> -} else {
> -av_log(avctx, AV_LOG_ERROR, "packet too small\n");
> -return AVERROR_INVALIDDATA;
> -}
> -}
> -if (avctx->codec_tag == MKTAG('C', '2', '1', '0')
> -&& AV_RN32(psrc) == AV_RN32("INFO")
> -&& avpkt->size - 64 >= stride * avctx->height)
> -psrc += 64;
> -
> -aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f);
> -if (aligned_input != s->aligned_input) {
> -s->aligned_input = aligned_input;
> -ff_v210dec_init(s);
> -}
> -
> -if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
> -return ret;
> -
> -y = (uint16_t*)pic->data[0];
> -u = (uint16_t*)pic->data[1];
> -v = (uint16_t*)pic->data[2];
> -pic->pict_type = AV_PICTURE_TYPE_I;
> -pic->key_frame = 1;
> +/* add the remaining slice for the last job */
> +if (jobnr == s->slice_count - 1)
> +slice_end += slice_m;
>  
> -for (h = 0; h < avctx->height; h++) {
> +y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2;
> +u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2;
> +v = 

Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-12 Thread Limin Wang

ping the v210 decode thread.

On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> From: Limin Wang 
> 
> The multithread is avoid one core cpu is full with other filter like scale 
> etc.
> About the performance, the gain is very small, below is my testing for
> performance.
> In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> only.
> 
> ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 
> 10
> ~/Movies/1.v210
> 
> master:
> ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> -benchmark
> -f null -
> frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=10.082s stime=13.784s rtime=23.889s
> bench: maxrss=147836928kB
> 
> patch applied:
> ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> ~/Movies/1.v210 -benchmark -f null -
> 
> frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=11.407s stime=17.258s rtime=18.279s
> bench: maxrss=442884096kB
> 
> Signed-off-by: Limin Wang 
> ---
>  libavcodec/v210dec.c | 135 
> +--
>  libavcodec/v210dec.h |   1 +
>  2 files changed, 88 insertions(+), 48 deletions(-)
> 
> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> index 6ce18aa..2cdb99e 100644
> --- a/libavcodec/v210dec.c
> +++ b/libavcodec/v210dec.c
> @@ -28,6 +28,7 @@
>  #include "libavutil/internal.h"
>  #include "libavutil/mem.h"
>  #include "libavutil/intreadwrite.h"
> +#include "thread.h"
>  
>  #define READ_PIXELS(a, b, c) \
>  do { \
> @@ -37,6 +38,13 @@
>  *c++ = (val >> 20) & 0x3FF;  \
>  } while (0)
>  
> +#define MAX_SLICES 32
> +typedef struct ThreadData {
> +AVFrame *frame;
> +uint8_t *buf;
> +int stride;
> +} ThreadData;
> +
>  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t 
> *u, uint16_t *v, int width)
>  {
>  uint32_t val;
> @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext *avctx)
>  s->aligned_input = 0;
>  ff_v210dec_init(s);
>  
> +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
>  return 0;
>  }
>  
> -static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
> -AVPacket *avpkt)
> +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, 
> int nb_jobs)
>  {
>  V210DecContext *s = avctx->priv_data;
> -
> -int h, w, ret, stride, aligned_input;
> -AVFrame *pic = data;
> -const uint8_t *psrc = avpkt->data;
> +int h, w;
> +ThreadData *td = arg;
> +AVFrame *frame = td->frame;
> +int stride = td->stride;
> +int slice_h = avctx->height / s->slice_count;
> +int slice_m = avctx->height % s->slice_count;
> +int slice_start = jobnr * slice_h;
> +int slice_end = slice_start + slice_h;
> +const uint8_t *psrc = td->buf + stride * slice_start;
>  uint16_t *y, *u, *v;
>  
> -if (s->custom_stride )
> -stride = s->custom_stride;
> -else {
> -int aligned_width = ((avctx->width + 47) / 48) * 48;
> -stride = aligned_width * 8 / 3;
> -}
> -
> -if (avpkt->size < stride * avctx->height) {
> -if avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == 
> avpkt->size) {
> -stride = avpkt->size / avctx->height;
> -if (!s->stride_warning_shown)
> -av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small 
> padding (64 byte) detected\n");
> -s->stride_warning_shown = 1;
> -} else {
> -av_log(avctx, AV_LOG_ERROR, "packet too small\n");
> -return AVERROR_INVALIDDATA;
> -}
> -}
> -if (avctx->codec_tag == MKTAG('C', '2', '1', '0')
> -&& AV_RN32(psrc) == AV_RN32("INFO")
> -&& avpkt->size - 64 >= stride * avctx->height)
> -psrc += 64;
> -
> -aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f);
> -if (aligned_input != s->aligned_input) {
> -s->aligned_input = aligned_input;
> -ff_v210dec_init(s);
> -}
> -
> -if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
> -return ret;
> -
> -y = (uint16_t*)pic->data[0];
> -u = (uint16_t*)pic->data[1];
> -v = (uint16_t*)pic->data[2];
> -pic->pict_type = AV_PICTURE_TYPE_I;
> -pic->key_frame = 1;
> +/* add the remaining slice for the last job */
> +if (jobnr == s->slice_count - 1)
> +slice_end += slice_m;
>  
> -for (h = 0; h < avctx->height; h++) {
> +y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2;
> +u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2;
> +v = 

[FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-06 Thread lance . lmwang
From: Limin Wang 

Multithreading avoids saturating a single CPU core when the decoder runs
together with other filters such as scale.
The performance gain is very small; my test results are below.
To avoid the disk bottleneck, I use stream_loop mode on a file containing only
10 frames.

./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
~/Movies/1.v210

master:
./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
-f null -
frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
overhead: unknown
bench: utime=10.082s stime=13.784s rtime=23.889s
bench: maxrss=147836928kB

patch applied:
./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
~/Movies/1.v210 -benchmark -f null -

frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
overhead: unknown
bench: utime=11.407s stime=17.258s rtime=18.279s
bench: maxrss=442884096kB

Signed-off-by: Limin Wang 
---
 libavcodec/v210dec.c | 135 +--
 libavcodec/v210dec.h |   1 +
 2 files changed, 88 insertions(+), 48 deletions(-)

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index 6ce18aa..2cdb99e 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -28,6 +28,7 @@
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/intreadwrite.h"
+#include "thread.h"
 
 #define READ_PIXELS(a, b, c) \
 do { \
@@ -37,6 +38,13 @@
 *c++ = (val >> 20) & 0x3FF;  \
 } while (0)
 
+#define MAX_SLICES 32
+typedef struct ThreadData {
+AVFrame *frame;
+uint8_t *buf;
+int stride;
+} ThreadData;
+
 static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t 
*u, uint16_t *v, int width)
 {
 uint32_t val;
@@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext *avctx)
 s->aligned_input = 0;
 ff_v210dec_init(s);
 
+s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
 return 0;
 }
 
-static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
-AVPacket *avpkt)
+static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int 
nb_jobs)
 {
 V210DecContext *s = avctx->priv_data;
-
-int h, w, ret, stride, aligned_input;
-AVFrame *pic = data;
-const uint8_t *psrc = avpkt->data;
+int h, w;
+ThreadData *td = arg;
+AVFrame *frame = td->frame;
+int stride = td->stride;
+int slice_h = avctx->height / s->slice_count;
+int slice_m = avctx->height % s->slice_count;
+int slice_start = jobnr * slice_h;
+int slice_end = slice_start + slice_h;
+const uint8_t *psrc = td->buf + stride * slice_start;
 uint16_t *y, *u, *v;
 
-if (s->custom_stride )
-stride = s->custom_stride;
-else {
-int aligned_width = ((avctx->width + 47) / 48) * 48;
-stride = aligned_width * 8 / 3;
-}
-
-if (avpkt->size < stride * avctx->height) {
-if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
-stride = avpkt->size / avctx->height;
-if (!s->stride_warning_shown)
-av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small 
padding (64 byte) detected\n");
-s->stride_warning_shown = 1;
-} else {
-av_log(avctx, AV_LOG_ERROR, "packet too small\n");
-return AVERROR_INVALIDDATA;
-}
-}
-if (avctx->codec_tag == MKTAG('C', '2', '1', '0')
-&& AV_RN32(psrc) == AV_RN32("INFO")
-&& avpkt->size - 64 >= stride * avctx->height)
-psrc += 64;
-
-aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f);
-if (aligned_input != s->aligned_input) {
-s->aligned_input = aligned_input;
-ff_v210dec_init(s);
-}
-
-if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
-return ret;
-
-y = (uint16_t*)pic->data[0];
-u = (uint16_t*)pic->data[1];
-v = (uint16_t*)pic->data[2];
-pic->pict_type = AV_PICTURE_TYPE_I;
-pic->key_frame = 1;
+/* add the remaining slice for the last job */
+if (jobnr == s->slice_count - 1)
+slice_end += slice_m;
 
-for (h = 0; h < avctx->height; h++) {
+y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2;
+u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2;
+v = (uint16_t*)frame->data[2] + slice_start * frame->linesize[2] / 2;
+for (h = slice_start; h < slice_end; h++) {
 const uint32_t *src = (const uint32_t*)psrc;
 uint32_t val;
 
@@ -154,10 +136,63 @@ static int decode_frame(AVCodecContext *avctx, void 
*data, int *got_frame,
 }
 
 psrc += stride;
-y += pic->linesize[0] / 2 - avctx->width + (avctx->width & 1);
-