Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-22 Thread Limin Wang
On Sat, Sep 21, 2019 at 11:04:27PM +0200, Michael Niedermayer wrote:
> On Sat, Sep 21, 2019 at 07:45:58AM +0800, Limin Wang wrote:
> > On Fri, Sep 20, 2019 at 07:57:10PM +0200, Michael Niedermayer wrote:
> > > On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> > > > From: Limin Wang 
> > > > 
> > > > The multithread is avoid one core cpu is full with other filter like 
> > > > scale etc.
> > > > About the performance, the gain is very small, below is my testing for
> > > > performance.
> > > > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 
> > > > frame
> > > > only.
> > > > 
> > > > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo 
> > > > -frames 10
> > > > ~/Movies/1.v210
> > > > 
> > > > master:
> > > > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> > > > -benchmark
> > > > -f null -
> > > > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > > > speed=1.69x
> > > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > > > muxing
> > > > overhead: unknown
> > > > bench: utime=10.082s stime=13.784s rtime=23.889s
> > > > bench: maxrss=147836928kB
> > > > 
> > > > patch applied:
> > > > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 
> > > > 100 -i
> > > > ~/Movies/1.v210 -benchmark -f null -
> > > > 
> > > > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > > > speed=2.22x
> > > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > > > muxing
> > > > overhead: unknown
> > > > bench: utime=11.407s stime=17.258s rtime=18.279s
> > > > bench: maxrss=442884096kB
> > > > 
> > > > Signed-off-by: Limin Wang 
> > > > ---
> > > >  libavcodec/v210dec.c | 135 
> > > > +--
> > > >  libavcodec/v210dec.h |   1 +
> > > >  2 files changed, 88 insertions(+), 48 deletions(-)
> > > > 
> > > > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > > > index 6ce18aa..2cdb99e 100644
> > > > --- a/libavcodec/v210dec.c
> > > > +++ b/libavcodec/v210dec.c
> > > > @@ -28,6 +28,7 @@
> > > >  #include "libavutil/internal.h"
> > > >  #include "libavutil/mem.h"
> > > >  #include "libavutil/intreadwrite.h"
> > > > +#include "thread.h"
> > > >  
> > > >  #define READ_PIXELS(a, b, c) \
> > > >  do { \
> > > > @@ -37,6 +38,13 @@
> > > >  *c++ = (val >> 20) & 0x3FF;  \
> > > >  } while (0)
> > > >  
> > > > +#define MAX_SLICES 32
> > > > +typedef struct ThreadData {
> > > > +AVFrame *frame;
> > > > +uint8_t *buf;
> > > > +int stride;
> > > > +} ThreadData;
> > > > +
> > > >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, 
> > > > uint16_t *u, uint16_t *v, int width)
> > > >  {
> > > >  uint32_t val;
> > > > @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext 
> > > > *avctx)
> > > >  s->aligned_input = 0;
> > > >  ff_v210dec_init(s);
> > > >  
> > > > +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
> > > 
> > > why is there a MAX_SLICES ?
> > 
> > It's limit the slice thread count, if it's not OK, I can use 
> > MAX_AUTO_THREADS for max.
> 
> why is a limit needed here ?
> where does avctx->thread_count get a bad value ?
> 
> This feels a bit arbitrary to limit it to 32 (or any number)
> will that be still correct in 10 years ? if not then this is
> not a good way to limit it
Michael, I have fixed and updated the patch; please review it.
Can I remove similar thread restrictions from other modules that support 
threads? 
They are also misleading when that code is used as a reference for multi-threading.

> 
> thx
> 
> [...]
> 
> -- 
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> The misfortune of the wise is better than the prosperity of the fool.
> -- Epicurus



> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-21 Thread Limin Wang
On Sat, Sep 21, 2019 at 11:04:27PM +0200, Michael Niedermayer wrote:
> On Sat, Sep 21, 2019 at 07:45:58AM +0800, Limin Wang wrote:
> > On Fri, Sep 20, 2019 at 07:57:10PM +0200, Michael Niedermayer wrote:
> > > On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> > > > From: Limin Wang 
> > > > 
> > > > The multithread is avoid one core cpu is full with other filter like 
> > > > scale etc.
> > > > About the performance, the gain is very small, below is my testing for
> > > > performance.
> > > > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 
> > > > frame
> > > > only.
> > > > 
> > > > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo 
> > > > -frames 10
> > > > ~/Movies/1.v210
> > > > 
> > > > master:
> > > > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> > > > -benchmark
> > > > -f null -
> > > > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > > > speed=1.69x
> > > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > > > muxing
> > > > overhead: unknown
> > > > bench: utime=10.082s stime=13.784s rtime=23.889s
> > > > bench: maxrss=147836928kB
> > > > 
> > > > patch applied:
> > > > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 
> > > > 100 -i
> > > > ~/Movies/1.v210 -benchmark -f null -
> > > > 
> > > > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > > > speed=2.22x
> > > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > > > muxing
> > > > overhead: unknown
> > > > bench: utime=11.407s stime=17.258s rtime=18.279s
> > > > bench: maxrss=442884096kB
> > > > 
> > > > Signed-off-by: Limin Wang 
> > > > ---
> > > >  libavcodec/v210dec.c | 135 
> > > > +--
> > > >  libavcodec/v210dec.h |   1 +
> > > >  2 files changed, 88 insertions(+), 48 deletions(-)
> > > > 
> > > > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > > > index 6ce18aa..2cdb99e 100644
> > > > --- a/libavcodec/v210dec.c
> > > > +++ b/libavcodec/v210dec.c
> > > > @@ -28,6 +28,7 @@
> > > >  #include "libavutil/internal.h"
> > > >  #include "libavutil/mem.h"
> > > >  #include "libavutil/intreadwrite.h"
> > > > +#include "thread.h"
> > > >  
> > > >  #define READ_PIXELS(a, b, c) \
> > > >  do { \
> > > > @@ -37,6 +38,13 @@
> > > >  *c++ = (val >> 20) & 0x3FF;  \
> > > >  } while (0)
> > > >  
> > > > +#define MAX_SLICES 32
> > > > +typedef struct ThreadData {
> > > > +AVFrame *frame;
> > > > +uint8_t *buf;
> > > > +int stride;
> > > > +} ThreadData;
> > > > +
> > > >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, 
> > > > uint16_t *u, uint16_t *v, int width)
> > > >  {
> > > >  uint32_t val;
> > > > @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext 
> > > > *avctx)
> > > >  s->aligned_input = 0;
> > > >  ff_v210dec_init(s);
> > > >  
> > > > +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
> > > 
> > > why is there a MAX_SLICES ?
> > 
> > It's limit the slice thread count, if it's not OK, I can use 
> > MAX_AUTO_THREADS for max.
> 
> why is a limit needed here ?
> where does avctx->thread_count get a bad value ?

There is no other limit; thread_count only needs to be greater than 0. In my 
testing, threads are selected automatically even
when using -threads 0, so it's OK to remove it.

> 
> This feels a bit arbitrary to limit it to 32 (or any number)
> will that be still correct in 10 years ? if not then this is
> not a good way to limit it
> 
> thx
> 
> [...]
> 
> -- 
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> The misfortune of the wise is better than the prosperity of the fool.
> -- Epicurus



> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-21 Thread Michael Niedermayer
On Sat, Sep 21, 2019 at 07:45:58AM +0800, Limin Wang wrote:
> On Fri, Sep 20, 2019 at 07:57:10PM +0200, Michael Niedermayer wrote:
> > On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> > > From: Limin Wang 
> > > 
> > > The multithread is avoid one core cpu is full with other filter like 
> > > scale etc.
> > > About the performance, the gain is very small, below is my testing for
> > > performance.
> > > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 
> > > frame
> > > only.
> > > 
> > > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo 
> > > -frames 10
> > > ~/Movies/1.v210
> > > 
> > > master:
> > > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> > > -benchmark
> > > -f null -
> > > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > > speed=1.69x
> > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > > muxing
> > > overhead: unknown
> > > bench: utime=10.082s stime=13.784s rtime=23.889s
> > > bench: maxrss=147836928kB
> > > 
> > > patch applied:
> > > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 
> > > 100 -i
> > > ~/Movies/1.v210 -benchmark -f null -
> > > 
> > > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > > speed=2.22x
> > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > > muxing
> > > overhead: unknown
> > > bench: utime=11.407s stime=17.258s rtime=18.279s
> > > bench: maxrss=442884096kB
> > > 
> > > Signed-off-by: Limin Wang 
> > > ---
> > >  libavcodec/v210dec.c | 135 
> > > +--
> > >  libavcodec/v210dec.h |   1 +
> > >  2 files changed, 88 insertions(+), 48 deletions(-)
> > > 
> > > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > > index 6ce18aa..2cdb99e 100644
> > > --- a/libavcodec/v210dec.c
> > > +++ b/libavcodec/v210dec.c
> > > @@ -28,6 +28,7 @@
> > >  #include "libavutil/internal.h"
> > >  #include "libavutil/mem.h"
> > >  #include "libavutil/intreadwrite.h"
> > > +#include "thread.h"
> > >  
> > >  #define READ_PIXELS(a, b, c) \
> > >  do { \
> > > @@ -37,6 +38,13 @@
> > >  *c++ = (val >> 20) & 0x3FF;  \
> > >  } while (0)
> > >  
> > > +#define MAX_SLICES 32
> > > +typedef struct ThreadData {
> > > +AVFrame *frame;
> > > +uint8_t *buf;
> > > +int stride;
> > > +} ThreadData;
> > > +
> > >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, 
> > > uint16_t *u, uint16_t *v, int width)
> > >  {
> > >  uint32_t val;
> > > @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext *avctx)
> > >  s->aligned_input = 0;
> > >  ff_v210dec_init(s);
> > >  
> > > +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
> > 
> > why is there a MAX_SLICES ?
> 
> It's limit the slice thread count, if it's not OK, I can use MAX_AUTO_THREADS 
> for max.

why is a limit needed here ?
where does avctx->thread_count get a bad value ?

This feels a bit arbitrary to limit it to 32 (or any number)
will that be still correct in 10 years ? if not then this is
not a good way to limit it

thx

[...]

-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

The misfortune of the wise is better than the prosperity of the fool.
-- Epicurus


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-20 Thread Limin Wang
On Fri, Sep 20, 2019 at 07:57:10PM +0200, Michael Niedermayer wrote:
> On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> > From: Limin Wang 
> > 
> > The multithread is avoid one core cpu is full with other filter like scale 
> > etc.
> > About the performance, the gain is very small, below is my testing for
> > performance.
> > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 
> > frame
> > only.
> > 
> > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo 
> > -frames 10
> > ~/Movies/1.v210
> > 
> > master:
> > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> > -benchmark
> > -f null -
> > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > speed=1.69x
> > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > muxing
> > overhead: unknown
> > bench: utime=10.082s stime=13.784s rtime=23.889s
> > bench: maxrss=147836928kB
> > 
> > patch applied:
> > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 
> > -i
> > ~/Movies/1.v210 -benchmark -f null -
> > 
> > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A 
> > speed=2.22x
> > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB 
> > muxing
> > overhead: unknown
> > bench: utime=11.407s stime=17.258s rtime=18.279s
> > bench: maxrss=442884096kB
> > 
> > Signed-off-by: Limin Wang 
> > ---
> >  libavcodec/v210dec.c | 135 
> > +--
> >  libavcodec/v210dec.h |   1 +
> >  2 files changed, 88 insertions(+), 48 deletions(-)
> > 
> > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > index 6ce18aa..2cdb99e 100644
> > --- a/libavcodec/v210dec.c
> > +++ b/libavcodec/v210dec.c
> > @@ -28,6 +28,7 @@
> >  #include "libavutil/internal.h"
> >  #include "libavutil/mem.h"
> >  #include "libavutil/intreadwrite.h"
> > +#include "thread.h"
> >  
> >  #define READ_PIXELS(a, b, c) \
> >  do { \
> > @@ -37,6 +38,13 @@
> >  *c++ = (val >> 20) & 0x3FF;  \
> >  } while (0)
> >  
> > +#define MAX_SLICES 32
> > +typedef struct ThreadData {
> > +AVFrame *frame;
> > +uint8_t *buf;
> > +int stride;
> > +} ThreadData;
> > +
> >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, 
> > uint16_t *u, uint16_t *v, int width)
> >  {
> >  uint32_t val;
> > @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext *avctx)
> >  s->aligned_input = 0;
> >  ff_v210dec_init(s);
> >  
> > +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
> 
> why is there a MAX_SLICES ?

It limits the slice thread count. If that's not OK, I can use MAX_AUTO_THREADS 
as the maximum.

> 
> [...]
> > @@ -193,6 +228,10 @@ AVCodec ff_v210_decoder = {
> >  .priv_data_size = sizeof(V210DecContext),
> >  .init   = decode_init,
> >  .decode = decode_frame,
> > -.capabilities   = AV_CODEC_CAP_DR1,
> > +.capabilities   = AV_CODEC_CAP_DR1 |
> > +  AV_CODEC_CAP_SLICE_THREADS |
> > +  AV_CODEC_CAP_FRAME_THREADS,
> >  .priv_class = &v210dec_class,
> 
> > +.caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
> > +  FF_CODEC_CAP_INIT_CLEANUP,
> 
> This appears unrelated and if so should be in a seperate patch
OK, will split it.

> 
> [...]
> -- 
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> You can kill me, but you cannot change the truth.



> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-20 Thread Michael Niedermayer
On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> From: Limin Wang 
> 
> The multithread is avoid one core cpu is full with other filter like scale 
> etc.
> About the performance, the gain is very small, below is my testing for
> performance.
> In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> only.
> 
> ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 
> 10
> ~/Movies/1.v210
> 
> master:
> ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> -benchmark
> -f null -
> frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=10.082s stime=13.784s rtime=23.889s
> bench: maxrss=147836928kB
> 
> patch applied:
> ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> ~/Movies/1.v210 -benchmark -f null -
> 
> frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=11.407s stime=17.258s rtime=18.279s
> bench: maxrss=442884096kB
> 
> Signed-off-by: Limin Wang 
> ---
>  libavcodec/v210dec.c | 135 
> +--
>  libavcodec/v210dec.h |   1 +
>  2 files changed, 88 insertions(+), 48 deletions(-)
> 
> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> index 6ce18aa..2cdb99e 100644
> --- a/libavcodec/v210dec.c
> +++ b/libavcodec/v210dec.c
> @@ -28,6 +28,7 @@
>  #include "libavutil/internal.h"
>  #include "libavutil/mem.h"
>  #include "libavutil/intreadwrite.h"
> +#include "thread.h"
>  
>  #define READ_PIXELS(a, b, c) \
>  do { \
> @@ -37,6 +38,13 @@
>  *c++ = (val >> 20) & 0x3FF;  \
>  } while (0)
>  
> +#define MAX_SLICES 32
> +typedef struct ThreadData {
> +AVFrame *frame;
> +uint8_t *buf;
> +int stride;
> +} ThreadData;
> +
>  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t 
> *u, uint16_t *v, int width)
>  {
>  uint32_t val;
> @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext *avctx)
>  s->aligned_input = 0;
>  ff_v210dec_init(s);
>  
> +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);

why is there a MAX_SLICES ?


[...]
> @@ -193,6 +228,10 @@ AVCodec ff_v210_decoder = {
>  .priv_data_size = sizeof(V210DecContext),
>  .init   = decode_init,
>  .decode = decode_frame,
> -.capabilities   = AV_CODEC_CAP_DR1,
> +.capabilities   = AV_CODEC_CAP_DR1 |
> +  AV_CODEC_CAP_SLICE_THREADS |
> +  AV_CODEC_CAP_FRAME_THREADS,
>  .priv_class = &v210dec_class,

> +.caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
> +  FF_CODEC_CAP_INIT_CLEANUP,

This appears unrelated and if so should be in a separate patch

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

You can kill me, but you cannot change the truth.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-19 Thread Limin Wang

Another friendly ping.

On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> From: Limin Wang 
> 
> The multithread is avoid one core cpu is full with other filter like scale 
> etc.
> About the performance, the gain is very small, below is my testing for
> performance.
> In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> only.
> 
> ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 
> 10
> ~/Movies/1.v210
> 
> master:
> ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> -benchmark
> -f null -
> frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=10.082s stime=13.784s rtime=23.889s
> bench: maxrss=147836928kB
> 
> patch applied:
> ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> ~/Movies/1.v210 -benchmark -f null -
> 
> frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=11.407s stime=17.258s rtime=18.279s
> bench: maxrss=442884096kB
> 
> Signed-off-by: Limin Wang 
> ---
>  libavcodec/v210dec.c | 135 
> +--
>  libavcodec/v210dec.h |   1 +
>  2 files changed, 88 insertions(+), 48 deletions(-)
> 
> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> index 6ce18aa..2cdb99e 100644
> --- a/libavcodec/v210dec.c
> +++ b/libavcodec/v210dec.c
> @@ -28,6 +28,7 @@
>  #include "libavutil/internal.h"
>  #include "libavutil/mem.h"
>  #include "libavutil/intreadwrite.h"
> +#include "thread.h"
>  
>  #define READ_PIXELS(a, b, c) \
>  do { \
> @@ -37,6 +38,13 @@
>  *c++ = (val >> 20) & 0x3FF;  \
>  } while (0)
>  
> +#define MAX_SLICES 32
> +typedef struct ThreadData {
> +AVFrame *frame;
> +uint8_t *buf;
> +int stride;
> +} ThreadData;
> +
>  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t 
> *u, uint16_t *v, int width)
>  {
>  uint32_t val;
> @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext *avctx)
>  s->aligned_input = 0;
>  ff_v210dec_init(s);
>  
> +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
>  return 0;
>  }
>  
> -static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
> -AVPacket *avpkt)
> +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, 
> int nb_jobs)
>  {
>  V210DecContext *s = avctx->priv_data;
> -
> -int h, w, ret, stride, aligned_input;
> -AVFrame *pic = data;
> -const uint8_t *psrc = avpkt->data;
> +int h, w;
> +ThreadData *td = arg;
> +AVFrame *frame = td->frame;
> +int stride = td->stride;
> +int slice_h = avctx->height / s->slice_count;
> +int slice_m = avctx->height % s->slice_count;
> +int slice_start = jobnr * slice_h;
> +int slice_end = slice_start + slice_h;
> +const uint8_t *psrc = td->buf + stride * slice_start;
>  uint16_t *y, *u, *v;
>  
> -if (s->custom_stride )
> -stride = s->custom_stride;
> -else {
> -int aligned_width = ((avctx->width + 47) / 48) * 48;
> -stride = aligned_width * 8 / 3;
> -}
> -
> -if (avpkt->size < stride * avctx->height) {
> -if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == 
> avpkt->size) {
> -stride = avpkt->size / avctx->height;
> -if (!s->stride_warning_shown)
> -av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small 
> padding (64 byte) detected\n");
> -s->stride_warning_shown = 1;
> -} else {
> -av_log(avctx, AV_LOG_ERROR, "packet too small\n");
> -return AVERROR_INVALIDDATA;
> -}
> -}
> -if (avctx->codec_tag == MKTAG('C', '2', '1', '0')
> -&& AV_RN32(psrc) == AV_RN32("INFO")
> -&& avpkt->size - 64 >= stride * avctx->height)
> -psrc += 64;
> -
> -aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f);
> -if (aligned_input != s->aligned_input) {
> -s->aligned_input = aligned_input;
> -ff_v210dec_init(s);
> -}
> -
> -if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
> -return ret;
> -
> -y = (uint16_t*)pic->data[0];
> -u = (uint16_t*)pic->data[1];
> -v = (uint16_t*)pic->data[2];
> -pic->pict_type = AV_PICTURE_TYPE_I;
> -pic->key_frame = 1;
> +/* add the remaining slice for the last job */
> +if (jobnr == s->slice_count - 1)
> +slice_end += slice_m;
>  
> -for (h = 0; h < avctx->height; h++) {
> +y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2;
> +u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2;
> +v = (uint16_t*)frame->d

Re: [FFmpeg-devel] [PATCH v2 2/2] avcodec/v210dec: add the frame and slice threading support

2019-09-12 Thread Limin Wang

ping the v210 decode thread.

On Fri, Sep 06, 2019 at 11:28:29PM +0800, lance.lmw...@gmail.com wrote:
> From: Limin Wang 
> 
> The multithread is avoid one core cpu is full with other filter like scale 
> etc.
> About the performance, the gain is very small, below is my testing for
> performance.
> In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> only.
> 
> ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 
> 10
> ~/Movies/1.v210
> 
> master:
> ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 
> -benchmark
> -f null -
> frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=10.082s stime=13.784s rtime=23.889s
> bench: maxrss=147836928kB
> 
> patch applied:
> ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> ~/Movies/1.v210 -benchmark -f null -
> 
> frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=11.407s stime=17.258s rtime=18.279s
> bench: maxrss=442884096kB
> 
> Signed-off-by: Limin Wang 
> ---
>  libavcodec/v210dec.c | 135 
> +--
>  libavcodec/v210dec.h |   1 +
>  2 files changed, 88 insertions(+), 48 deletions(-)
> 
> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> index 6ce18aa..2cdb99e 100644
> --- a/libavcodec/v210dec.c
> +++ b/libavcodec/v210dec.c
> @@ -28,6 +28,7 @@
>  #include "libavutil/internal.h"
>  #include "libavutil/mem.h"
>  #include "libavutil/intreadwrite.h"
> +#include "thread.h"
>  
>  #define READ_PIXELS(a, b, c) \
>  do { \
> @@ -37,6 +38,13 @@
>  *c++ = (val >> 20) & 0x3FF;  \
>  } while (0)
>  
> +#define MAX_SLICES 32
> +typedef struct ThreadData {
> +AVFrame *frame;
> +uint8_t *buf;
> +int stride;
> +} ThreadData;
> +
>  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t 
> *u, uint16_t *v, int width)
>  {
>  uint32_t val;
> @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext *avctx)
>  s->aligned_input = 0;
>  ff_v210dec_init(s);
>  
> +s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES);
>  return 0;
>  }
>  
> -static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
> -AVPacket *avpkt)
> +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, 
> int nb_jobs)
>  {
>  V210DecContext *s = avctx->priv_data;
> -
> -int h, w, ret, stride, aligned_input;
> -AVFrame *pic = data;
> -const uint8_t *psrc = avpkt->data;
> +int h, w;
> +ThreadData *td = arg;
> +AVFrame *frame = td->frame;
> +int stride = td->stride;
> +int slice_h = avctx->height / s->slice_count;
> +int slice_m = avctx->height % s->slice_count;
> +int slice_start = jobnr * slice_h;
> +int slice_end = slice_start + slice_h;
> +const uint8_t *psrc = td->buf + stride * slice_start;
>  uint16_t *y, *u, *v;
>  
> -if (s->custom_stride )
> -stride = s->custom_stride;
> -else {
> -int aligned_width = ((avctx->width + 47) / 48) * 48;
> -stride = aligned_width * 8 / 3;
> -}
> -
> -if (avpkt->size < stride * avctx->height) {
> -if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == 
> avpkt->size) {
> -stride = avpkt->size / avctx->height;
> -if (!s->stride_warning_shown)
> -av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small 
> padding (64 byte) detected\n");
> -s->stride_warning_shown = 1;
> -} else {
> -av_log(avctx, AV_LOG_ERROR, "packet too small\n");
> -return AVERROR_INVALIDDATA;
> -}
> -}
> -if (avctx->codec_tag == MKTAG('C', '2', '1', '0')
> -&& AV_RN32(psrc) == AV_RN32("INFO")
> -&& avpkt->size - 64 >= stride * avctx->height)
> -psrc += 64;
> -
> -aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f);
> -if (aligned_input != s->aligned_input) {
> -s->aligned_input = aligned_input;
> -ff_v210dec_init(s);
> -}
> -
> -if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
> -return ret;
> -
> -y = (uint16_t*)pic->data[0];
> -u = (uint16_t*)pic->data[1];
> -v = (uint16_t*)pic->data[2];
> -pic->pict_type = AV_PICTURE_TYPE_I;
> -pic->key_frame = 1;
> +/* add the remaining slice for the last job */
> +if (jobnr == s->slice_count - 1)
> +slice_end += slice_m;
>  
> -for (h = 0; h < avctx->height; h++) {
> +y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2;
> +u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2;
> +v = (uint16_t*)fr