Re: [FFmpeg-devel] [PATCH] avcodec/prores_ks reduce twice fdct calls
Do we want to apply this? On 1/4/19, Alex Mogurenko wrote: > ping > > On Mon, Dec 31, 2018 at 11:02 PM Alex Mogurenko wrote: > >> thanks, >> just sent new patch >> >> On Mon, Dec 31, 2018 at 5:56 PM Derek Buitenhuis < >> derek.buitenh...@gmail.com> wrote: >> >>> On 31/12/2018 09:12, Alex Mogurenko wrote: >>> > I seems to be lame as failed to find how to run fate to check prores_ks >>> :( >>> >>> General way to run all of FATE: >>> >>> $ configure --samples=/home/path/for/samples/ --enable-gpl [...] >>> $ make fate-rsync >>> $ make fate >>> >>> There are sub-targets, too, as listed in Michaels mail, like >>> fate-vsynth1. >>> >>> - Derek >>> ___ >>> ffmpeg-devel mailing list >>> ffmpeg-devel@ffmpeg.org >>> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >>> >> > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] avcodec/prores_ks reduce twice fdct calls
ping On Mon, Dec 31, 2018 at 11:02 PM Alex Mogurenko wrote: > thanks, > just sent new patch > > On Mon, Dec 31, 2018 at 5:56 PM Derek Buitenhuis < > derek.buitenh...@gmail.com> wrote: > >> On 31/12/2018 09:12, Alex Mogurenko wrote: >> > I seems to be lame as failed to find how to run fate to check prores_ks >> :( >> >> General way to run all of FATE: >> >> $ configure --samples=/home/path/for/samples/ --enable-gpl [...] >> $ make fate-rsync >> $ make fate >> >> There are sub-targets, too, as listed in Michaels mail, like fate-vsynth1. >> >> - Derek >> ___ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avcodec/prores_ks reduce twice fdct calls
Thanks, I just sent a new patch. On Mon, Dec 31, 2018 at 5:56 PM Derek Buitenhuis wrote: > On 31/12/2018 09:12, Alex Mogurenko wrote: > > I seems to be lame as failed to find how to run fate to check prores_ks > :( > > General way to run all of FATE: > > $ configure --samples=/home/path/for/samples/ --enable-gpl [...] > $ make fate-rsync > $ make fate > > There are sub-targets, too, as listed in Michaels mail, like fate-vsynth1. > > - Derek > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avcodec/prores_ks reduce twice fdct calls
fdct done twice for each block. first time during quant calculation, second during slice encoding. so if we pre-save dct coefficients no need to do fdct second time. disadvantages: requires more memory advantages: improves performance ~4-5% --- libavcodec/proresenc_kostya.c | 74 --- 1 file changed, 52 insertions(+), 22 deletions(-) diff --git a/libavcodec/proresenc_kostya.c b/libavcodec/proresenc_kostya.c index e045a972f1..d2f81e73f4 100644 --- a/libavcodec/proresenc_kostya.c +++ b/libavcodec/proresenc_kostya.c @@ -219,7 +219,6 @@ struct TrellisNode { #define MAX_STORED_Q 16 typedef struct ProresThreadData { -DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE]; DECLARE_ALIGNED(16, uint16_t, emu_buf)[16 * 16]; int16_t custom_q[64]; int16_t custom_chroma_q[64]; @@ -228,7 +227,6 @@ typedef struct ProresThreadData { typedef struct ProresContext { AVClass *class; -DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE]; DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16]; int16_t quants[MAX_STORED_Q][64]; int16_t quants_chroma[MAX_STORED_Q][64]; @@ -237,6 +235,7 @@ typedef struct ProresContext { const uint8_t *quant_mat; const uint8_t *quant_chroma_mat; const uint8_t *scantable; +int16_t *blocks[MAX_PLANES]; void (*fdct)(FDCTDSPContext *fdsp, const uint16_t *src, ptrdiff_t linesize, int16_t *block); @@ -562,6 +561,8 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic, int plane_factor, is_chroma; uint16_t *qmat; uint16_t *qmat_chroma; +int16_t *blocks; +DECLARE_ALIGNED(16, int16_t, dct_blocks)[16 * 16 * MAX_MBS_PER_SLICE]; if (ctx->pictures_per_frame == 1) line_add = 0; @@ -604,28 +605,38 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic, src = (const uint16_t*)(pic->data[i] + yp * linesize + line_add * pic->linesize[i]) + xp; +if (!ctx->force_quant) { +blocks = ctx->blocks[i] + (y * ctx->slices_width * ctx->mbs_per_slice + x) * 16 * 16; +} else { +blocks = dct_blocks; +} + if (i < 3) { 
-get_slice_data(ctx, src, linesize, xp, yp, - pwidth, avctx->height / ctx->pictures_per_frame, - ctx->blocks[0], ctx->emu_buf, - mbs_per_slice, num_cblocks, is_chroma); +if (ctx->force_quant) { +get_slice_data(ctx, src, linesize, xp, yp, + pwidth, avctx->height / ctx->pictures_per_frame, + blocks, ctx->emu_buf, + mbs_per_slice, num_cblocks, is_chroma); +} if (!is_chroma) {/* luma quant */ sizes[i] = encode_slice_plane(ctx, pb, src, linesize, - mbs_per_slice, ctx->blocks[0], + mbs_per_slice, blocks, num_cblocks, plane_factor, qmat); } else { /* chroma plane */ sizes[i] = encode_slice_plane(ctx, pb, src, linesize, - mbs_per_slice, ctx->blocks[0], + mbs_per_slice, blocks, num_cblocks, plane_factor, qmat_chroma); } } else { -get_alpha_data(ctx, src, linesize, xp, yp, - pwidth, avctx->height / ctx->pictures_per_frame, - ctx->blocks[0], mbs_per_slice, ctx->alpha_bits); +if (ctx->force_quant) { +get_alpha_data(ctx, src, linesize, xp, yp, + pwidth, avctx->height / ctx->pictures_per_frame, + blocks, mbs_per_slice, ctx->alpha_bits); +} sizes[i] = encode_alpha_plane(ctx, pb, mbs_per_slice, - ctx->blocks[0], quant); + blocks, quant); } total_size += sizes[i]; if (put_bits_left(pb) < 0) { @@ -730,15 +741,15 @@ static int estimate_slice_plane(ProresContext *ctx, int *error, int plane, const uint16_t *src, ptrdiff_t linesize, int mbs_per_slice, int blocks_per_mb, int plane_size_factor, -const int16_t *qmat, ProresThreadData *td) +const int16_t *qmat, int16_t *blocks) { int blocks_per_slice; int bits; blocks_per_slice = mbs_per_slice * blocks_per_mb; -bits =
Re: [FFmpeg-devel] [PATCH] avcodec/prores_ks reduce twice fdct calls
On 31/12/2018 09:12, Alex Mogurenko wrote: > I seems to be lame as failed to find how to run fate to check prores_ks :( The general way to run all of FATE: $ configure --samples=/home/path/for/samples/ --enable-gpl [...] $ make fate-rsync $ make fate There are sub-targets, too, as listed in Michael's mail, like fate-vsynth1. - Derek ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avcodec/prores_ks reduce twice fdct calls
I seem to be lame, as I failed to find out how to run FATE to check prores_ks :( On Mon, Dec 31, 2018 at 4:46 AM Michael Niedermayer wrote: > On Sun, Dec 30, 2018 at 10:57:23PM +0200, Alex Mogurenko wrote: > > fdct done twice for each block. first time during quant calculation, > second during slice encoding. so if we pre-save dct coefficients no need to > do fdct second time. > > disadvantages: requires more memory > > advantages: improves performance ~4-5% > > --- > > libavcodec/proresenc_kostya.c | 74 --- > > 1 file changed, 52 insertions(+), 22 deletions(-) > > breaks fate > > TESTvsynth1-prores_ks > --- ./tests/ref/vsynth/vsynth1-prores_ks2018-12-28 > 17:54:41.361383177 +0100 > +++ tests/data/fate/vsynth1-prores_ks 2018-12-31 03:45:17.207673799 +0100 > @@ -1,4 +1,4 @@ > -fe41a284da97ea5ec8866ca9a55b84da *tests/data/fate/vsynth1-prores_ks.mov > -3858911 tests/data/fate/vsynth1-prores_ks.mov > -100eb002413fe7a632d440dfbdf7e3ff > *tests/data/fate/vsynth1-prores_ks.out.rawvideo > -stddev:3.17 PSNR: 38.09 MAXDIFF: 39 bytes: 7603200/ 7603200 > +ba6294d95b96f032b90f804f112ab98a *tests/data/fate/vsynth1-prores_ks.mov > +3867422 tests/data/fate/vsynth1-prores_ks.mov > +e85510eadb1ff115e85c480d8e1011a4 > *tests/data/fate/vsynth1-prores_ks.out.rawvideo > +stddev: 19.38 PSNR: 22.38 MAXDIFF: 210 bytes: 7603200/ 7603200 > Test vsynth1-prores_ks failed. Look at > tests/data/fate/vsynth1-prores_ks.err for details. > make: *** [fate-vsynth1-prores_ks] Error 1 > > [...] > -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > "You are 36 times more likely to die in a bathtub than at the hands of a > terrorist. Also, you are 2.5 times more likely to become a president and > 2 times more likely to become an astronaut, than to die in a terrorist > attack." 
-- Thoughty2 > > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avcodec/prores_ks reduce twice fdct calls
On Sun, Dec 30, 2018 at 10:57:23PM +0200, Alex Mogurenko wrote: > fdct done twice for each block. first time during quant calculation, second > during slice encoding. so if we pre-save dct coefficients no need to do fdct > second time. > disadvantages: requires more memory > advantages: improves performance ~4-5% > --- > libavcodec/proresenc_kostya.c | 74 --- > 1 file changed, 52 insertions(+), 22 deletions(-) breaks fate TESTvsynth1-prores_ks --- ./tests/ref/vsynth/vsynth1-prores_ks2018-12-28 17:54:41.361383177 +0100 +++ tests/data/fate/vsynth1-prores_ks 2018-12-31 03:45:17.207673799 +0100 @@ -1,4 +1,4 @@ -fe41a284da97ea5ec8866ca9a55b84da *tests/data/fate/vsynth1-prores_ks.mov -3858911 tests/data/fate/vsynth1-prores_ks.mov -100eb002413fe7a632d440dfbdf7e3ff *tests/data/fate/vsynth1-prores_ks.out.rawvideo -stddev:3.17 PSNR: 38.09 MAXDIFF: 39 bytes: 7603200/ 7603200 +ba6294d95b96f032b90f804f112ab98a *tests/data/fate/vsynth1-prores_ks.mov +3867422 tests/data/fate/vsynth1-prores_ks.mov +e85510eadb1ff115e85c480d8e1011a4 *tests/data/fate/vsynth1-prores_ks.out.rawvideo +stddev: 19.38 PSNR: 22.38 MAXDIFF: 210 bytes: 7603200/ 7603200 Test vsynth1-prores_ks failed. Look at tests/data/fate/vsynth1-prores_ks.err for details. make: *** [fate-vsynth1-prores_ks] Error 1 [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB "You are 36 times more likely to die in a bathtub than at the hands of a terrorist. Also, you are 2.5 times more likely to become a president and 2 times more likely to become an astronaut, than to die in a terrorist attack." -- Thoughty2 signature.asc Description: PGP signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avcodec/prores_ks reduce twice fdct calls
fdct done twice for each block. first time during quant calculation, second during slice encoding. so if we pre-save dct coefficients no need to do fdct second time. disadvantages: requires more memory advantages: improves performance ~4-5% --- libavcodec/proresenc_kostya.c | 74 --- 1 file changed, 52 insertions(+), 22 deletions(-) diff --git a/libavcodec/proresenc_kostya.c b/libavcodec/proresenc_kostya.c index e045a972f1..4d49d6521a 100644 --- a/libavcodec/proresenc_kostya.c +++ b/libavcodec/proresenc_kostya.c @@ -219,7 +219,6 @@ struct TrellisNode { #define MAX_STORED_Q 16 typedef struct ProresThreadData { -DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE]; DECLARE_ALIGNED(16, uint16_t, emu_buf)[16 * 16]; int16_t custom_q[64]; int16_t custom_chroma_q[64]; @@ -228,7 +227,6 @@ typedef struct ProresThreadData { typedef struct ProresContext { AVClass *class; -DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE]; DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16]; int16_t quants[MAX_STORED_Q][64]; int16_t quants_chroma[MAX_STORED_Q][64]; @@ -237,6 +235,7 @@ typedef struct ProresContext { const uint8_t *quant_mat; const uint8_t *quant_chroma_mat; const uint8_t *scantable; +int16_t *blocks[MAX_PLANES]; void (*fdct)(FDCTDSPContext *fdsp, const uint16_t *src, ptrdiff_t linesize, int16_t *block); @@ -562,6 +561,8 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic, int plane_factor, is_chroma; uint16_t *qmat; uint16_t *qmat_chroma; +int16_t *blocks; +DECLARE_ALIGNED(16, int16_t, dct_blocks)[16 * 16 * MAX_MBS_PER_SLICE]; if (ctx->pictures_per_frame == 1) line_add = 0; @@ -604,28 +605,38 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic, src = (const uint16_t*)(pic->data[i] + yp * linesize + line_add * pic->linesize[i]) + xp; +if (!ctx->force_quant) { +blocks = ctx->blocks[i] + (y * ctx->slices_width + x / ctx->mbs_per_slice) * 16 * 16 * ctx->mbs_per_slice; +} else { +blocks = dct_blocks; +} + 
if (i < 3) { -get_slice_data(ctx, src, linesize, xp, yp, - pwidth, avctx->height / ctx->pictures_per_frame, - ctx->blocks[0], ctx->emu_buf, - mbs_per_slice, num_cblocks, is_chroma); +if (ctx->force_quant) { +get_slice_data(ctx, src, linesize, xp, yp, + pwidth, avctx->height / ctx->pictures_per_frame, + blocks, ctx->emu_buf, + mbs_per_slice, num_cblocks, is_chroma); +} if (!is_chroma) {/* luma quant */ sizes[i] = encode_slice_plane(ctx, pb, src, linesize, - mbs_per_slice, ctx->blocks[0], + mbs_per_slice, blocks, num_cblocks, plane_factor, qmat); } else { /* chroma plane */ sizes[i] = encode_slice_plane(ctx, pb, src, linesize, - mbs_per_slice, ctx->blocks[0], + mbs_per_slice, blocks, num_cblocks, plane_factor, qmat_chroma); } } else { -get_alpha_data(ctx, src, linesize, xp, yp, - pwidth, avctx->height / ctx->pictures_per_frame, - ctx->blocks[0], mbs_per_slice, ctx->alpha_bits); +if (ctx->force_quant) { +get_alpha_data(ctx, src, linesize, xp, yp, + pwidth, avctx->height / ctx->pictures_per_frame, + blocks, mbs_per_slice, ctx->alpha_bits); +} sizes[i] = encode_alpha_plane(ctx, pb, mbs_per_slice, - ctx->blocks[0], quant); + blocks, quant); } total_size += sizes[i]; if (put_bits_left(pb) < 0) { @@ -730,15 +741,15 @@ static int estimate_slice_plane(ProresContext *ctx, int *error, int plane, const uint16_t *src, ptrdiff_t linesize, int mbs_per_slice, int blocks_per_mb, int plane_size_factor, -const int16_t *qmat, ProresThreadData *td) +const int16_t *qmat, int16_t *blocks) { int blocks_per_slice; int bits; blocks_per_slice = mbs_per_slice * blocks_per_mb; -bits