Thanks, this is certainly an enhancement to x265 lookahead. We would be interested in this - especially if you can also include some efficiency (bitrate vs SSIM) metrics that describe the penalty of moving from X265_LOWRES_SCALE of 4 to higher scales.
On Mon, Jul 21, 2014 at 8:49 PM, Nicolas Morey-Chaisemartin < nmo...@kalray.eu> wrote: > Hi, > > We recently profiled x265 pre-analysis to estimate what performance we > could reach using our accelerator and I was quite disappointed by the > performance. > When running on a Core-i7 with AVX at roughly 2.7GHz, we barely reached > the 30fps mark using ultrafast preset on a 4K video. > > After a little bit of browsing I realized that work in LowRes is always > done at 1/4th of the final resolution which seems fair but requires a huge > amount of work for 4K. > It seemed straightforward enough to change the divider at LowRes > initialization but it seems there are a lot of hard-coded values that > depend both on the LowRes divider and the LowRes CU Size. > > Here's a patch (definitely not applicable like this but just to give an > idea of where I'm going) that seems to fix most of the hard-coded values. > It still works with a X265_LOWRES_SCALE of 4 and the perf is definitely > improving (29fps => 40fps on a 2048x1024 medium preset on a E5504). > > Would you be interested in a clean version of this? At least the > hard-coded CU_SIZE part? > IMHO it would be better to have "dynamic" value for LowRes depending on > preset (or equivalent) and the input resolution... > 1/4th is fast enough in HD not to be an issue but for RT stream in 4K or > more, 1/16 will be compulsory. 
> > Nicolas > > --- > x265/source/common/common.h | 1 + > x265/source/common/lowres.cpp | 4 ++-- > x265/source/encoder/frameencoder.cpp | 7 ++++--- > x265/source/encoder/ratecontrol.cpp | 16 ++++++++-------- > x265/source/encoder/slicetype.cpp | 8 ++++---- > 5 files changed, 19 insertions(+), 17 deletions(-) > > diff --git a/x265/source/common/common.h b/x265/source/common/common.h > index 06f60e7..00e73fc 100644 > --- a/x265/source/common/common.h > +++ b/x265/source/common/common.h > @@ -156,6 +156,7 @@ typedef int32_t coeff_t; // transform coefficient > // high cost estimates (intra and inter both suffer) > #define X265_LOWRES_CU_SIZE 8 > #define X265_LOWRES_CU_BITS 3 > +#define X265_LOWRES_SCALE 2 > #define X265_MALLOC(type, count) (type*)x265_malloc(sizeof(type) * > (count)) > #define X265_FREE(ptr) x265_free(ptr) > diff --git a/x265/source/common/lowres.cpp b/x265/source/common/lowres.cpp > index 5fc2f6b..6138023 100644 > --- a/x265/source/common/lowres.cpp > +++ b/x265/source/common/lowres.cpp > @@ -31,8 +31,8 @@ bool Lowres::create(TComPicYuv *orig, int _bframes, bool > bAQEnabled) > { > isLowres = true; > bframes = _bframes; > - width = orig->getWidth() / 2; > - lines = orig->getHeight() / 2; > + width = orig->getWidth() / X265_LOWRES_SCALE; > + lines = orig->getHeight() / X265_LOWRES_SCALE; > lumaStride = width + 2 * orig->getLumaMarginX(); > if (lumaStride & 31) > lumaStride += 32 - (lumaStride & 31); > diff --git a/x265/source/encoder/frameencoder.cpp b/x265/source/encoder/ > frameencoder.cpp > index 8c3ee26..7213f60 100644 > --- a/x265/source/encoder/frameencoder.cpp > +++ b/x265/source/encoder/frameencoder.cpp > @@ -1300,9 +1300,10 @@ int FrameEncoder::calcQpForCu(uint32_t cuAddr, > double baseQp) > /* Derive qpOffet for each CU by averaging offsets for all 16x16 > blocks in the cu. 
*/ > double qp_offset = 0; > - int maxBlockCols = (m_frame->getPicYuvOrg()->getWidth() + (16 - 1)) > / 16; > - int maxBlockRows = (m_frame->getPicYuvOrg()->getHeight() + (16 - 1)) > / 16; > - int noOfBlocks = g_maxCUSize / 16; > + int lowResCu = (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE); > + int maxBlockCols = (m_frame->getPicYuvOrg()->getWidth() + (lowResCu > - 1)) / lowResCu; > + int maxBlockRows = (m_frame->getPicYuvOrg()->getHeight() + (lowResCu > - 1)) / lowResCu; > + int noOfBlocks = g_maxCUSize / lowResCu; > int block_y = (cuAddr / m_frame->getPicSym()->getFrameWidthInCU()) * > noOfBlocks; > int block_x = (cuAddr * noOfBlocks) - block_y * m_frame->getPicSym()-> > getFrameWidthInCU(); > diff --git a/x265/source/encoder/ratecontrol.cpp b/x265/source/encoder/ > ratecontrol.cpp > index 4358994..5fcc27a 100644 > --- a/x265/source/encoder/ratecontrol.cpp > +++ b/x265/source/encoder/ratecontrol.cpp > @@ -161,8 +161,8 @@ void RateControl::calcAdaptiveQuantFrame(Frame *pic) > if (m_param->rc.aqMode == X265_AQ_NONE || m_param->rc.aqStrength == 0) > { > /* Need to init it anyways for CU tree */ > - int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) >> > X265_LOWRES_CU_BITS; > - int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) >> > X265_LOWRES_CU_BITS; > + int cuWidth = ((maxCol / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE > - 1) >> X265_LOWRES_CU_BITS; > + int cuHeight = ((maxRow / X265_LOWRES_SCALE) + > X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; > int cuCount = cuWidth * cuHeight; > if (m_param->rc.aqMode && m_param->rc.aqStrength == 0) > @@ -194,9 +194,9 @@ void RateControl::calcAdaptiveQuantFrame(Frame *pic) > if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE) > { > double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5); > - for (block_y = 0; block_y < maxRow; block_y += 16) > + for (block_y = 0; block_y < maxRow; block_y += > (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE)) > { > - for (block_x = 0; block_x < maxCol; block_x += 16) > + for (block_x = 
0; block_x < maxCol; block_x += > (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE)) > { > uint32_t energy = acEnergyCu(pic, block_x, block_y); > qp_adj = pow(energy + 1, 0.1); > @@ -216,9 +216,9 @@ void RateControl::calcAdaptiveQuantFrame(Frame *pic) > strength = m_param->rc.aqStrength * 1.0397f; > block_xy = 0; > - for (block_y = 0; block_y < maxRow; block_y += 16) > + for (block_y = 0; block_y < maxRow; block_y += > (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE)) > { > - for (block_x = 0; block_x < maxCol; block_x += 16) > + for (block_x = 0; block_x < maxCol; block_x += > (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE)) > { > if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE) > { > @@ -260,8 +260,8 @@ void RateControl::calcAdaptiveQuantFrame(Frame *pic) > RateControl::RateControl(x265_param *p) > { > m_param = p; > - int lowresCuWidth = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE > - 1) >> X265_LOWRES_CU_BITS; > - int lowresCuHeight = ((m_param->sourceHeight / 2) + > X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; > + int lowresCuWidth = ((m_param->sourceWidth / X265_LOWRES_SCALE) + > X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; > + int lowresCuHeight = ((m_param->sourceHeight / X265_LOWRES_SCALE) + > X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; > m_ncu = lowresCuWidth * lowresCuHeight; > if (m_param->rc.cuTree) > diff --git a/x265/source/encoder/slicetype.cpp b/x265/source/encoder/ > slicetype.cpp > index 34d0b3b..4a2f2cb 100644 > --- a/x265/source/encoder/slicetype.cpp > +++ b/x265/source/encoder/slicetype.cpp > @@ -65,8 +65,8 @@ Lookahead::Lookahead(x265_param *param, ThreadPool* > pool) > m_lastNonB = NULL; > m_bFilling = true; > m_bFlushed = false; > - m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) > >> X265_LOWRES_CU_BITS; > - m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - > 1) >> X265_LOWRES_CU_BITS; > + m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + > X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; > + 
m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + > X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; > m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int)); > memset(m_histogram, 0, sizeof(m_histogram)); > } > @@ -1201,8 +1201,8 @@ CostEstimate::~CostEstimate() > void CostEstimate::init(x265_param *_param, Frame *pic) > { > m_param = _param; > - m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) > >> X265_LOWRES_CU_BITS; > - m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - > 1) >> X265_LOWRES_CU_BITS; > + m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + > X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; > + m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + > X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; > m_rows = new EstimateRow[m_heightInCU]; > for (int i = 0; i < m_heightInCU; i++) > _______________________________________________ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel >
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel