On Fri, Oct 18, 2013 at 5:43 AM, Deepthi Devaki Akkoorath < [email protected]> wrote:
> > > > On Fri, Oct 18, 2013 at 5:22 AM, Steve Borho <[email protected]> wrote: > >> >> >> >> On Thu, Oct 17, 2013 at 5:39 AM, <[email protected]>wrote: >> >>> # HG changeset patch >>> # User Deepthi Devaki <[email protected]> >>> # Date 1382006045 -19800 >>> # Node ID 677d71fd793959c7e98df56f99f0b02a89fe050d >>> # Parent fc9dbd798ac37ec1acc0596aa179f0deb586c092 >>> Lookahead: Implement wavefront parallel processing >>> >> >> this looks good; I think you have the general idea down. It can be >> simplified a great deal, comments below >> >> >>> diff -r fc9dbd798ac3 -r 677d71fd7939 source/common/wavefront.h >>> --- a/source/common/wavefront.h Thu Oct 17 14:14:40 2013 +0530 >>> +++ b/source/common/wavefront.h Thu Oct 17 16:04:05 2013 +0530 >>> @@ -47,11 +47,6 @@ >>> >>> int m_numRows; >>> >>> - // WaveFront's implementation of JobProvider::findJob. Consults >>> - // m_queuedBitmap and calls ProcessRow(row) for lowest numbered >>> queued row >>> - // or returns false >>> - bool findJob(); >>> - >>> public: >>> >>> WaveFront(ThreadPool *pool) : JobProvider(pool), m_queuedBitmap(0) >>> {} >>> @@ -69,6 +64,11 @@ >>> >>> void clearEnabledRowMask(); >>> >>> + // WaveFront's implementation of JobProvider::findJob. Consults >>> + // m_queuedBitmap and calls ProcessRow(row) for lowest numbered >>> queued row >>> + // or returns false >>> + bool findJob(); >>> + >>> >> >> I think this is unnecessary if you refer to your own findJob() method >> (this->findJob()) >> >> > > Do you mean to write separate findJob() for Lookahead? But then it will be > a replica of Wavefront::findjob() > I was confused and thought you were defining your own findJob(); disregard this comment > > >> // Returns true if a row above curRow is available for processing. >>> The processRow() >>> // method may call this function periodically and voluntarily exit >>> bool checkHigherPriorityRow(int curRow); >>> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/encoder.cpp >>> --- a/source/encoder/encoder.cpp Thu Oct 17 14:14:40 2013 +0530 >>> +++ b/source/encoder/encoder.cpp Thu Oct 17 16:04:05 2013 +0530 >>> @@ -92,6 +92,7 @@ >>> } >>> } >>> m_lookahead = new Lookahead(this); >>> + m_lookahead->setThreadPool(m_threadPool); >>> m_dpb = new DPB(this); >>> m_rateControl = new RateControl(this); >>> } >>> @@ -144,6 +145,7 @@ >>> m_frameEncoder[i].init(this, numRows); >>> } >>> } >>> + m_lookahead->init(); >>> } >>> >>> int Encoder::getStreamHeaders(NALUnitEBSP **nalunits) >>> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.cpp >>> --- a/source/encoder/slicetype.cpp Thu Oct 17 14:14:40 2013 +0530 >>> +++ b/source/encoder/slicetype.cpp Thu Oct 17 16:04:05 2013 +0530 >>> @@ -62,29 +62,67 @@ >>> dst.y = median(a.y, b.y, c.y); >>> } >>> >>> -Lookahead::Lookahead(TEncCfg *_cfg) >>> +Lookahead::Lookahead(TEncCfg *_cfg) : WaveFront(NULL) >>> >> >> Pass the thread pool pointer as a second argument and initialize >> WaveFront with it; then you don't need the setThreadPool() method >> >> >>> { >>> this->cfg = _cfg; >>> numDecided = 0; >>> lastKeyframe = -cfg->param.keyframeMax; >>> lastNonB = NULL; >>> - predictions = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8); >>> - me.setQP(X265_LOOKAHEAD_QP); >>> - me.setSearchMethod(X265_HEX_SEARCH); >>> - me.setSubpelRefine(1); >>> merange = 16; >>> widthInCU = ((cfg->param.sourceWidth / 2) + X265_LOWRES_CU_SIZE - >>> 1) >> X265_LOWRES_CU_BITS; >>> heightInCU = ((cfg->param.sourceHeight / 2) + X265_LOWRES_CU_SIZE - >>> 1) >> X265_LOWRES_CU_BITS; >>> + >>> + lock = new Lock[heightInCU]; >>> + rowme = new MotionEstimate[heightInCU]; >>> + >>> + active = (bool*)X265_MALLOC(bool, heightInCU); >>> + completed = (uint32_t*)X265_MALLOC(uint32_t, heightInCU); >>> + rowpredictions = (pixel**)X265_MALLOC(pixel*, heightInCU); >>> + rowcostEst = (int*)X265_MALLOC(int, heightInCU); >>> + rowcostIntra = (int*)X265_MALLOC(int, heightInCU); >>> + rowIntraMbs = (int*)X265_MALLOC(int, heightInCU); >>> + for (int i = 0; i < heightInCU; i++) >>> + { >>> + rowme[i].setQP(X265_LOOKAHEAD_QP); >>> + rowme[i].setSearchMethod(X265_HEX_SEARCH); >>> + rowme[i].setSubpelRefine(1); >>> + rowpredictions[i] = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8); >>> + rowcostEst[i] = 0; >>> + rowcostIntra[i] = 0; >>> + rowIntraMbs[i] = 0; >>> + } >>> >> >> This would be much cleaner as a per-row struct similar to CTURow >> (LookaheadRow?) >> >> >>> } >>> >>> Lookahead::~Lookahead() >>> { >>> } >>> >>> +void Lookahead::init() >>> +{ >>> + if (!WaveFront::init(heightInCU)) >>> + { >>> + m_pool = NULL; >>> + } >>> + memset((void*)completed, 0, heightInCU * sizeof(uint32_t)); >>> + memset((void*)active, 0, heightInCU * sizeof(bool)); >>> >> >> these memsets are unnecessary >> >> >>> +} >>> + >>> void Lookahead::destroy() >>> { >>> - if (predictions) >>> - X265_FREE(predictions); >>> + delete[] lock; >>> + delete[] rowme; >>> + >>> + X265_FREE((void*)active); >>> + X265_FREE((void*)completed); >>> + for (int i = 0; i < heightInCU; i++) >>> + { >>> + X265_FREE(rowpredictions[i]); >>> + } >>> + >>> + X265_FREE(rowpredictions); >>> + X265_FREE(rowcostEst); >>> + X265_FREE(rowcostIntra); >>> + X265_FREE(rowIntraMbs); >>> >>> // these two queues will be empty, unless the encode was aborted >>> while (!inputQueue.empty()) >>> @@ -165,9 +203,12 @@ >>> int Lookahead::estimateFrameCost(int p0, int p1, int b, bool >>> bIntraPenalty) >>> { >>> int score = 0; >>> - bool bDoSearch[2]; >>> Lowres *fenc = frames[b]; >>> >>> + curb = b; >>> + curp0 = p0; >>> + curp1 = p1; >>> + >>> if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 >>> - b][0] != -1) >>> score = fenc->costEst[b - p0][p1 - b]; >>> else >>> @@ -185,23 +226,58 @@ >>> * predictors in the main encode. This considerably improves MV >>> * prediction overall. */ >>> // TODO: use lowres MVs as motion candidates in full-res search >>> - me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride); >>> - for (int j = heightInCU - 1; j >= 0; j--) >>> + >>> + for (int i = 0; i < heightInCU; i++) >>> { >>> - if (!fenc->bIntraCalculated) >>> - fenc->rowSatds[0][0][j] = 0; >>> - fenc->rowSatds[b - p0][p1 - b][j] = 0; >>> + rowme[i].setSourcePlane(fenc->lowresPlane[0], >>> fenc->lumaStride); >>> + } >>> >>> - for (int i = widthInCU - 1; i >= 0; i--) >>> + memset((void*)completed, 0, heightInCU * sizeof(uint32_t)); >>> + memset((void*)active, 0, heightInCU * sizeof(bool)); >>> + memset((void*)rowcostEst, 0, heightInCU * sizeof(int)); >>> + memset((void*)rowcostIntra, 0, heightInCU * sizeof(int)); >>> + memset((void*)rowIntraMbs, 0, heightInCU * sizeof(int)); >>> + rowsCompleted = false; >>> + >>> + if (m_pool && cfg->param.bEnableWavefront) >>> >> >> checking bEnableWavefront is unnecessary here; we can always use >> wave-front lookahead scheduling so long as we have a thread pool. >> >> >>> + { >>> + WaveFront::clearEnabledRowMask(); >>> >> >> We need another method which just sets the enabledRowMask to all 1s >> because it's only purpose is to track reference frame dependencies and we >> have none here. >> >> >>> + WaveFront::enqueue(); >>> + >>> + for (int row = 0; row < heightInCU; row++) >>> { >>> - estimateCUCost(i, j, p0, p1, b, bDoSearch); >>> + enableRow(row); >>> + if (row == 0) >>> + enqueueRow(0); >>> + else >>> + m_pool->pokeIdleThread(); >>> + } >>> >> >> this whole loop is unnecessary (no frame dependencies), just use >> equeueRow(0) >> >> >>> + >>> + while (!rowsCompleted) >>> + { >>> + WaveFront::findJob(); >>> + } >>> + >>> + WaveFront::dequeue(); >>> + } >>> + else >>> + { >>> + for (int row = 0; row < heightInCU; row++) >>> + { >>> + processRow(row); >>> } >>> } >>> >>> + //Accumulate cost from each row >>> >> nit: space after // >> >>> + for (int row = 0; row < heightInCU; row++) >>> + { >>> + score += rowcostEst[row]; >>> + fenc->costEst[0][0] += rowcostIntra[row]; >>> + fenc->intraMbs[b - p0] += rowIntraMbs[row]; >>> >> >> score must be saved in fenc->costEst[b - p0][p1 - b]; >> >> > > Yes. It is stored later. > > >> + } >>> + >>> fenc->bIntraCalculated = true; >>> >>> - score = fenc->costEst[b - p0][p1 - b]; >>> - >>> if (b != p1) >>> score = (uint64_t)score * 100 / (130 + >>> cfg->param.bFrameBias); >>> >>> @@ -218,7 +294,7 @@ >>> return score; >>> } >>> >>> -void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b, >>> bool bDoSearch[2]) >>> +void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b, >>> bool bDoSearch[2], int row) >>> >> >> perhaps this would be cleaner if it were a method of LookaheadRow >> >> >>> { >>> Lowres *fref0 = frames[p0]; >>> Lowres *fref1 = frames[p1]; >>> @@ -233,7 +309,7 @@ >>> const bool bFrameScoreCU = (cux > 0 && cux < widthInCU - 1 && >>> cuy > 0 && cuy < heightInCU - 1) || >>> widthInCU <= 2 || heightInCU <= 2; >>> >>> - me.setSourcePU(pelOffset, cuSize, cuSize); >>> + rowme[row].setSourcePU(pelOffset, cuSize, cuSize); >>> >>> MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY], >>> &fenc->lowresMvs[1][p1 - b - 1][cuXY] }; >>> @@ -241,7 +317,7 @@ >>> &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] }; >>> >>> MV mvmin, mvmax; >>> - int bcost = me.COST_MAX; >>> + int bcost = rowme[row].COST_MAX; >>> int listused = 0; >>> >>> // establish search bounds that don't cross extended frame >>> boundaries >>> @@ -286,7 +362,7 @@ >>> median_mv(mvp, mvc[0], mvc[1], mvc[2]); >>> } >>> >>> - *fenc_costs[i] = me.motionEstimate(i ? fref1 : fref0, >>> mvmin, mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]); >>> + *fenc_costs[i] = rowme[row].motionEstimate(i ? fref1 : >>> fref0, mvmin, mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]); >>> COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1); >>> } >>> if (bBidir) >>> @@ -388,23 +464,23 @@ >>> int predsize = cuSize * cuSize; >>> >>> // generate 35 intra predictions into tmp >>> - primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, predictions, >>> cuSize, cuSize, (cuSize <= 16)); >>> + primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, >>> rowpredictions[row], cuSize, cuSize, (cuSize <= 16)); >>> pixel *above = (cuSize >= 8) ? pAbove1 : pAbove0; >>> pixel *left = (cuSize >= 8) ? pLeft1 : pLeft0; >>> - primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + >>> 1, predictions + predsize, cuSize, cuSize); >>> - primitives.intra_pred_allangs[nLog2SizeMinus2](predictions + 2 >>> * predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16)); >>> + primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + >>> 1, rowpredictions[row] + predsize, cuSize, cuSize); >>> + >>> primitives.intra_pred_allangs[nLog2SizeMinus2](rowpredictions[row] + 2 * >>> predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16)); >>> >>> // calculate 35 satd costs, keep least cost >>> ALIGN_VAR_32(pixel, buf_trans[32 * 32]); >>> - primitives.transpose[nLog2SizeMinus2](buf_trans, me.fenc, >>> FENC_STRIDE); >>> + primitives.transpose[nLog2SizeMinus2](buf_trans, >>> rowme[row].fenc, FENC_STRIDE); >>> pixelcmp_t satd = primitives.satd[PartitionFromSizes(cuSize, >>> cuSize)]; >>> - int icost = me.COST_MAX, cost; >>> + int icost = rowme[row].COST_MAX, cost; >>> for (UInt mode = 0; mode < 35; mode++) >>> { >>> if ((mode >= 2) && (mode < 18)) >>> - cost = satd(buf_trans, cuSize, &predictions[mode * >>> predsize], cuSize); >>> + cost = satd(buf_trans, cuSize, >>> &rowpredictions[row][mode * predsize], cuSize); >>> else >>> - cost = satd(me.fenc, FENC_STRIDE, &predictions[mode * >>> predsize], cuSize); >>> + cost = satd(rowme[row].fenc, FENC_STRIDE, >>> &rowpredictions[row][mode * predsize], cuSize); >>> if (cost < icost) >>> icost = cost; >>> } >>> @@ -412,14 +488,14 @@ >>> // TOOD: i_icost += intra_penalty + lowres_penalty; >>> fenc->intraCost[cuXY] = icost; >>> fenc->rowSatds[0][0][cuy] += icost; >>> - if (bFrameScoreCU) fenc->costEst[0][0] += icost; >>> + if (bFrameScoreCU) rowcostIntra[row] += icost; >>> } >>> >>> if (!bBidir) >>> { >>> if (fenc->intraCost[cuXY] < bcost) >>> { >>> - if (bFrameScoreCU) fenc->intraMbs[b - p0]++; >>> + if (bFrameScoreCU) rowIntraMbs[row]++; >>> bcost = fenc->intraCost[cuXY]; >>> listused = 0; >>> } >>> @@ -429,7 +505,7 @@ >>> if (p0 != p1) >>> { >>> fenc->rowSatds[b - p0][p1 - b][cuy] += bcost; >>> - if (bFrameScoreCU) fenc->costEst[b - p0][p1 - b] += bcost; >>> + if (bFrameScoreCU) rowcostEst[row] += bcost; >>> } >>> fenc->lowresCosts[b - p0][p1 - b][cuXY] = >>> (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << >>> LOWRES_COST_SHIFT)); >>> } >>> @@ -547,6 +623,7 @@ >>> { >>> frames[i + 1] = &list[i]->m_lowres; >>> } >>> + >>> if (IS_X265_TYPE_I(frames[bframes + 1]->sliceType)) >>> p0 = bframes + 1; >>> else // P >>> @@ -914,7 +991,7 @@ >>> { >>> char paths[2][X265_LOOKAHEAD_MAX + 1]; >>> int num_paths = X265_MIN(cfg->param.bframes + 1, length); >>> - int best_cost = me.COST_MAX; >>> + int best_cost = rowme[0].COST_MAX; >>> int idx = 0; >>> >>> /* Iterate over all currently possible paths */ >>> @@ -993,3 +1070,41 @@ >>> >>> return cost; >>> } >>> + >>> +void Lookahead::processRow(int row) >>> +{ >>> + int realrow = heightInCU - 1 - row; >>> + Lowres *fenc = frames[curb]; >>> + >>> + if (!fenc->bIntraCalculated) >>> + fenc->rowSatds[0][0][realrow] = 0; >>> + fenc->rowSatds[curb - curp0][curp1 - curb][realrow] = 0; >>> + for (int i = widthInCU - 1 - completed[row]; i >= 0; i--) //Go >>> backwards >>> + { >>> + estimateCUCost(i, realrow, curp0, curp1, curb, bDoSearch, row); >>> + completed[row]++; >>> + >>> + if (completed[row] >= 2 && row < heightInCU - 1) >>> + { >>> + ScopedLock below(lock[row + 1]); >>> + if (active[row + 1] == false && >>> + completed[row + 1] + 2 <= completed[row]) >>> + { >>> + active[row + 1] = true; >>> + enqueueRow(row + 1); >>> + } >>> + } >>> + >>> + ScopedLock self(lock[row]); >>> + if (row > 0 && (int32_t)completed[row] < widthInCU - 1 && >>> completed[row - 1] < completed[row] + 2) >>> + { >>> + active[row] = false; >>> + return; >>> + } >>> + } >>> + >>> + if (row == heightInCU - 1) >>> + { >>> + rowsCompleted = true; >>> + } >>> +} >>> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.h >>> --- a/source/encoder/slicetype.h Thu Oct 17 14:14:40 2013 +0530 >>> +++ b/source/encoder/slicetype.h Thu Oct 17 16:04:05 2013 +0530 >>> @@ -26,7 +26,7 @@ >>> >>> #include "motion.h" >>> #include "piclist.h" >>> -#include "common.h" >>> +#include "wavefront.h" >>> >>> namespace x265 { >>> // private namespace >>> @@ -35,11 +35,9 @@ >>> class TComPic; >>> class TEncCfg; >>> >>> -struct Lookahead >>> +struct Lookahead : public WaveFront >>> { >>> - MotionEstimate me; >>> TEncCfg *cfg; >>> - pixel *predictions; // buffer for 35 intra predictions >>> Lowres *frames[X265_LOOKAHEAD_MAX]; >>> Lowres *lastNonB; >>> int merange; >>> @@ -51,9 +49,24 @@ >>> PicList inputQueue; // input pictures in order received >>> PicList outputQueue; // pictures to be encoded, in encode order >>> >>> + bool bDoSearch[2]; >>> + int curb, curp0, curp1; >>> + bool rowsCompleted; >>> + >>> + //For wavefront parallelism >>> + Lock* lock; >>> + volatile bool* active; >>> + volatile uint32_t* completed; >>> + pixel** rowpredictions; // buffer for 35 intra >>> predictions >>> + MotionEstimate* rowme; >>> + int* rowcostEst; >>> + int* rowcostIntra; >>> + int* rowIntraMbs; >>> + >>> Lookahead(TEncCfg *); >>> ~Lookahead(); >>> >>> + void init(); >>> void addPicture(TComPic*, int sliceType); >>> void flush(); >>> void destroy(); >>> @@ -61,13 +74,15 @@ >>> int getEstimatedPictureCost(TComPic *pic); >>> >>> int estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty); >>> - void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool >>> bDoSearch[2]); >>> + void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool >>> bDoSearch[2], int row); >>> >>> void slicetypeAnalyse(bool bKeyframe); >>> int scenecut(int p0, int p1, bool bRealScenecut, int numFrames, int >>> maxSearch); >>> int scenecutInternal(int p0, int p1, bool bRealScenecut); >>> void slicetypePath(int length, char(*best_paths)[X265_LOOKAHEAD_MAX >>> + 1]); >>> int slicetypePathCost(char *path, int threshold); >>> + >>> + void processRow(int row); >>> }; >>> } >>> >>> _______________________________________________ >>> x265-devel mailing list >>> [email protected] >>> https://mailman.videolan.org/listinfo/x265-devel >>> >> >> >> >> -- >> Steve Borho >> >> _______________________________________________ >> x265-devel mailing list >> [email protected] >> https://mailman.videolan.org/listinfo/x265-devel >> >> > > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel > > -- Steve Borho
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
