On Thu, Oct 17, 2013 at 5:39 AM, <[email protected]> wrote:
> # HG changeset patch > # User Deepthi Devaki <[email protected]> > # Date 1382006045 -19800 > # Node ID 677d71fd793959c7e98df56f99f0b02a89fe050d > # Parent fc9dbd798ac37ec1acc0596aa179f0deb586c092 > Lookahead: Implement wavefront parallel processing > this looks good; I think you have the general idea down. It can be simplified a great deal, comments below > diff -r fc9dbd798ac3 -r 677d71fd7939 source/common/wavefront.h > --- a/source/common/wavefront.h Thu Oct 17 14:14:40 2013 +0530 > +++ b/source/common/wavefront.h Thu Oct 17 16:04:05 2013 +0530 > @@ -47,11 +47,6 @@ > > int m_numRows; > > - // WaveFront's implementation of JobProvider::findJob. Consults > - // m_queuedBitmap and calls ProcessRow(row) for lowest numbered > queued row > - // or returns false > - bool findJob(); > - > public: > > WaveFront(ThreadPool *pool) : JobProvider(pool), m_queuedBitmap(0) {} > @@ -69,6 +64,11 @@ > > void clearEnabledRowMask(); > > + // WaveFront's implementation of JobProvider::findJob. Consults > + // m_queuedBitmap and calls ProcessRow(row) for lowest numbered > queued row > + // or returns false > + bool findJob(); > + > I think this is unnecessary if you refer to your own findJob() method (this->findJob()) > // Returns true if a row above curRow is available for processing. > The processRow() > // method may call this function periodically and voluntarily exit > bool checkHigherPriorityRow(int curRow); > diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/encoder.cpp > --- a/source/encoder/encoder.cpp Thu Oct 17 14:14:40 2013 +0530 > +++ b/source/encoder/encoder.cpp Thu Oct 17 16:04:05 2013 +0530 > @@ -92,6 +92,7 @@ > } > } > m_lookahead = new Lookahead(this); > + m_lookahead->setThreadPool(m_threadPool); > m_dpb = new DPB(this); > m_rateControl = new RateControl(this); > } > @@ -144,6 +145,7 @@ > m_frameEncoder[i].init(this, numRows); > } > } > + m_lookahead->init(); > } > > int Encoder::getStreamHeaders(NALUnitEBSP **nalunits) > diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.cpp > --- a/source/encoder/slicetype.cpp Thu Oct 17 14:14:40 2013 +0530 > +++ b/source/encoder/slicetype.cpp Thu Oct 17 16:04:05 2013 +0530 > @@ -62,29 +62,67 @@ > dst.y = median(a.y, b.y, c.y); > } > > -Lookahead::Lookahead(TEncCfg *_cfg) > +Lookahead::Lookahead(TEncCfg *_cfg) : WaveFront(NULL) > Pass the thread pool pointer as a second argument and initialize WaveFront with it; then you don't need the setThreadPool() method > { > this->cfg = _cfg; > numDecided = 0; > lastKeyframe = -cfg->param.keyframeMax; > lastNonB = NULL; > - predictions = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8); > - me.setQP(X265_LOOKAHEAD_QP); > - me.setSearchMethod(X265_HEX_SEARCH); > - me.setSubpelRefine(1); > merange = 16; > widthInCU = ((cfg->param.sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) > >> X265_LOWRES_CU_BITS; > heightInCU = ((cfg->param.sourceHeight / 2) + X265_LOWRES_CU_SIZE - > 1) >> X265_LOWRES_CU_BITS; > + > + lock = new Lock[heightInCU]; > + rowme = new MotionEstimate[heightInCU]; > + > + active = (bool*)X265_MALLOC(bool, heightInCU); > + completed = (uint32_t*)X265_MALLOC(uint32_t, heightInCU); > + rowpredictions = (pixel**)X265_MALLOC(pixel*, heightInCU); > + rowcostEst = (int*)X265_MALLOC(int, heightInCU); > + rowcostIntra = (int*)X265_MALLOC(int, heightInCU); > + rowIntraMbs = (int*)X265_MALLOC(int, heightInCU); > + for (int i = 0; i < heightInCU; i++) > + { > + rowme[i].setQP(X265_LOOKAHEAD_QP); > + rowme[i].setSearchMethod(X265_HEX_SEARCH); > + rowme[i].setSubpelRefine(1); > + rowpredictions[i] = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8); > + rowcostEst[i] = 0; > + rowcostIntra[i] = 0; > + rowIntraMbs[i] = 0; > + } > This would be much cleaner as a per-row struct similar to CTURow (LookaheadRow?) > } > > Lookahead::~Lookahead() > { > } > > +void Lookahead::init() > +{ > + if (!WaveFront::init(heightInCU)) > + { > + m_pool = NULL; > + } > + memset((void*)completed, 0, heightInCU * sizeof(uint32_t)); > + memset((void*)active, 0, heightInCU * sizeof(bool)); > these memsets are unnecessary > +} > + > void Lookahead::destroy() > { > - if (predictions) > - X265_FREE(predictions); > + delete[] lock; > + delete[] rowme; > + > + X265_FREE((void*)active); > + X265_FREE((void*)completed); > + for (int i = 0; i < heightInCU; i++) > + { > + X265_FREE(rowpredictions[i]); > + } > + > + X265_FREE(rowpredictions); > + X265_FREE(rowcostEst); > + X265_FREE(rowcostIntra); > + X265_FREE(rowIntraMbs); > > // these two queues will be empty, unless the encode was aborted > while (!inputQueue.empty()) > @@ -165,9 +203,12 @@ > int Lookahead::estimateFrameCost(int p0, int p1, int b, bool > bIntraPenalty) > { > int score = 0; > - bool bDoSearch[2]; > Lowres *fenc = frames[b]; > > + curb = b; > + curp0 = p0; > + curp1 = p1; > + > if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - > b][0] != -1) > score = fenc->costEst[b - p0][p1 - b]; > else > @@ -185,23 +226,58 @@ > * predictors in the main encode. This considerably improves MV > * prediction overall. */ > // TODO: use lowres MVs as motion candidates in full-res search > - me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride); > - for (int j = heightInCU - 1; j >= 0; j--) > + > + for (int i = 0; i < heightInCU; i++) > { > - if (!fenc->bIntraCalculated) > - fenc->rowSatds[0][0][j] = 0; > - fenc->rowSatds[b - p0][p1 - b][j] = 0; > + rowme[i].setSourcePlane(fenc->lowresPlane[0], > fenc->lumaStride); > + } > > - for (int i = widthInCU - 1; i >= 0; i--) > + memset((void*)completed, 0, heightInCU * sizeof(uint32_t)); > + memset((void*)active, 0, heightInCU * sizeof(bool)); > + memset((void*)rowcostEst, 0, heightInCU * sizeof(int)); > + memset((void*)rowcostIntra, 0, heightInCU * sizeof(int)); > + memset((void*)rowIntraMbs, 0, heightInCU * sizeof(int)); > + rowsCompleted = false; > + > + if (m_pool && cfg->param.bEnableWavefront) > checking bEnableWavefront is unnecessary here; we can always use wave-front lookahead scheduling so long as we have a thread pool. > + { > + WaveFront::clearEnabledRowMask(); > We need another method which just sets the enabledRowMask to all 1s because it's only purpose is to track reference frame dependencies and we have none here. > + WaveFront::enqueue(); > + > + for (int row = 0; row < heightInCU; row++) > { > - estimateCUCost(i, j, p0, p1, b, bDoSearch); > + enableRow(row); > + if (row == 0) > + enqueueRow(0); > + else > + m_pool->pokeIdleThread(); > + } > this whole loop is unnecessary (no frame dependencies), just use equeueRow(0) > + > + while (!rowsCompleted) > + { > + WaveFront::findJob(); > + } > + > + WaveFront::dequeue(); > + } > + else > + { > + for (int row = 0; row < heightInCU; row++) > + { > + processRow(row); > } > } > > + //Accumulate cost from each row > nit: space after // > + for (int row = 0; row < heightInCU; row++) > + { > + score += rowcostEst[row]; > + fenc->costEst[0][0] += rowcostIntra[row]; > + fenc->intraMbs[b - p0] += rowIntraMbs[row]; > score must be saved in fenc->costEst[b - p0][p1 - b]; > + } > + > fenc->bIntraCalculated = true; > > - score = fenc->costEst[b - p0][p1 - b]; > - > if (b != p1) > score = (uint64_t)score * 100 / (130 + cfg->param.bFrameBias); > > @@ -218,7 +294,7 @@ > return score; > } > > -void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b, > bool bDoSearch[2]) > +void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b, > bool bDoSearch[2], int row) > perhaps this would be cleaner if it were a method of LookaheadRow > { > Lowres *fref0 = frames[p0]; > Lowres *fref1 = frames[p1]; > @@ -233,7 +309,7 @@ > const bool bFrameScoreCU = (cux > 0 && cux < widthInCU - 1 && > cuy > 0 && cuy < heightInCU - 1) || > widthInCU <= 2 || heightInCU <= 2; > > - me.setSourcePU(pelOffset, cuSize, cuSize); > + rowme[row].setSourcePU(pelOffset, cuSize, cuSize); > > MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY], > &fenc->lowresMvs[1][p1 - b - 1][cuXY] }; > @@ -241,7 +317,7 @@ > &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] }; > > MV mvmin, mvmax; > - int bcost = me.COST_MAX; > + int bcost = rowme[row].COST_MAX; > int listused = 0; > > // establish search bounds that don't cross extended frame boundaries > @@ -286,7 +362,7 @@ > median_mv(mvp, mvc[0], mvc[1], mvc[2]); > } > > - *fenc_costs[i] = me.motionEstimate(i ? fref1 : fref0, mvmin, > mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]); > + *fenc_costs[i] = rowme[row].motionEstimate(i ? fref1 : fref0, > mvmin, mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]); > COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1); > } > if (bBidir) > @@ -388,23 +464,23 @@ > int predsize = cuSize * cuSize; > > // generate 35 intra predictions into tmp > - primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, predictions, > cuSize, cuSize, (cuSize <= 16)); > + primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, > rowpredictions[row], cuSize, cuSize, (cuSize <= 16)); > pixel *above = (cuSize >= 8) ? pAbove1 : pAbove0; > pixel *left = (cuSize >= 8) ? pLeft1 : pLeft0; > - primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, > predictions + predsize, cuSize, cuSize); > - primitives.intra_pred_allangs[nLog2SizeMinus2](predictions + 2 * > predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16)); > + primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, > rowpredictions[row] + predsize, cuSize, cuSize); > + > primitives.intra_pred_allangs[nLog2SizeMinus2](rowpredictions[row] + 2 * > predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16)); > > // calculate 35 satd costs, keep least cost > ALIGN_VAR_32(pixel, buf_trans[32 * 32]); > - primitives.transpose[nLog2SizeMinus2](buf_trans, me.fenc, > FENC_STRIDE); > + primitives.transpose[nLog2SizeMinus2](buf_trans, rowme[row].fenc, > FENC_STRIDE); > pixelcmp_t satd = primitives.satd[PartitionFromSizes(cuSize, > cuSize)]; > - int icost = me.COST_MAX, cost; > + int icost = rowme[row].COST_MAX, cost; > for (UInt mode = 0; mode < 35; mode++) > { > if ((mode >= 2) && (mode < 18)) > - cost = satd(buf_trans, cuSize, &predictions[mode * > predsize], cuSize); > + cost = satd(buf_trans, cuSize, &rowpredictions[row][mode > * predsize], cuSize); > else > - cost = satd(me.fenc, FENC_STRIDE, &predictions[mode * > predsize], cuSize); > + cost = satd(rowme[row].fenc, FENC_STRIDE, > &rowpredictions[row][mode * predsize], cuSize); > if (cost < icost) > icost = cost; > } > @@ -412,14 +488,14 @@ > // TOOD: i_icost += intra_penalty + lowres_penalty; > fenc->intraCost[cuXY] = icost; > fenc->rowSatds[0][0][cuy] += icost; > - if (bFrameScoreCU) fenc->costEst[0][0] += icost; > + if (bFrameScoreCU) rowcostIntra[row] += icost; > } > > if (!bBidir) > { > if (fenc->intraCost[cuXY] < bcost) > { > - if (bFrameScoreCU) fenc->intraMbs[b - p0]++; > + if (bFrameScoreCU) rowIntraMbs[row]++; > bcost = fenc->intraCost[cuXY]; > listused = 0; > } > @@ -429,7 +505,7 @@ > if (p0 != p1) > { > fenc->rowSatds[b - p0][p1 - b][cuy] += bcost; > - if (bFrameScoreCU) fenc->costEst[b - p0][p1 - b] += bcost; > + if (bFrameScoreCU) rowcostEst[row] += bcost; > } > fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, > LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT)); > } > @@ -547,6 +623,7 @@ > { > frames[i + 1] = &list[i]->m_lowres; > } > + > if (IS_X265_TYPE_I(frames[bframes + 1]->sliceType)) > p0 = bframes + 1; > else // P > @@ -914,7 +991,7 @@ > { > char paths[2][X265_LOOKAHEAD_MAX + 1]; > int num_paths = X265_MIN(cfg->param.bframes + 1, length); > - int best_cost = me.COST_MAX; > + int best_cost = rowme[0].COST_MAX; > int idx = 0; > > /* Iterate over all currently possible paths */ > @@ -993,3 +1070,41 @@ > > return cost; > } > + > +void Lookahead::processRow(int row) > +{ > + int realrow = heightInCU - 1 - row; > + Lowres *fenc = frames[curb]; > + > + if (!fenc->bIntraCalculated) > + fenc->rowSatds[0][0][realrow] = 0; > + fenc->rowSatds[curb - curp0][curp1 - curb][realrow] = 0; > + for (int i = widthInCU - 1 - completed[row]; i >= 0; i--) //Go > backwards > + { > + estimateCUCost(i, realrow, curp0, curp1, curb, bDoSearch, row); > + completed[row]++; > + > + if (completed[row] >= 2 && row < heightInCU - 1) > + { > + ScopedLock below(lock[row + 1]); > + if (active[row + 1] == false && > + completed[row + 1] + 2 <= completed[row]) > + { > + active[row + 1] = true; > + enqueueRow(row + 1); > + } > + } > + > + ScopedLock self(lock[row]); > + if (row > 0 && (int32_t)completed[row] < widthInCU - 1 && > completed[row - 1] < completed[row] + 2) > + { > + active[row] = false; > + return; > + } > + } > + > + if (row == heightInCU - 1) > + { > + rowsCompleted = true; > + } > +} > diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.h > --- a/source/encoder/slicetype.h Thu Oct 17 14:14:40 2013 +0530 > +++ b/source/encoder/slicetype.h Thu Oct 17 16:04:05 2013 +0530 > @@ -26,7 +26,7 @@ > > #include "motion.h" > #include "piclist.h" > -#include "common.h" > +#include "wavefront.h" > > namespace x265 { > // private namespace > @@ -35,11 +35,9 @@ > class TComPic; > class TEncCfg; > > -struct Lookahead > +struct Lookahead : public WaveFront > { > - MotionEstimate me; > TEncCfg *cfg; > - pixel *predictions; // buffer for 35 intra predictions > Lowres *frames[X265_LOOKAHEAD_MAX]; > Lowres *lastNonB; > int merange; > @@ -51,9 +49,24 @@ > PicList inputQueue; // input pictures in order received > PicList outputQueue; // pictures to be encoded, in encode order > > + bool bDoSearch[2]; > + int curb, curp0, curp1; > + bool rowsCompleted; > + > + //For wavefront parallelism > + Lock* lock; > + volatile bool* active; > + volatile uint32_t* completed; > + pixel** rowpredictions; // buffer for 35 intra > predictions > + MotionEstimate* rowme; > + int* rowcostEst; > + int* rowcostIntra; > + int* rowIntraMbs; > + > Lookahead(TEncCfg *); > ~Lookahead(); > > + void init(); > void addPicture(TComPic*, int sliceType); > void flush(); > void destroy(); > @@ -61,13 +74,15 @@ > int getEstimatedPictureCost(TComPic *pic); > > int estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty); > - void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool > bDoSearch[2]); > + void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool > bDoSearch[2], int row); > > void slicetypeAnalyse(bool bKeyframe); > int scenecut(int p0, int p1, bool bRealScenecut, int numFrames, int > maxSearch); > int scenecutInternal(int p0, int p1, bool bRealScenecut); > void slicetypePath(int length, char(*best_paths)[X265_LOOKAHEAD_MAX + > 1]); > int slicetypePathCost(char *path, int threshold); > + > + void processRow(int row); > }; > } > > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel > -- Steve Borho
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
