On Fri, Oct 18, 2013 at 5:22 AM, Steve Borho <[email protected]> wrote:
> > > > On Thu, Oct 17, 2013 at 5:39 AM, <[email protected]>wrote: > >> # HG changeset patch >> # User Deepthi Devaki <[email protected]> >> # Date 1382006045 -19800 >> # Node ID 677d71fd793959c7e98df56f99f0b02a89fe050d >> # Parent fc9dbd798ac37ec1acc0596aa179f0deb586c092 >> Lookahead: Implement wavefront parallel processing >> > > this looks good; I think you have the general idea down. It can be > simplified a great deal, comments below > > >> diff -r fc9dbd798ac3 -r 677d71fd7939 source/common/wavefront.h >> --- a/source/common/wavefront.h Thu Oct 17 14:14:40 2013 +0530 >> +++ b/source/common/wavefront.h Thu Oct 17 16:04:05 2013 +0530 >> @@ -47,11 +47,6 @@ >> >> int m_numRows; >> >> - // WaveFront's implementation of JobProvider::findJob. Consults >> - // m_queuedBitmap and calls ProcessRow(row) for lowest numbered >> queued row >> - // or returns false >> - bool findJob(); >> - >> public: >> >> WaveFront(ThreadPool *pool) : JobProvider(pool), m_queuedBitmap(0) {} >> @@ -69,6 +64,11 @@ >> >> void clearEnabledRowMask(); >> >> + // WaveFront's implementation of JobProvider::findJob. Consults >> + // m_queuedBitmap and calls ProcessRow(row) for lowest numbered >> queued row >> + // or returns false >> + bool findJob(); >> + >> > > I think this is unnecessary if you refer to your own findJob() method > (this->findJob()) > > Do you mean to write separate findJob() for Lookahead? But then it will be a replica of Wavefront::findjob() > // Returns true if a row above curRow is available for processing. >> The processRow() >> // method may call this function periodically and voluntarily exit >> bool checkHigherPriorityRow(int curRow); >> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/encoder.cpp >> --- a/source/encoder/encoder.cpp Thu Oct 17 14:14:40 2013 +0530 >> +++ b/source/encoder/encoder.cpp Thu Oct 17 16:04:05 2013 +0530 >> @@ -92,6 +92,7 @@ >> } >> } >> m_lookahead = new Lookahead(this); >> + m_lookahead->setThreadPool(m_threadPool); >> m_dpb = new DPB(this); >> m_rateControl = new RateControl(this); >> } >> @@ -144,6 +145,7 @@ >> m_frameEncoder[i].init(this, numRows); >> } >> } >> + m_lookahead->init(); >> } >> >> int Encoder::getStreamHeaders(NALUnitEBSP **nalunits) >> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.cpp >> --- a/source/encoder/slicetype.cpp Thu Oct 17 14:14:40 2013 +0530 >> +++ b/source/encoder/slicetype.cpp Thu Oct 17 16:04:05 2013 +0530 >> @@ -62,29 +62,67 @@ >> dst.y = median(a.y, b.y, c.y); >> } >> >> -Lookahead::Lookahead(TEncCfg *_cfg) >> +Lookahead::Lookahead(TEncCfg *_cfg) : WaveFront(NULL) >> > > Pass the thread pool pointer as a second argument and initialize WaveFront > with it; then you don't need the setThreadPool() method > > >> { >> this->cfg = _cfg; >> numDecided = 0; >> lastKeyframe = -cfg->param.keyframeMax; >> lastNonB = NULL; >> - predictions = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8); >> - me.setQP(X265_LOOKAHEAD_QP); >> - me.setSearchMethod(X265_HEX_SEARCH); >> - me.setSubpelRefine(1); >> merange = 16; >> widthInCU = ((cfg->param.sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> >> X265_LOWRES_CU_BITS; >> heightInCU = ((cfg->param.sourceHeight / 2) + X265_LOWRES_CU_SIZE - >> 1) >> X265_LOWRES_CU_BITS; >> + >> + lock = new Lock[heightInCU]; >> + rowme = new MotionEstimate[heightInCU]; >> + >> + active = (bool*)X265_MALLOC(bool, heightInCU); >> + completed = (uint32_t*)X265_MALLOC(uint32_t, heightInCU); >> + rowpredictions = (pixel**)X265_MALLOC(pixel*, heightInCU); >> + rowcostEst = (int*)X265_MALLOC(int, heightInCU); >> + rowcostIntra = (int*)X265_MALLOC(int, heightInCU); >> + rowIntraMbs = (int*)X265_MALLOC(int, heightInCU); >> + for (int i = 0; i < heightInCU; i++) >> + { >> + rowme[i].setQP(X265_LOOKAHEAD_QP); >> + rowme[i].setSearchMethod(X265_HEX_SEARCH); >> + rowme[i].setSubpelRefine(1); >> + rowpredictions[i] = (pixel*)X265_MALLOC(pixel, 35 * 8 * 8); >> + rowcostEst[i] = 0; >> + rowcostIntra[i] = 0; >> + rowIntraMbs[i] = 0; >> + } >> > > This would be much cleaner as a per-row struct similar to CTURow > (LookaheadRow?) > > >> } >> >> Lookahead::~Lookahead() >> { >> } >> >> +void Lookahead::init() >> +{ >> + if (!WaveFront::init(heightInCU)) >> + { >> + m_pool = NULL; >> + } >> + memset((void*)completed, 0, heightInCU * sizeof(uint32_t)); >> + memset((void*)active, 0, heightInCU * sizeof(bool)); >> > > these memsets are unnecessary > > >> +} >> + >> void Lookahead::destroy() >> { >> - if (predictions) >> - X265_FREE(predictions); >> + delete[] lock; >> + delete[] rowme; >> + >> + X265_FREE((void*)active); >> + X265_FREE((void*)completed); >> + for (int i = 0; i < heightInCU; i++) >> + { >> + X265_FREE(rowpredictions[i]); >> + } >> + >> + X265_FREE(rowpredictions); >> + X265_FREE(rowcostEst); >> + X265_FREE(rowcostIntra); >> + X265_FREE(rowIntraMbs); >> >> // these two queues will be empty, unless the encode was aborted >> while (!inputQueue.empty()) >> @@ -165,9 +203,12 @@ >> int Lookahead::estimateFrameCost(int p0, int p1, int b, bool >> bIntraPenalty) >> { >> int score = 0; >> - bool bDoSearch[2]; >> Lowres *fenc = frames[b]; >> >> + curb = b; >> + curp0 = p0; >> + curp1 = p1; >> + >> if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 >> - b][0] != -1) >> score = fenc->costEst[b - p0][p1 - b]; >> else >> @@ -185,23 +226,58 @@ >> * predictors in the main encode. This considerably improves MV >> * prediction overall. */ >> // TODO: use lowres MVs as motion candidates in full-res search >> - me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride); >> - for (int j = heightInCU - 1; j >= 0; j--) >> + >> + for (int i = 0; i < heightInCU; i++) >> { >> - if (!fenc->bIntraCalculated) >> - fenc->rowSatds[0][0][j] = 0; >> - fenc->rowSatds[b - p0][p1 - b][j] = 0; >> + rowme[i].setSourcePlane(fenc->lowresPlane[0], >> fenc->lumaStride); >> + } >> >> - for (int i = widthInCU - 1; i >= 0; i--) >> + memset((void*)completed, 0, heightInCU * sizeof(uint32_t)); >> + memset((void*)active, 0, heightInCU * sizeof(bool)); >> + memset((void*)rowcostEst, 0, heightInCU * sizeof(int)); >> + memset((void*)rowcostIntra, 0, heightInCU * sizeof(int)); >> + memset((void*)rowIntraMbs, 0, heightInCU * sizeof(int)); >> + rowsCompleted = false; >> + >> + if (m_pool && cfg->param.bEnableWavefront) >> > > checking bEnableWavefront is unnecessary here; we can always use > wave-front lookahead scheduling so long as we have a thread pool. > > >> + { >> + WaveFront::clearEnabledRowMask(); >> > > We need another method which just sets the enabledRowMask to all 1s > because it's only purpose is to track reference frame dependencies and we > have none here. > > >> + WaveFront::enqueue(); >> + >> + for (int row = 0; row < heightInCU; row++) >> { >> - estimateCUCost(i, j, p0, p1, b, bDoSearch); >> + enableRow(row); >> + if (row == 0) >> + enqueueRow(0); >> + else >> + m_pool->pokeIdleThread(); >> + } >> > > this whole loop is unnecessary (no frame dependencies), just use > equeueRow(0) > > >> + >> + while (!rowsCompleted) >> + { >> + WaveFront::findJob(); >> + } >> + >> + WaveFront::dequeue(); >> + } >> + else >> + { >> + for (int row = 0; row < heightInCU; row++) >> + { >> + processRow(row); >> } >> } >> >> + //Accumulate cost from each row >> > nit: space after // > >> + for (int row = 0; row < heightInCU; row++) >> + { >> + score += rowcostEst[row]; >> + fenc->costEst[0][0] += rowcostIntra[row]; >> + fenc->intraMbs[b - p0] += rowIntraMbs[row]; >> > > score must be saved in fenc->costEst[b - p0][p1 - b]; > > Yes. It is stored later. > + } >> + >> fenc->bIntraCalculated = true; >> >> - score = fenc->costEst[b - p0][p1 - b]; >> - >> if (b != p1) >> score = (uint64_t)score * 100 / (130 + >> cfg->param.bFrameBias); >> >> @@ -218,7 +294,7 @@ >> return score; >> } >> >> -void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b, >> bool bDoSearch[2]) >> +void Lookahead::estimateCUCost(int cux, int cuy, int p0, int p1, int b, >> bool bDoSearch[2], int row) >> > > perhaps this would be cleaner if it were a method of LookaheadRow > > >> { >> Lowres *fref0 = frames[p0]; >> Lowres *fref1 = frames[p1]; >> @@ -233,7 +309,7 @@ >> const bool bFrameScoreCU = (cux > 0 && cux < widthInCU - 1 && >> cuy > 0 && cuy < heightInCU - 1) || >> widthInCU <= 2 || heightInCU <= 2; >> >> - me.setSourcePU(pelOffset, cuSize, cuSize); >> + rowme[row].setSourcePU(pelOffset, cuSize, cuSize); >> >> MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY], >> &fenc->lowresMvs[1][p1 - b - 1][cuXY] }; >> @@ -241,7 +317,7 @@ >> &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] }; >> >> MV mvmin, mvmax; >> - int bcost = me.COST_MAX; >> + int bcost = rowme[row].COST_MAX; >> int listused = 0; >> >> // establish search bounds that don't cross extended frame boundaries >> @@ -286,7 +362,7 @@ >> median_mv(mvp, mvc[0], mvc[1], mvc[2]); >> } >> >> - *fenc_costs[i] = me.motionEstimate(i ? fref1 : fref0, mvmin, >> mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]); >> + *fenc_costs[i] = rowme[row].motionEstimate(i ? fref1 : >> fref0, mvmin, mvmax, mvp, numc, mvc, merange, *fenc_mvs[i]); >> COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1); >> } >> if (bBidir) >> @@ -388,23 +464,23 @@ >> int predsize = cuSize * cuSize; >> >> // generate 35 intra predictions into tmp >> - primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, predictions, >> cuSize, cuSize, (cuSize <= 16)); >> + primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, >> rowpredictions[row], cuSize, cuSize, (cuSize <= 16)); >> pixel *above = (cuSize >= 8) ? pAbove1 : pAbove0; >> pixel *left = (cuSize >= 8) ? pLeft1 : pLeft0; >> - primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + >> 1, predictions + predsize, cuSize, cuSize); >> - primitives.intra_pred_allangs[nLog2SizeMinus2](predictions + 2 * >> predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16)); >> + primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + >> 1, rowpredictions[row] + predsize, cuSize, cuSize); >> + >> primitives.intra_pred_allangs[nLog2SizeMinus2](rowpredictions[row] + 2 * >> predsize, pAbove0, pLeft0, pAbove1, pLeft1, (cuSize <= 16)); >> >> // calculate 35 satd costs, keep least cost >> ALIGN_VAR_32(pixel, buf_trans[32 * 32]); >> - primitives.transpose[nLog2SizeMinus2](buf_trans, me.fenc, >> FENC_STRIDE); >> + primitives.transpose[nLog2SizeMinus2](buf_trans, >> rowme[row].fenc, FENC_STRIDE); >> pixelcmp_t satd = primitives.satd[PartitionFromSizes(cuSize, >> cuSize)]; >> - int icost = me.COST_MAX, cost; >> + int icost = rowme[row].COST_MAX, cost; >> for (UInt mode = 0; mode < 35; mode++) >> { >> if ((mode >= 2) && (mode < 18)) >> - cost = satd(buf_trans, cuSize, &predictions[mode * >> predsize], cuSize); >> + cost = satd(buf_trans, cuSize, &rowpredictions[row][mode >> * predsize], cuSize); >> else >> - cost = satd(me.fenc, FENC_STRIDE, &predictions[mode * >> predsize], cuSize); >> + cost = satd(rowme[row].fenc, FENC_STRIDE, >> &rowpredictions[row][mode * predsize], cuSize); >> if (cost < icost) >> icost = cost; >> } >> @@ -412,14 +488,14 @@ >> // TOOD: i_icost += intra_penalty + lowres_penalty; >> fenc->intraCost[cuXY] = icost; >> fenc->rowSatds[0][0][cuy] += icost; >> - if (bFrameScoreCU) fenc->costEst[0][0] += icost; >> + if (bFrameScoreCU) rowcostIntra[row] += icost; >> } >> >> if (!bBidir) >> { >> if (fenc->intraCost[cuXY] < bcost) >> { >> - if (bFrameScoreCU) fenc->intraMbs[b - p0]++; >> + if (bFrameScoreCU) rowIntraMbs[row]++; >> bcost = fenc->intraCost[cuXY]; >> listused = 0; >> } >> @@ -429,7 +505,7 @@ >> if (p0 != p1) >> { >> fenc->rowSatds[b - p0][p1 - b][cuy] += bcost; >> - if (bFrameScoreCU) fenc->costEst[b - p0][p1 - b] += bcost; >> + if (bFrameScoreCU) rowcostEst[row] += bcost; >> } >> fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, >> LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT)); >> } >> @@ -547,6 +623,7 @@ >> { >> frames[i + 1] = &list[i]->m_lowres; >> } >> + >> if (IS_X265_TYPE_I(frames[bframes + 1]->sliceType)) >> p0 = bframes + 1; >> else // P >> @@ -914,7 +991,7 @@ >> { >> char paths[2][X265_LOOKAHEAD_MAX + 1]; >> int num_paths = X265_MIN(cfg->param.bframes + 1, length); >> - int best_cost = me.COST_MAX; >> + int best_cost = rowme[0].COST_MAX; >> int idx = 0; >> >> /* Iterate over all currently possible paths */ >> @@ -993,3 +1070,41 @@ >> >> return cost; >> } >> + >> +void Lookahead::processRow(int row) >> +{ >> + int realrow = heightInCU - 1 - row; >> + Lowres *fenc = frames[curb]; >> + >> + if (!fenc->bIntraCalculated) >> + fenc->rowSatds[0][0][realrow] = 0; >> + fenc->rowSatds[curb - curp0][curp1 - curb][realrow] = 0; >> + for (int i = widthInCU - 1 - completed[row]; i >= 0; i--) //Go >> backwards >> + { >> + estimateCUCost(i, realrow, curp0, curp1, curb, bDoSearch, row); >> + completed[row]++; >> + >> + if (completed[row] >= 2 && row < heightInCU - 1) >> + { >> + ScopedLock below(lock[row + 1]); >> + if (active[row + 1] == false && >> + completed[row + 1] + 2 <= completed[row]) >> + { >> + active[row + 1] = true; >> + enqueueRow(row + 1); >> + } >> + } >> + >> + ScopedLock self(lock[row]); >> + if (row > 0 && (int32_t)completed[row] < widthInCU - 1 && >> completed[row - 1] < completed[row] + 2) >> + { >> + active[row] = false; >> + return; >> + } >> + } >> + >> + if (row == heightInCU - 1) >> + { >> + rowsCompleted = true; >> + } >> +} >> diff -r fc9dbd798ac3 -r 677d71fd7939 source/encoder/slicetype.h >> --- a/source/encoder/slicetype.h Thu Oct 17 14:14:40 2013 +0530 >> +++ b/source/encoder/slicetype.h Thu Oct 17 16:04:05 2013 +0530 >> @@ -26,7 +26,7 @@ >> >> #include "motion.h" >> #include "piclist.h" >> -#include "common.h" >> +#include "wavefront.h" >> >> namespace x265 { >> // private namespace >> @@ -35,11 +35,9 @@ >> class TComPic; >> class TEncCfg; >> >> -struct Lookahead >> +struct Lookahead : public WaveFront >> { >> - MotionEstimate me; >> TEncCfg *cfg; >> - pixel *predictions; // buffer for 35 intra predictions >> Lowres *frames[X265_LOOKAHEAD_MAX]; >> Lowres *lastNonB; >> int merange; >> @@ -51,9 +49,24 @@ >> PicList inputQueue; // input pictures in order received >> PicList outputQueue; // pictures to be encoded, in encode order >> >> + bool bDoSearch[2]; >> + int curb, curp0, curp1; >> + bool rowsCompleted; >> + >> + //For wavefront parallelism >> + Lock* lock; >> + volatile bool* active; >> + volatile uint32_t* completed; >> + pixel** rowpredictions; // buffer for 35 intra >> predictions >> + MotionEstimate* rowme; >> + int* rowcostEst; >> + int* rowcostIntra; >> + int* rowIntraMbs; >> + >> Lookahead(TEncCfg *); >> ~Lookahead(); >> >> + void init(); >> void addPicture(TComPic*, int sliceType); >> void flush(); >> void destroy(); >> @@ -61,13 +74,15 @@ >> int getEstimatedPictureCost(TComPic *pic); >> >> int estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty); >> - void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool >> bDoSearch[2]); >> + void estimateCUCost(int cux, int cuy, int p0, int p1, int b, bool >> bDoSearch[2], int row); >> >> void slicetypeAnalyse(bool bKeyframe); >> int scenecut(int p0, int p1, bool bRealScenecut, int numFrames, int >> maxSearch); >> int scenecutInternal(int p0, int p1, bool bRealScenecut); >> void slicetypePath(int length, char(*best_paths)[X265_LOOKAHEAD_MAX >> + 1]); >> int slicetypePathCost(char *path, int threshold); >> + >> + void processRow(int row); >> }; >> } >> >> _______________________________________________ >> x265-devel mailing list >> [email protected] >> https://mailman.videolan.org/listinfo/x265-devel >> > > > > -- > Steve Borho > > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel > >
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
