Thanks - very nicely done!

On Tue, Dec 8, 2015 at 5:24 AM, Min Chen <[email protected]> wrote:
> # HG changeset patch > # User Min Chen <[email protected]> > # Date 1449511560 21600 > # Node ID c68eec7fb242748363ec985937b20ed1aff73f02 > # Parent 3542d3abd018491d6ad67a79b0e6d05b604d3818 > move SAO into class ParallelFilter and modify it to row based > --- > source/common/common.h | 1 + > source/encoder/frameencoder.cpp | 36 +++++++------- > source/encoder/framefilter.cpp | 95 > +++++++++++++++++++++++++------------- > source/encoder/framefilter.h | 14 +++--- > source/encoder/sao.cpp | 81 ++++++++++++++++++++++++--------- > source/encoder/sao.h | 7 ++- > 6 files changed, 151 insertions(+), 83 deletions(-) > > diff -r 3542d3abd018 -r c68eec7fb242 source/common/common.h > --- a/source/common/common.h Mon Dec 07 12:05:57 2015 -0600 > +++ b/source/common/common.h Mon Dec 07 12:06:00 2015 -0600 > @@ -215,6 +215,7 @@ > > #define X265_MALLOC(type, count) (type*)x265_malloc(sizeof(type) * > (count)) > #define X265_FREE(ptr) x265_free(ptr) > +#define X265_FREE_ZERO(ptr) x265_free(ptr); (ptr) = NULL > #define CHECKED_MALLOC(var, type, count) \ > { \ > var = (type*)x265_malloc(sizeof(type) * (count)); \ > diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/frameencoder.cpp > --- a/source/encoder/frameencoder.cpp Mon Dec 07 12:05:57 2015 -0600 > +++ b/source/encoder/frameencoder.cpp Mon Dec 07 12:06:00 2015 -0600 > @@ -1093,7 +1093,7 @@ > > /* SAO parameter estimation using non-deblocked pixels for CTU > bottom and right boundary areas */ > if (m_param->bEnableSAO && m_param->bSaoNonDeblocked) > - m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, > row); > + > m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, > col, row); > > /* Deblock with idle threading */ > if (m_param->bEnableLoopFilter) > @@ -1103,24 +1103,24 @@ > if (row > 0) > { > // Waitting last threading finish > - m_frameFilter.m_pdeblock[row - 1].waitForExit(); > + m_frameFilter.m_parallelFilter[row - 1].waitForExit(); > > // Processing new group > - const int allowCol = 
((row >= 2) ? > X265_MIN(m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(), (int)col) : > col); > - m_frameFilter.m_pdeblock[row - > 1].m_allowedCol.set(allowCol); > - m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1); > + const int allowCol = ((row >= 2) ? > X265_MIN(m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get(), (int)col) > : col); > + m_frameFilter.m_parallelFilter[row - > 1].m_allowedCol.set(allowCol); > + m_frameFilter.m_parallelFilter[row - > 1].tryBondPeers(*this, 1); > } > > // Last Row may start early > if (row == m_numRows - 1) > { > // Waitting last threading finish > - m_frameFilter.m_pdeblock[row].waitForExit(); > + m_frameFilter.m_parallelFilter[row].waitForExit(); > > // Processing last row > - const int allowCol = ((row >= 2) ? > X265_MIN(m_frameFilter.m_pdeblock[row - 1].m_lastCol.get(), (int)col) : > col); > - m_frameFilter.m_pdeblock[row].m_allowedCol.set(allowCol); > - m_frameFilter.m_pdeblock[row].tryBondPeers(*this, 1); > + const int allowCol = ((row >= 2) ? 
> X265_MIN(m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get(), (int)col) > : col); > + > m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol); > + m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, > 1); > } > } > > @@ -1188,17 +1188,17 @@ > if (m_param->bEnableLoopFilter & (row > 0)) > { > /* TODO: Multiple Threading */ > - m_frameFilter.m_pdeblock[row - 1].waitForExit(); > + m_frameFilter.m_parallelFilter[row - 1].waitForExit(); > > /* Check to avoid previous row process slower than current > row */ > if (row >= 2) > { > - int prevCol = m_frameFilter.m_pdeblock[row - > 2].m_lastCol.get(); > + int prevCol = m_frameFilter.m_parallelFilter[row - > 2].m_lastCol.get(); > while(prevCol != (int)numCols) > - prevCol = m_frameFilter.m_pdeblock[row - > 2].m_lastCol.waitForChange(prevCol); > + prevCol = m_frameFilter.m_parallelFilter[row - > 2].m_lastCol.waitForChange(prevCol); > } > - m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(numCols); > - m_frameFilter.m_pdeblock[row - 1].processTasks(-1); > + m_frameFilter.m_parallelFilter[row - > 1].m_allowedCol.set(numCols); > + m_frameFilter.m_parallelFilter[row - 1].processTasks(-1); > } > > /* trigger row-wise loop filters */ > @@ -1217,12 +1217,12 @@ > /* TODO: Early start last row */ > if (m_param->bEnableLoopFilter) > { > - X265_CHECK(m_frameFilter.m_pdeblock[row - > 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed"); > + X265_CHECK(m_frameFilter.m_parallelFilter[row - > 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed"); > > /* NOTE: Last Row not execute before, so didn't need wait > */ > - m_frameFilter.m_pdeblock[row].waitForExit(); > - m_frameFilter.m_pdeblock[row].m_allowedCol.set(numCols); > - m_frameFilter.m_pdeblock[row].processTasks(-1); > + m_frameFilter.m_parallelFilter[row].waitForExit(); > + > m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols); > + m_frameFilter.m_parallelFilter[row].processTasks(-1); > } > > for (uint32_t i = 
m_numRows - m_filterRowDelay; i < > m_numRows; i++) > diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/framefilter.cpp > --- a/source/encoder/framefilter.cpp Mon Dec 07 12:05:57 2015 -0600 > +++ b/source/encoder/framefilter.cpp Mon Dec 07 12:06:00 2015 -0600 > @@ -35,19 +35,22 @@ > static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, > uint32_t width, uint32_t height); > static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, > intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& > cnt); > > -uint32_t FrameFilter::ParallelDeblock::numCols = 0; > +uint32_t FrameFilter::ParallelFilter::numCols = 0; > > void FrameFilter::destroy() > { > - if (m_param->bEnableSAO) > - m_sao.destroy(); > - > X265_FREE(m_ssimBuf); > > - if (m_pdeblock) > + if (m_parallelFilter) > { > - delete[] m_pdeblock; > - m_pdeblock = NULL; > + if (m_param->bEnableSAO) > + { > + for(int row = 0; row < m_numRows; row++) > + m_parallelFilter[row].m_sao.destroy((row == 0 ? 1 : 0)); > + } > + > + delete[] m_parallelFilter; > + m_parallelFilter = NULL; > } > } > > @@ -63,50 +66,65 @@ > m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0; > m_lastHeight = m_param->sourceHeight % g_maxCUSize ? > m_param->sourceHeight % g_maxCUSize : g_maxCUSize; > > - if (m_param->bEnableSAO) > - if (!m_sao.create(m_param)) > - m_param->bEnableSAO = 0; > - > if (m_param->bEnableSsim) > m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3)); > > if (m_param->bEnableLoopFilter) > - m_pdeblock = new ParallelDeblock[numRows]; > + m_parallelFilter = new ParallelFilter[numRows]; > > - if (m_pdeblock) > + if (m_parallelFilter) > { > + if (m_param->bEnableSAO) > + { > + for(int row = 0; row < numRows; row++) > + { > + if (!m_parallelFilter[row].m_sao.create(m_param, (row == > 0 ? 
1 : 0))) > + m_param->bEnableSAO = 0; > + else > + { > + if (row != 0) > + > m_parallelFilter[row].m_sao.createFromRootNode(&m_parallelFilter[0].m_sao); > + } > + > + } > + } > + > for(int row = 0; row < numRows; row++) > { > - m_pdeblock[row].m_rowAddr = row * numCols; > - m_pdeblock[row].m_frameEncoder = m_frameEncoder; > + m_parallelFilter[row].m_rowAddr = row * numCols; > + m_parallelFilter[row].m_frameEncoder = m_frameEncoder; > } > } > > // Setting maximum columns > - ParallelDeblock::numCols = numCols; > + ParallelFilter::numCols = numCols; > } > > void FrameFilter::start(Frame *frame, Entropy& initState, int qp) > { > m_frame = frame; > > - if (m_param->bEnableSAO) > - m_sao.startSlice(frame, initState, qp); > - > - // Reset Deblock Data Struct > - if (m_pdeblock) > + // Reset Filter Data Struct > + if (m_parallelFilter) > { > for(int row = 0; row < m_numRows; row++) > { > - m_pdeblock[row].m_lastCol.set(0); > - m_pdeblock[row].m_allowedCol.set(0); > - m_pdeblock[row].m_encData = frame->m_encData; > + if (m_param->bEnableSAO) > + m_parallelFilter[row].m_sao.startSlice(frame, initState, > qp); > + > + m_parallelFilter[row].m_lastCol.set(0); > + m_parallelFilter[row].m_allowedCol.set(0); > + m_parallelFilter[row].m_encData = frame->m_encData; > } > + > + // Reset SAO global/common statistics > + if (m_param->bEnableSAO) > + m_parallelFilter[0].m_sao.resetStats(); > } > } > > // NOTE: Single Threading only > -void FrameFilter::ParallelDeblock::processTasks(int /*workerThreadId*/) > +void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/) > { > const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms; > const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap; > @@ -160,11 +178,11 @@ > SAOParam* saoParam = encData.m_saoParam; > if (m_param->bEnableSAO) > { > - m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext); > - m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext); > - 
m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext); > + > m_parallelFilter[row].m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext); > + > m_parallelFilter[row].m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext); > + > m_parallelFilter[row].m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext); > > - m_sao.rdoSaoUnitRow(saoParam, row); > + m_parallelFilter[row].m_sao.rdoSaoUnitRow(saoParam, row); > > // NOTE: Delay a row because SAO decide need top row pixels at > next row, is it HM's bug? > if (row >= m_saoRowDelay) > @@ -180,7 +198,7 @@ > { > if (m_param->bEnableSAO) > { > - m_sao.rdoSaoUnitRowEnd(saoParam, > encData.m_slice->m_sps->numCUsInFrame); > + m_parallelFilter[row].m_sao.rdoSaoUnitRowEnd(saoParam, > encData.m_slice->m_sps->numCUsInFrame); > > for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++) > processSao(i); > @@ -489,12 +507,23 @@ > SAOParam* saoParam = encData.m_saoParam; > > if (saoParam->bSaoFlag[0]) > - m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0); > + { > + > m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, > 0); > + if (row != m_numRows - 1) > + { > + memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[0], > m_parallelFilter[row].m_sao.m_tmpU1[0], sizeof(pixel) * > m_param->sourceWidth); > + } > + } > > if (saoParam->bSaoFlag[1]) > { > - m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1); > - m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2); > + > m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, > 1); > + > m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, > 2); > + if (row != m_numRows - 1) > + { > + memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[1], > m_parallelFilter[row].m_sao.m_tmpU1[1], sizeof(pixel) * > m_param->sourceWidth); > + memcpy(m_parallelFilter[row + 1].m_sao.m_tmpU1[2], > m_parallelFilter[row].m_sao.m_tmpU1[2], sizeof(pixel) * > m_param->sourceWidth); > + } > } > > if 
(encData.m_slice->m_pps->bTransquantBypassEnabled) > diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/framefilter.h > --- a/source/encoder/framefilter.h Mon Dec 07 12:05:57 2015 -0600 > +++ b/source/encoder/framefilter.h Mon Dec 07 12:06:00 2015 -0600 > @@ -51,7 +51,6 @@ > int m_vChromaShift; > int m_pad[2]; > > - SAO m_sao; > int m_numRows; > int m_saoRowDelay; > int m_lastHeight; > @@ -59,41 +58,42 @@ > void* m_ssimBuf; /* Temp storage for ssim computation */ > > #define MAX_PFILTER_CUS (4) /* maximum CUs for every thread */ > - class ParallelDeblock : public BondedTaskGroup, public Deblock > + class ParallelFilter : public BondedTaskGroup, public Deblock > { > public: > static uint32_t numCols; > uint32_t m_rowAddr; > FrameEncoder* m_frameEncoder; > FrameData* m_encData; > + SAO m_sao; > ThreadSafeInteger m_lastCol; /* The column that next > to process */ > ThreadSafeInteger m_allowedCol; /* The column that > processed from Encode pipeline */ > > - ParallelDeblock() > + ParallelFilter() > : m_rowAddr(0) > , m_frameEncoder(NULL) > , m_encData(NULL) > { > } > > - ~ParallelDeblock() > + ~ParallelFilter() > { } > > void processTasks(int workerThreadId); > > protected: > > - ParallelDeblock operator=(const ParallelDeblock&); > + ParallelFilter operator=(const ParallelFilter&); > }; > > - ParallelDeblock* m_pdeblock; > + ParallelFilter* m_parallelFilter; > > FrameFilter() > : m_param(NULL) > , m_frame(NULL) > , m_frameEncoder(NULL) > , m_ssimBuf(NULL) > - , m_pdeblock(NULL) > + , m_parallelFilter(NULL) > { > } > > diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/sao.cpp > --- a/source/encoder/sao.cpp Mon Dec 07 12:05:57 2015 -0600 > +++ b/source/encoder/sao.cpp Mon Dec 07 12:06:00 2015 -0600 > @@ -103,7 +103,7 @@ > m_depthSaoRate[1][3] = 0; > } > > -bool SAO::create(x265_param* param) > +bool SAO::create(x265_param* param, int initCommon) > { > m_param = param; > m_chromaFormat = param->internalCsp; > @@ -131,12 +131,24 @@ > m_tmpU2[i] += 1; > } > > - 
CHECKED_MALLOC(m_count, PerClass, NUM_PLANE); > - CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE); > - CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE); > + if (initCommon) > + { > + CHECKED_MALLOC(m_count, PerClass, NUM_PLANE); > + CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE); > + CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE); > > - CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu); > - CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu); > + CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu); > + CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu); > + } > + else > + { > + // must initialize these common pointer outside of function > + m_count = NULL; > + m_offset = NULL; > + m_offsetOrg = NULL; > + m_countPreDblk = NULL; > + m_offsetOrgPreDblk = NULL; > + } > > m_clipTable = &(m_clipTableBase[rangeExt]); > > @@ -155,24 +167,50 @@ > return false; > } > > -void SAO::destroy() > +void SAO::createFromRootNode(SAO* root) > { > - X265_FREE(m_clipTableBase); > + X265_CHECK(m_count == NULL, "duplicate initialize on m_count"); > + X265_CHECK(m_offset == NULL, "duplicate initialize on m_offset"); > + X265_CHECK(m_offsetOrg == NULL, "duplicate initialize on > m_offsetOrg"); > + X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on > m_countPreDblk"); > + X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on > m_offsetOrgPreDblk"); > > - X265_FREE(m_tmpL1); > - X265_FREE(m_tmpL2); > + m_count = root->m_count; > + m_offset = root->m_offset; > + m_offsetOrg = root->m_offsetOrg; > + m_countPreDblk = root->m_countPreDblk; > + m_offsetOrgPreDblk = root->m_offsetOrgPreDblk; > +} > + > +void SAO::destroy(int destoryCommon) > +{ > + X265_FREE_ZERO(m_clipTableBase); > + > + X265_FREE_ZERO(m_tmpL1); > + X265_FREE_ZERO(m_tmpL2); > > for (int i = 0; i < 3; i++) > { > - if (m_tmpU1[i]) X265_FREE(m_tmpU1[i] - 1); > - if (m_tmpU2[i]) X265_FREE(m_tmpU2[i] - 1); > + if (m_tmpU1[i]) > + { > + X265_FREE(m_tmpU1[i] - 1); > + m_tmpU1[i] = NULL; > + } > + if (m_tmpU2[i]) > + { > 
+ X265_FREE(m_tmpU2[i] - 1); > + m_tmpU2[i] = NULL; > + } > } > > - X265_FREE(m_count); > - X265_FREE(m_offset); > - X265_FREE(m_offsetOrg); > - X265_FREE(m_countPreDblk); > - X265_FREE(m_offsetOrgPreDblk); > + if (destoryCommon) > + { > + X265_FREE(m_count); > + X265_FREE(m_offset); > + X265_FREE(m_offsetOrg); > + X265_FREE(m_countPreDblk); > + X265_FREE(m_offsetOrgPreDblk); > + } > } > > /* allocate memory for SAO parameters */ > @@ -210,8 +248,6 @@ > break; > } > > - resetStats(); > - > m_entropyCoder.load(initState); > m_rdContexts.next.load(initState); > m_rdContexts.cur.load(initState); > @@ -586,15 +622,14 @@ > ctuHeight >>= m_vChromaShift; > } > > + int addr = idxY * m_numCuInWidth; > + pixel* rec = reconPic->getPlaneAddr(plane, addr); > + > if (!idxY) > { > - pixel* rec = reconPic->m_picOrg[plane]; > memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth); > } > > - int addr = idxY * m_numCuInWidth; > - pixel* rec = plane ? reconPic->getChromaAddr(plane, addr) : > reconPic->getLumaAddr(addr); > - > for (int i = 0; i < ctuHeight + 1; i++) > { > m_tmpL1[i] = rec[0]; > diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/sao.h > --- a/source/encoder/sao.h Mon Dec 07 12:05:57 2015 -0600 > +++ b/source/encoder/sao.h Mon Dec 07 12:06:00 2015 -0600 > @@ -120,8 +120,9 @@ > > SAO(); > > - bool create(x265_param* param); > - void destroy(); > + bool create(x265_param* param, int initCommon); > + void createFromRootNode(SAO *root); > + void destroy(int destoryCommon); > > void allocSaoParam(SAOParam* saoParam) const; > > @@ -147,6 +148,8 @@ > > void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus); > void rdoSaoUnitRow(SAOParam* saoParam, int idxY); > + > + friend class FrameFilter; > }; > > } > > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel > -- Deepthi Nandakumar Engineering Manager, x265 Multicoreware, Inc
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel
