# HG changeset patch # User Gopu Govindaswamy <g...@multicorewareinc.com> # Date 1410866456 -19800 # Tue Sep 16 16:50:56 2014 +0530 # Node ID 717ea14104cf32bbcafe8e9b8ddef17867807936 # Parent 1de67321275e70d510f0df3d5b7d4b9d391a1e66 analysis: intra picture estimation (mode and split decision) information sharing
When --analysis-mode=save, the encoder runs a full encode and dumps the best split and mode decisions into the x265_analysis.dat file (the default file name if no file name is provided). When --analysis-mode=load, the encoder reads the best split and mode decisions from x265_analysis.dat and bypasses the actual split and mode decisions, and therefore performs a much faster encode. diff -r 1de67321275e -r 717ea14104cf source/Lib/TLibCommon/TComRom.cpp --- a/source/Lib/TLibCommon/TComRom.cpp Mon Sep 15 15:00:13 2014 +0200 +++ b/source/Lib/TLibCommon/TComRom.cpp Tue Sep 16 16:50:56 2014 +0530 @@ -505,5 +505,18 @@ 0x38, }; + /* Contains how much to increment shared depth buffer for different ctu sizes to get next best depth + * here, depth 0 = 64x64, depth 1 = 32x32, depth 2 = 16x16 and depth 3 = 8x8 + * if ctu = 64, depth buffer size is 256 combination of depth values 0, 1, 2, 3 + * if ctu = 32, depth buffer size is 64 combination of depth values 1, 2, 3 + * if ctu = 16, depth buffer size is 16 combination of depth values 2, 3 */ + +const uint32_t g_depthInc[3][4] = +{ + { 16, 4, 0, 0}, + { 64, 16, 4, 1}, + {256, 64, 16, 4} +}; + } //! 
\} diff -r 1de67321275e -r 717ea14104cf source/Lib/TLibCommon/TComRom.h --- a/source/Lib/TLibCommon/TComRom.h Mon Sep 15 15:00:13 2014 +0200 +++ b/source/Lib/TLibCommon/TComRom.h Tue Sep 16 16:50:56 2014 +0530 @@ -155,6 +155,8 @@ // Intra tables extern const uint8_t g_intraFilterFlags[35]; +extern const uint32_t g_depthInc[3][4]; + } #endif //ifndef X265_TCOMROM_H diff -r 1de67321275e -r 717ea14104cf source/encoder/analysis.cpp --- a/source/encoder/analysis.cpp Mon Sep 15 15:00:13 2014 +0200 +++ b/source/encoder/analysis.cpp Tue Sep 16 16:50:56 2014 +0530 @@ -311,14 +311,25 @@ uint32_t numPartition = cu->getTotalNumPart(); if (m_bestCU[0]->m_slice->m_sliceType == I_SLICE) { - compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, cu->m_CULocalData); - if (m_param->analysisMode == 1) + if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_bestCU[0]->m_pic->m_intraData) { - memcpy(&m_bestCU[0]->m_pic->m_intraData->depth[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getDepth(), sizeof(uint8_t) * cu->getTotalNumPart()); - memcpy(&m_bestCU[0]->m_pic->m_intraData->modes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getLumaIntraDir(), sizeof(uint8_t) * cu->getTotalNumPart()); - memcpy(&m_bestCU[0]->m_pic->m_intraData->partSizes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getPartitionSize(), sizeof(char) * cu->getTotalNumPart()); - m_bestCU[0]->m_pic->m_intraData->cuAddr[cu->getAddr()] = cu->getAddr(); - m_bestCU[0]->m_pic->m_intraData->poc[cu->getAddr()] = cu->m_pic->m_POC; + uint32_t zOrder = 0; + compressSharedIntraCTU(m_bestCU[0], m_tempCU[0], false, cu, cu->m_CULocalData, + &m_bestCU[0]->m_pic->m_intraData->depth[cu->getAddr() * cu->m_numPartitions], + &m_bestCU[0]->m_pic->m_intraData->partSizes[cu->getAddr() * cu->m_numPartitions], + &m_bestCU[0]->m_pic->m_intraData->modes[cu->getAddr() * cu->m_numPartitions], zOrder); + } + else + { + compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, cu->m_CULocalData); + if (m_param->analysisMode == 
X265_ANALYSIS_SAVE && m_bestCU[0]->m_pic->m_intraData) + { + memcpy(&m_bestCU[0]->m_pic->m_intraData->depth[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getDepth(), sizeof(uint8_t) * cu->getTotalNumPart()); + memcpy(&m_bestCU[0]->m_pic->m_intraData->modes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getLumaIntraDir(), sizeof(uint8_t) * cu->getTotalNumPart()); + memcpy(&m_bestCU[0]->m_pic->m_intraData->partSizes[cu->getAddr() * cu->m_numPartitions], m_bestCU[0]->getPartitionSize(), sizeof(char) * cu->getTotalNumPart()); + m_bestCU[0]->m_pic->m_intraData->cuAddr[cu->getAddr()] = cu->getAddr(); + m_bestCU[0]->m_pic->m_intraData->poc[cu->getAddr()] = cu->m_pic->m_POC; + } } if (m_param->bLogCuStats || m_param->rc.bStatWrite) { @@ -424,9 +435,9 @@ if (cu_unsplit_flag) { m_quant.setQPforQuant(outTempCU); - checkIntra(outBestCU, outTempCU, SIZE_2Nx2N, cu); + checkIntra(outBestCU, outTempCU, SIZE_2Nx2N, cu, NULL); if (depth == g_maxCUDepth) - checkIntra(outBestCU, outTempCU, SIZE_NxN, cu); + checkIntra(outBestCU, outTempCU, SIZE_NxN, cu, NULL); else { m_entropyCoder->resetBits(); @@ -533,7 +544,141 @@ #endif } -void Analysis::checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu) +void Analysis::compressSharedIntraCTU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu, uint8_t* sharedDepth, char* sharedPartSizes, uint8_t* sharedModes, uint32_t &zOrder) +{ + Frame* pic = outBestCU->m_pic; + + // if current depth == shared depth then skip further splitting. 
+ bool bSubBranch = true; + + // index to g_depthInc array to increment zOrder offset to next depth + int32_t ctuToDepthIndex = g_maxCUDepth - 1; + + if (!depth) + m_origYuv[depth]->copyFromPicYuv(pic->getPicYuvOrg(), outBestCU->getAddr(), outBestCU->getZorderIdxInCU()); + else + m_origYuv[0]->copyPartToYuv(m_origYuv[depth], outBestCU->getZorderIdxInCU()); + + Slice* slice = outTempCU->m_slice; + int32_t cu_split_flag = !(cu->flags & CU::LEAF); + int32_t cu_unsplit_flag = !(cu->flags & CU::SPLIT_MANDATORY); + + if (cu_unsplit_flag && ((zOrder == outBestCU->getZorderIdxInCU()) && (depth == sharedDepth[zOrder]))) + { + m_quant.setQPforQuant(outTempCU); + checkIntra(outBestCU, outTempCU, (PartSize)sharedPartSizes[zOrder], cu, &sharedModes[zOrder]); + + if (!(depth == g_maxCUDepth)) + { + m_entropyCoder->resetBits(); + m_entropyCoder->codeSplitFlag(outBestCU, 0, depth); + outBestCU->m_totalBits += m_entropyCoder->getNumberOfWrittenBits(); + } + + // set current best CU cost to 0 marking as best CU present in shared CU data + outBestCU->m_totalRDCost = 0; + bSubBranch = false; + + // increment zOrder offset to point to next best depth in sharedDepth buffer + zOrder += g_depthInc[ctuToDepthIndex][sharedDepth[zOrder]]; + } + + // copy original YUV samples in lossless mode + if (outBestCU->isLosslessCoded(0)) + fillOrigYUVBuffer(outBestCU, m_origYuv[depth]); + + // further split + if (cu_split_flag && bSubBranch) + { + uint32_t nextDepth = depth + 1; + TComDataCU* subBestPartCU = m_bestCU[nextDepth]; + TComDataCU* subTempPartCU = m_tempCU[nextDepth]; + for (uint32_t partUnitIdx = 0; partUnitIdx < 4; partUnitIdx++) + { + CU *child_cu = cuPicsym->m_CULocalData + cu->childIdx + partUnitIdx; + + if (child_cu->flags & CU::PRESENT) + { + int32_t qp = outTempCU->getQP(0); + subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init. + subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth, qp); // clear sub partition datas or init. 
+ + if (partUnitIdx) // initialize RD with previous depth buffer + m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[nextDepth][CI_NEXT_BEST]); + else + m_rdEntropyCoders[nextDepth][CI_CURR_BEST].load(m_rdEntropyCoders[depth][CI_CURR_BEST]); + + // set current best CU cost to 1 marking as non-best CU by default + subTempPartCU->m_totalRDCost = 1; + + compressSharedIntraCTU(subBestPartCU, subTempPartCU, nextDepth, cuPicsym, child_cu, sharedDepth, sharedPartSizes, sharedModes, zOrder); + outTempCU->copyPartFrom(subBestPartCU, partUnitIdx, nextDepth); // Keep best part data to current temporary data. + + if(!subBestPartCU->m_totalRDCost) // if cost is 0, CU is best CU + outTempCU->m_totalRDCost = 0; // set outTempCU cost to 0, so later check will use this CU as best CU + + copyYuv2Tmp(subBestPartCU->getTotalNumPart() * partUnitIdx, nextDepth); + } + else + { + subBestPartCU->copyToPic(nextDepth); + outTempCU->copyPartFrom(subBestPartCU, partUnitIdx, nextDepth); + + // increment zOrder offset to point to next best depth in sharedDepth buffer + zOrder += g_depthInc[ctuToDepthIndex][nextDepth]; + } + } + + if (cu->flags & CU::PRESENT) + { + m_entropyCoder->resetBits(); + m_entropyCoder->codeSplitFlag(outTempCU, 0, depth); + outTempCU->m_totalBits += m_entropyCoder->getNumberOfWrittenBits(); // split bits + } + if (depth == slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP) + { + bool hasResidual = false; + for (uint32_t blkIdx = 0; blkIdx < outTempCU->getTotalNumPart(); blkIdx++) + { + if (outTempCU->getCbf(blkIdx, TEXT_LUMA) || outTempCU->getCbf(blkIdx, TEXT_CHROMA_U) || + outTempCU->getCbf(blkIdx, TEXT_CHROMA_V)) + { + hasResidual = true; + break; + } + } + + uint32_t targetPartIdx = 0; + if (hasResidual) + { + bool foundNonZeroCbf = false; + outTempCU->setQPSubCUs(outTempCU->getRefQP(targetPartIdx), outTempCU, 0, depth, foundNonZeroCbf); + X265_CHECK(foundNonZeroCbf, "expected to find non-zero CBF\n"); + } + else + 
outTempCU->setQPSubParts(outTempCU->getRefQP(targetPartIdx), 0, depth); // set QP to default QP + } + m_rdEntropyCoders[nextDepth][CI_NEXT_BEST].store(m_rdEntropyCoders[depth][CI_TEMP_BEST]); + checkBestMode(outBestCU, outTempCU, depth); + } + outBestCU->copyToPic(depth); + copyYuv2Pic(pic, outBestCU->getAddr(), outBestCU->getZorderIdxInCU(), depth); + +#if CHECKED_BUILD || _DEBUG + X265_CHECK(outBestCU->getPartitionSize(0) != SIZE_NONE, "no best partition size\n"); + X265_CHECK(outBestCU->getPredictionMode(0) != MODE_NONE, "no best partition mode\n"); + if (m_rdCost.m_psyRd) + { + X265_CHECK(outBestCU->m_totalPsyCost != MAX_INT64, "no best partition cost\n"); + } + else + { + X265_CHECK(outBestCU->m_totalRDCost != MAX_INT64, "no best partition cost\n"); + } +#endif +} + +void Analysis::checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu, uint8_t* sharedModes) { //PPAScopeEvent(CheckRDCostIntra + depth); uint32_t depth = g_log2Size[m_param->maxCUSize] - cu->log2CUSize; @@ -544,7 +689,10 @@ uint32_t tuDepthRange[2]; outTempCU->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0); - estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange); + if (sharedModes) + sharedEstIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange, sharedModes); + else + estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], tuDepthRange); estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]); diff -r 1de67321275e -r 717ea14104cf source/encoder/analysis.h --- a/source/encoder/analysis.h Mon Sep 15 15:00:13 2014 +0200 +++ b/source/encoder/analysis.h Tue Sep 16 16:50:56 2014 +0530 @@ -110,7 +110,8 @@ /* Warning: The interface for these functions will undergo significant changes as a major refactor is under progress */ void 
compressIntraCU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu); - void checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu); + void checkIntra(TComDataCU*& outBestCU, TComDataCU*& outTempCU, PartSize partSize, CU *cu, uint8_t* sharedModes); + void compressSharedIntraCTU(TComDataCU*& outBestCU, TComDataCU*& outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu, uint8_t* sharedDepth, char* sharedPartSizes, uint8_t* sharedModes, uint32_t &zOrder); void compressInterCU_rd0_4(TComDataCU*& outBestCU, TComDataCU*& outTempCU, TComDataCU* cu, uint32_t depth, TComDataCU* cuPicsym, CU *cu_t, int bInsidePicture, uint32_t partitionIndex, uint32_t minDepth); diff -r 1de67321275e -r 717ea14104cf source/encoder/search.cpp --- a/source/encoder/search.cpp Mon Sep 15 15:00:13 2014 +0200 +++ b/source/encoder/search.cpp Tue Sep 16 16:50:56 2014 +0530 @@ -1393,6 +1393,61 @@ x265_emms(); } +void Search::sharedEstIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes) +{ + uint32_t depth = cu->getDepth(0); + uint32_t initTrDepth = cu->getPartitionSize(0) == SIZE_2Nx2N ? 
0 : 1; + uint32_t numPU = 1 << (2 * initTrDepth); + uint32_t log2TrSize = cu->getLog2CUSize(0) - initTrDepth; + uint32_t qNumParts = cu->getTotalNumPart() >> 2; + + // loop over partitions + uint32_t partOffset = 0; + uint64_t puCost = 0; + uint32_t bits = 0; + uint32_t dststride = cu->m_pic->getPicYuvRec()->getStride(); + uint32_t srcstride = reconYuv->getStride(); + + for (uint32_t pu = 0; pu < numPU; pu++, partOffset += qNumParts) + { + cu->setLumaIntraDirSubParts(sharedModes[pu], partOffset, depth + initTrDepth); + + // set context models + m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]); + + // update overall distortion (rate and r-d costs are determined later) + cu->m_totalDistortion += xRecurIntraCodingQT(cu, initTrDepth, partOffset, fencYuv, predYuv, resiYuv, true, puCost, bits, depthRange); + xSetIntraResultQT(cu, initTrDepth, partOffset, reconYuv); + + if (pu != numPU - 1) + { + uint32_t zorder = cu->getZorderIdxInCU() + partOffset; + pixel* dst = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder); + pixel* src = reconYuv->getLumaAddr(partOffset); + primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride); + } + + // update PU data + cu->setLumaIntraDirSubParts(sharedModes[pu], partOffset, depth + initTrDepth); + cu->copyToPic((uint8_t)depth, pu, initTrDepth); + } + + if (numPU > 1) + { + // set Cbf for all blocks + uint32_t combCbfY = 0; + uint32_t partIdx = 0; + for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts) + combCbfY |= cu->getCbf(partIdx, TEXT_LUMA, 1); + + for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) + cu->getCbf(TEXT_LUMA)[offs] |= combCbfY; + } + + // reset context models + m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]); +} + void Search::getBestIntraModeChroma(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv) { uint32_t depth = cu->getDepth(0); diff -r 1de67321275e -r 717ea14104cf source/encoder/search.h --- a/source/encoder/search.h Mon Sep 15 15:00:13 2014 +0200 
+++ b/source/encoder/search.h Tue Sep 16 16:50:56 2014 +0530 @@ -80,6 +80,7 @@ bool initSearch(x265_param *param, ScalingList& scalingList); void estIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2]); + void sharedEstIntraPredQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv, uint32_t depthRange[2], uint8_t* sharedModes); void estIntraPredChromaQT(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, TComYuv* reconYuv); // estimation inter prediction (non-skip) _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel