Below are the performance test results on Haswell, with and without depth-search applied to pmode.
*preset VERYSLOW* *Before* D:\ashok>x265_b.exe --input \\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset veryslow --hash=1 --no-info --psnr --ssim -o test_b.hevc --pmode encoded 500 frames in 901.20s (0.55 fps), 8813.94 kb/s, Avg QP:37.97, Global PSNR: 30.369, SSIM Mean Y: 0.8159502 ( 7.351 dB) *After* D:\ashok>x265_a.exe --input \\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset veryslow --hash=1 --no-info --psnr --ssim -o test_a.hevc --pmode --limit-refs 1 encoded 500 frames in 632.76s (0.79 fps), 8666.79 kb/s, Avg QP:37.90, Global PSNR: 30.311, SSIM Mean Y: 0.8134978 ( 7.293 dB) *preset SLOWER* *Before* D:\ashok>x265_b.exe --input \\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset slower --hash=1 --no-info --psnr --ssim -o test_b.hevc --pmode encoded 500 frames in 585.06s (0.85 fps), 8796.19 kb/s, Avg QP:38.03, Global PSNR: 30.356, SSIM Mean Y: 0.8153866 ( 7.337 dB) *After* D:\ashok>x265_a.exe --input \\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset slower --hash=1 --no-info --psnr --ssim -o test_a.hevc --pmode --limit-refs 1 encoded 500 frames in 439.82s (1.14 fps), 8740.25 kb/s, Avg QP:37.99, Global PSNR: 30.334, SSIM Mean Y: 0.8144196 ( 7.315 dB) *preset SLOW* *Before* D:\ashok>x265_b.exe --input \\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset slow --hash=1 --no-info --psnr --ssim -o test_b.hevc --pmode encoded 500 frames in 148.11s (3.38 fps), 8764.71 kb/s, Avg QP:38.08, Global PSNR: 30.282, SSIM Mean Y: 0.8124724 ( 7.269 dB) *After* D:\ashok>x265_a.exe --input \\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset slow --hash=1 --no-info --psnr --ssim -o test_a.hevc --pmode --limit-refs 1 encoded 500 frames in 110.45s (4.53 fps), 8712.04 kb/s, Avg QP:38.04, Global PSNR: 30.265, SSIM Mean Y: 0.8117029 ( 7.252 dB) *preset MEDIUM* *Before* D:\ashok>x265_b.exe --input \\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset medium --hash=1 --no-info --psnr --ssim -o 
test_b.hevc --pmode encoded 500 frames in 67.01s (7.46 fps), 8975.61 kb/s, Avg QP:37.97, Global PSNR: 30.076, SSIM Mean Y: 0.8040911 ( 7.079 dB) *After* D:\ashok>x265_a.exe --input \\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset medium --hash=1 --no-info --psnr --ssim -o test_a.hevc --pmode --limit-refs 1 encoded 500 frames in 37.56s (13.31 fps), 8954.61 kb/s, Avg QP:37.96, Global PSNR: 30.070, SSIM Mean Y: 0.8038041 ( 7.073 dB) On Fri, Aug 14, 2015 at 4:27 PM, Ashok Kumar Mishra < [email protected]> wrote: > Yes, performance has improved, will send after some time for all presets. > > On Fri, Aug 14, 2015 at 4:20 PM, Steve Borho <[email protected]> wrote: > >> On 08/14, [email protected] wrote: >> > # HG changeset patch >> > # User Ashok Kumar Mishra<[email protected]> >> > # Date 1439540228 -19800 >> > # Fri Aug 14 13:47:08 2015 +0530 >> > # Node ID 9e26bef14543025908ed979b3d217417baf1ac8f >> > # Parent d56b2466c04459205287e1581d8a36eebf372ba6 >> > analysis: re-order analysis to do splits before ME or intra for pmode >> >> I'm happy to see these patches. They look good, do you have any example >> before/after performance and compression numbers? 
>> >> > diff -r d56b2466c044 -r 9e26bef14543 source/encoder/analysis.cpp >> > --- a/source/encoder/analysis.cpp Wed Aug 12 18:12:20 2015 +0530 >> > +++ b/source/encoder/analysis.cpp Fri Aug 14 13:47:08 2015 +0530 >> > @@ -505,16 +505,82 @@ >> > >> > X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not >> support RD 0 or 1\n"); >> > >> > + PMODE pmode(*this, cuGeom); >> > + >> > + if (mightNotSplit && depth >= minDepth) >> > + { >> > + /* Initialize all prediction CUs based on parentCTU */ >> > + md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); >> > + md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); >> > + >> > + if (m_param->rdLevel <= 4) >> > + checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], >> md.pred[PRED_MERGE], cuGeom); >> > + else >> > + checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], >> md.pred[PRED_MERGE], cuGeom, false); >> > + } >> > + >> > + bool bNoSplit = false; >> > + if (md.bestMode) >> > + { >> > + bNoSplit = md.bestMode->cu.isSkipped(0); >> > + if (mightSplit && depth && depth >= minDepth && !bNoSplit && >> m_param->rdLevel <= 4) >> > + bNoSplit = recursionDepthCheck(parentCTU, cuGeom, >> *md.bestMode); >> > + } >> > + >> > + if (mightSplit && !bNoSplit) >> > + { >> > + Mode* splitPred = &md.pred[PRED_SPLIT]; >> > + splitPred->initCosts(); >> > + CUData* splitCU = &splitPred->cu; >> > + splitCU->initSubCU(parentCTU, cuGeom, qp); >> > + >> > + uint32_t nextDepth = depth + 1; >> > + ModeDepth& nd = m_modeDepth[nextDepth]; >> > + invalidateContexts(nextDepth); >> > + Entropy* nextContext = &m_rqt[depth].cur; >> > + int nextQP = qp; >> > + >> > + for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) >> > + { >> > + const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + >> subPartIdx); >> > + if (childGeom.flags & CUGeom::PRESENT) >> > + { >> > + m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, >> childGeom.absPartIdx); >> > + m_rqt[nextDepth].cur.load(*nextContext); >> > + >> > + if (m_slice->m_pps->bUseDQP && nextDepth <= >> 
m_slice->m_pps->maxCuDQPDepth) >> > + nextQP = setLambdaFromQP(parentCTU, >> calculateQpforCuSize(parentCTU, childGeom)); >> > + >> > + compressInterCU_dist(parentCTU, childGeom, nextQP); >> > + >> > + // Save best CU and pred data for this sub CU >> > + splitCU->copyPartFrom(nd.bestMode->cu, childGeom, >> subPartIdx); >> > + splitPred->addSubCosts(*nd.bestMode); >> > + >> > + >> nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, >> childGeom.numPartitions * subPartIdx); >> > + nextContext = &nd.bestMode->contexts; >> > + } >> > + else >> > + splitCU->setEmptyPart(childGeom, subPartIdx); >> > + } >> > + nextContext->store(splitPred->contexts); >> > + >> > + if (mightNotSplit) >> > + addSplitFlagCost(*splitPred, cuGeom.depth); >> > + else >> > + updateModeCost(*splitPred); >> > + >> > + checkDQPForSplitPred(*splitPred, cuGeom); >> > + } >> > + >> > if (mightNotSplit && depth >= minDepth) >> > { >> > int bTryAmp = m_slice->m_sps->maxAMPDepth > depth; >> > int bTryIntra = m_slice->m_sliceType != B_SLICE || >> m_param->bIntraInBFrames; >> > >> > - PMODE pmode(*this, cuGeom); >> > + if (m_slice->m_pps->bUseDQP && depth <= >> m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0) >> > + setLambdaFromQP(parentCTU, qp); >> > >> > - /* Initialize all prediction CUs based on parentCTU */ >> > - md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); >> > - md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); >> > if (bTryIntra) >> > { >> > md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp); >> > @@ -548,8 +614,6 @@ >> > >> > if (m_param->rdLevel <= 4) >> > { >> > - checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], >> md.pred[PRED_MERGE], cuGeom); >> > - >> > { >> > ProfileCUScope(parentCTU, pmodeBlockTime, >> countPModeMasters); >> > pmode.waitForExit(); >> > @@ -632,14 +696,13 @@ >> > } >> > else >> > { >> > - checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], >> md.pred[PRED_MERGE], cuGeom, false); >> > { >> > ProfileCUScope(parentCTU, pmodeBlockTime, >> 
countPModeMasters); >> > pmode.waitForExit(); >> > } >> > >> > checkBestMode(md.pred[PRED_2Nx2N], depth); >> > - if (m_slice->m_sliceType == B_SLICE) >> > + if (m_slice->m_sliceType == B_SLICE && >> md.pred[PRED_BIDIR].sa8dCost < MAX_INT64) >> > checkBestMode(md.pred[PRED_BIDIR], depth); >> > >> > if (m_param->bEnableRectInter) >> > @@ -664,14 +727,6 @@ >> > } >> > } >> > >> > - if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra) >> > - { >> > - md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp); >> > - checkIntraInInter(md.pred[PRED_INTRA], cuGeom); >> > - encodeIntraInInter(md.pred[PRED_INTRA], cuGeom); >> > - checkBestMode(md.pred[PRED_INTRA], depth); >> > - } >> > - >> > if (m_bTryLossless) >> > tryLossless(cuGeom); >> > >> > @@ -679,60 +734,9 @@ >> > addSplitFlagCost(*md.bestMode, cuGeom.depth); >> > } >> > >> > - bool bNoSplit = false; >> > - if (md.bestMode) >> > - { >> > - bNoSplit = md.bestMode->cu.isSkipped(0); >> > - if (mightSplit && depth && depth >= minDepth && !bNoSplit && >> m_param->rdLevel <= 4) >> > - bNoSplit = recursionDepthCheck(parentCTU, cuGeom, >> *md.bestMode); >> > - } >> > - >> > - if (mightSplit && !bNoSplit) >> > - { >> > - Mode* splitPred = &md.pred[PRED_SPLIT]; >> > - splitPred->initCosts(); >> > - CUData* splitCU = &splitPred->cu; >> > - splitCU->initSubCU(parentCTU, cuGeom, qp); >> > - >> > - uint32_t nextDepth = depth + 1; >> > - ModeDepth& nd = m_modeDepth[nextDepth]; >> > - invalidateContexts(nextDepth); >> > - Entropy* nextContext = &m_rqt[depth].cur; >> > - int nextQP = qp; >> > - >> > - for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) >> > - { >> > - const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + >> subPartIdx); >> > - if (childGeom.flags & CUGeom::PRESENT) >> > - { >> > - m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, >> childGeom.absPartIdx); >> > - m_rqt[nextDepth].cur.load(*nextContext); >> > - >> > - if (m_slice->m_pps->bUseDQP && nextDepth <= >> m_slice->m_pps->maxCuDQPDepth) >> > - nextQP 
= setLambdaFromQP(parentCTU, >> calculateQpforCuSize(parentCTU, childGeom)); >> > - >> > - compressInterCU_dist(parentCTU, childGeom, nextQP); >> > - >> > - // Save best CU and pred data for this sub CU >> > - splitCU->copyPartFrom(nd.bestMode->cu, childGeom, >> subPartIdx); >> > - splitPred->addSubCosts(*nd.bestMode); >> > - >> > - >> nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, >> childGeom.numPartitions * subPartIdx); >> > - nextContext = &nd.bestMode->contexts; >> > - } >> > - else >> > - splitCU->setEmptyPart(childGeom, subPartIdx); >> > - } >> > - nextContext->store(splitPred->contexts); >> > - >> > - if (mightNotSplit) >> > - addSplitFlagCost(*splitPred, cuGeom.depth); >> > - else >> > - updateModeCost(*splitPred); >> > - >> > - checkDQPForSplitPred(*splitPred, cuGeom); >> > - checkBestMode(*splitPred, depth); >> > - } >> > + /* compare split RD cost against best cost */ >> > + if (mightSplit && !bNoSplit) >> > + checkBestMode(md.pred[PRED_SPLIT], depth); >> > >> > if (mightNotSplit) >> > { >> > @@ -746,8 +750,7 @@ >> > >> > /* Copy best data to encData CTU and recon */ >> > md.bestMode->cu.copyToPic(depth); >> > - if (md.bestMode != &md.pred[PRED_SPLIT]) >> > - md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, >> cuAddr, cuGeom.absPartIdx); >> > + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, >> cuGeom.absPartIdx); >> > } >> > >> > uint32_t Analysis::compressInterCU_rd0_4(const CUData& parentCTU, >> const CUGeom& cuGeom, int32_t qp) >> > _______________________________________________ >> > x265-devel mailing list >> > [email protected] >> > https://mailman.videolan.org/listinfo/x265-devel >> >> -- >> Steve Borho >> _______________________________________________ >> x265-devel mailing list >> [email protected] >> https://mailman.videolan.org/listinfo/x265-devel >> > >
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
