Re: [x265] [PATCH 2 of 3] rc: accumulate mv bits, coeff bits per frame
Thanks, can you merge with the latest tip? On Fri, Jun 27, 2014 at 12:13 AM, Aarthi Priya Thirumalai aar...@multicorewareinc.com wrote: # HG changeset patch # User Aarthi Thirumalaiaar...@multicorewareinc.com # Date 1403808050 -19800 # Fri Jun 27 00:10:50 2014 +0530 # Node ID 11ddf73017d44933090a8943f4fc5098b231b56d # Parent 1b669c33ff3a8d8f6c9bd1e18979c009baed2433 rc: accumulate mv bits, coeff bits per frame diff -r 1b669c33ff3a -r 11ddf73017d4 source/Lib/TLibEncoder/TEncCu.cpp --- a/source/Lib/TLibEncoder/TEncCu.cpp Wed Jun 25 22:46:45 2014 +0530 +++ b/source/Lib/TLibEncoder/TEncCu.cpp Fri Jun 27 00:10:50 2014 +0530 @@ -1365,12 +1365,14 @@ m_entropyCoder-encodePredMode(outTempCU, 0); m_entropyCoder-encodePartSize(outTempCU, 0, depth); m_entropyCoder-encodePredInfo(outTempCU, 0); +outTempCU-m_mvBits = m_entropyCoder-getNumberOfWrittenBits(); // Encode Coefficients bool bCodeDQP = getdQPFlag(); m_entropyCoder-encodeCoeff(outTempCU, 0, depth, outTempCU-getCUSize(0), bCodeDQP); m_rdGoOnSbacCoder-store(m_rdSbacCoders[depth][CI_TEMP_BEST]); outTempCU-m_totalBits = m_entropyCoder-getNumberOfWrittenBits(); +outTempCU-m_coeffBits = outTempCU-m_totalBits - outTempCU-m_mvBits; if (m_rdCost-psyRdEnabled()) { @@ -1411,12 +1413,14 @@ m_entropyCoder-encodePredMode(outTempCU, 0); m_entropyCoder-encodePartSize(outTempCU, 0, depth); m_entropyCoder-encodePredInfo(outTempCU, 0); +outTempCU-m_mvBits = m_entropyCoder-getNumberOfWrittenBits(); // Encode Coefficients bool bCodeDQP = getdQPFlag(); m_entropyCoder-encodeCoeff(outTempCU, 0, depth, outTempCU-getCUSize(0), bCodeDQP); m_rdGoOnSbacCoder-store(m_rdSbacCoders[depth][CI_TEMP_BEST]); outTempCU-m_totalBits = m_entropyCoder-getNumberOfWrittenBits(); +outTempCU-m_coeffBits = outTempCU-m_totalBits - outTempCU-m_mvBits; if (m_rdCost-psyRdEnabled()) { diff -r 1b669c33ff3a -r 11ddf73017d4 source/Lib/TLibEncoder/TEncSearch.cpp --- a/source/Lib/TLibEncoder/TEncSearch.cpp Wed Jun 25 22:46:45 2014 +0530 +++ 
b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Jun 27 00:10:50 2014 +0530 @@ -4059,6 +4059,7 @@ } m_entropyCoder-encodeSkipFlag(cu, 0); m_entropyCoder-encodeMergeIndex(cu, 0); +cu-m_mvBits = m_entropyCoder-getNumberOfWrittenBits(); return m_entropyCoder-getNumberOfWrittenBits(); } else @@ -4073,8 +4074,11 @@ m_entropyCoder-encodePartSize(cu, 0, cu-getDepth(0)); m_entropyCoder-encodePredInfo(cu, 0); bool bDummy = false; +cu-m_mvBits = m_entropyCoder-getNumberOfWrittenBits(); m_entropyCoder-encodeCoeff(cu, 0, cu-getDepth(0), cu-getCUSize(0), bDummy); -return m_entropyCoder-getNumberOfWrittenBits(); +int totalBits = m_entropyCoder-getNumberOfWrittenBits(); +cu-m_coeffBits = totalBits - cu-m_mvBits; +return totalBits; } } diff -r 1b669c33ff3a -r 11ddf73017d4 source/encoder/compress.cpp --- a/source/encoder/compress.cpp Wed Jun 25 22:46:45 2014 +0530 +++ b/source/encoder/compress.cpp Fri Jun 27 00:10:50 2014 +0530 @@ -63,6 +63,7 @@ m_entropyCoder-encodePredMode(cu, 0); m_entropyCoder-encodePartSize(cu, 0, depth); m_entropyCoder-encodePredInfo(cu, 0); +cu-m_mvBits += m_entropyCoder-getNumberOfWrittenBits(); // Encode Coefficients bool bCodeDQP = getdQPFlag(); @@ -71,6 +72,7 @@ m_rdGoOnSbacCoder-store(m_rdSbacCoders[depth][CI_TEMP_BEST]); cu-m_totalBits = m_entropyCoder-getNumberOfWrittenBits(); +cu-m_coeffBits = cu-m_totalBits - cu-m_mvBits; if (m_rdCost-psyRdEnabled()) { int part = g_convertToBit[cu-getCUSize(0)]; diff -r 1b669c33ff3a -r 11ddf73017d4 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Wed Jun 25 22:46:45 2014 +0530 +++ b/source/encoder/frameencoder.cpp Fri Jun 27 00:10:50 2014 +0530 @@ -694,6 +694,11 @@ // Store probabilities of second LCU in line into buffer if (col == 1 m_param-bEnableWavefront) getBufferSBac(lin)-loadContexts(getSbacCoder(subStrm)); + +// Collect Frame Stats for 2 pass +m_frame-m_stats.mvBits += cu-m_mvBits; +m_frame-m_stats.coeffBits += cu-m_coeffBits; +m_frame-m_stats.miscBits += cu-m_totalBits - (cu-m_mvBits + 
cu-m_coeffBits); } if (slice-getPPS()-getCabacInitPresentFlag()) On Thu, Jun 26, 2014 at 4:25 PM, Deepthi Nandakumar deep...@multicorewareinc.com wrote: Pls fix extra newlines and whitespace nits. On Wed, Jun 25, 2014 at 10:54 PM, aar...@multicorewareinc.com wrote: # HG changeset patch # User Aarthi Thirumalaiaar...@multicorewareinc.com # Date 1403716735 -19800 # Wed Jun 25 22:48:55 2014 +0530 # Node ID 0995efabd44470c1192994e1aceeb40ae606467f # Parent e71e34d02de228eab43edf1910a71a44417d rc:
[x265] [PATCH] psyrd: fix for inconsistent output
# HG changeset patch # User Sumalatha Polureddy <sumala...@multicorewareinc.com> # Date 1403858781 -19800 # Node ID a789870889fcc9a31deff7fc6961d143b0db86c1 # Parent 1b669c33ff3a8d8f6c9bd1e18979c009baed2433 psyrd: fix for inconsistent output The maximum buffer size for zeroPel is MAX_CU_SIZE x MAX_CU_SIZE. Since the stride was wrong, it was accessing out-of-bounds memory, which was different for each run, resulting in inconsistent output. diff -r 1b669c33ff3a -r a789870889fc source/Lib/TLibEncoder/TEncSearch.cpp --- a/source/Lib/TLibEncoder/TEncSearch.cpp Wed Jun 25 22:46:45 2014 +0530 +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Jun 27 14:16:21 2014 +0530 @@ -3035,7 +3035,7 @@ { int size = g_convertToBit[trSize]; psyEnergyY = m_rdCost->psyCost(size, fencYuv->getLumaAddr(absPartIdx), fencYuv->getStride(), - (pixel*)RDCost::zeroPel, cu->getPic()->getPicYuvRec()->getStride()); // need to check whether zero distortion is similar to psyenergy of fenc +(pixel*)RDCost::zeroPel, trSize); // need to check whether zero distortion is similar to psyenergy of fenc } int16_t *curResiY = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx); X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width not full CU\n"); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] fix emms: move selectLambda() into xRateDistOptQuant() and issue emms before it
# HG changeset patch # User Satoshi Nakagawa nakagawa...@oki.com # Date 1403877807 -32400 # Fri Jun 27 23:03:27 2014 +0900 # Node ID 77f443fe169ca23969df5d5ee6968543bfa5e794 # Parent 32aa6cc3cf4d108ac92f5d29258b2c38ca888d29 fix emms: move selectLambda() into xRateDistOptQuant() and issue emms before it diff -r 32aa6cc3cf4d -r 77f443fe169c source/Lib/TLibCommon/TComTrQuant.cpp --- a/source/Lib/TLibCommon/TComTrQuant.cpp Thu Jun 26 17:19:08 2014 -0700 +++ b/source/Lib/TLibCommon/TComTrQuant.cpp Fri Jun 27 23:03:27 2014 +0900 @@ -508,6 +508,9 @@ uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t trSize, TextType ttype, uint32_t absPartIdx, int32_t *lastPos) { +x265_emms(); +selectLambda(ttype); + const uint32_t log2TrSize = g_convertToBit[trSize] + 2; uint32_t absSum = 0; int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform diff -r 32aa6cc3cf4d -r 77f443fe169c source/Lib/TLibEncoder/TEncSearch.cpp --- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Jun 26 17:19:08 2014 -0700 +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Jun 27 23:03:27 2014 +0900 @@ -428,8 +428,6 @@ int chFmt = cu-getChromaFormat(); m_trQuant-setQPforQuant(cu-getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt); -m_trQuant-selectLambda(TEXT_LUMA); - absSum = m_trQuant-transformNxN(cu, residual, stride, coeff, tuSize, TEXT_LUMA, absPartIdx, lastPos, useTransformSkip); //--- set coded block flag --- @@ -515,8 +513,6 @@ curChromaQpOffset = cu-getSlice()-getPPS()-getChromaCrQpOffset() + cu-getSlice()-getSliceQpDeltaCr(); } m_trQuant-setQPforQuant(cu-getQP(0), TEXT_CHROMA, cu-getSlice()-getSPS()-getQpBDOffsetC(), curChromaQpOffset, chFmt); -m_trQuant-selectLambda(ttype); - absSum = m_trQuant-transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdx, lastPos, useTransformSkipC); //--- set coded block flag --- @@ -905,7 +901,6 @@ int lastPos = -1; m_trQuant-setQPforQuant(cu-getQP(0), TEXT_LUMA, 
QP_BD_OFFSET, 0, chFmt); -m_trQuant-selectLambda(TEXT_LUMA); absSum = m_trQuant-transformNxN(cu, residual, stride, coeff, tuSize, TEXT_LUMA, absPartIdx, lastPos, useTransformSkip); //--- set coded block flag --- @@ -1421,8 +1416,6 @@ curChromaQpOffset = cu-getSlice()-getPPS()-getChromaCrQpOffset() + cu-getSlice()-getSliceQpDeltaCr(); } m_trQuant-setQPforQuant(cu-getQP(0), TEXT_CHROMA, cu-getSlice()-getSPS()-getQpBDOffsetC(), curChromaQpOffset, chFmt); -m_trQuant-selectLambda(ttype); - absSum = m_trQuant-transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdxC, lastPos, useTransformSkipC); //--- set coded block flag --- @@ -2702,13 +2695,11 @@ cu-setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth); -m_trQuant-setQPforQuant(cu-getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt); -m_trQuant-selectLambda(TEXT_LUMA); - int16_t *curResiY = resiYuv-getLumaAddr(absPartIdx); const uint32_t strideResiY = resiYuv-m_width; const uint32_t strideResiC = resiYuv-m_cwidth; +m_trQuant-setQPforQuant(cu-getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt); absSumY = m_trQuant-transformNxN(cu, curResiY, strideResiY, coeffCurY, trSize, TEXT_LUMA, absPartIdx, lastPosY, false, curuseRDOQ); @@ -2746,13 +2737,11 @@ int curChromaQpOffset = cu-getSlice()-getPPS()-getChromaCbQpOffset() + cu-getSlice()-getSliceQpDeltaCb(); m_trQuant-setQPforQuant(cu-getQP(0), TEXT_CHROMA, cu-getSlice()-getSPS()-getQpBDOffsetC(), curChromaQpOffset, chFmt); -m_trQuant-selectLambda(TEXT_CHROMA_U); absSumU = m_trQuant-transformNxN(cu, curResiU, strideResiC, coeffCurU + subTUBufferOffset, trSizeC, TEXT_CHROMA_U, absPartIdxC, lastPosU, false, curuseRDOQ); curChromaQpOffset = cu-getSlice()-getPPS()-getChromaCrQpOffset() + cu-getSlice()-getSliceQpDeltaCr(); m_trQuant-setQPforQuant(cu-getQP(0), TEXT_CHROMA, cu-getSlice()-getSPS()-getQpBDOffsetC(), curChromaQpOffset, chFmt); -m_trQuant-selectLambda(TEXT_CHROMA_V); absSumV = m_trQuant-transformNxN(cu, curResiV, strideResiC, coeffCurV + subTUBufferOffset, trSizeC, 
TEXT_CHROMA_V, absPartIdxC, lastPosV, false, curuseRDOQ); @@ -2915,8 +2904,6 @@ } m_trQuant-setQPforQuant(cu-getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt); -m_trQuant-selectLambda(TEXT_LUMA); - absSum[TEXT_LUMA][0] = m_trQuant-transformNxN(cu,
Re: [x265] [PATCH 1 of 2] improve count_nonzero by SSSE3
On 6/27/2014 4:05 PM, chen wrote: "I can't understand what you mean. Could you tell me more? I use some SSSE3 instructions and process 16 pixels every loop." I meant keeping both the SSE2 and SSSE3 variants. Not sure whether the x86inc.asm macros help with this or not. - Derek ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH 1 of 2] improve count_nonzero by SSSE3
On 6/27/2014 6:08 PM, chen wrote: "I use the SSSE3 instruction PSHUFB to replace 3 SSE2 instructions; the x86inc macro can't handle it. After the patch, this function is ~20% faster and codeCoeffNxN gets a ~7% speedup, so I don't worry about old CPUs' performance." I guess SSSE3 is very prevalent nowadays -- though I am still not a fan of throwing away variants, I guess it's reasonable in this case. - Derek ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel