Re: [x265] [PATCH 2 of 3] rc: accumulate mv bits, coeff bits per frame

2014-06-27 Thread Deepthi Nandakumar
Thanks, can you merge with the latest tip?


On Fri, Jun 27, 2014 at 12:13 AM, Aarthi Priya Thirumalai 
aar...@multicorewareinc.com wrote:

 # HG changeset patch
 # User Aarthi Thirumalai <aar...@multicorewareinc.com>
 # Date 1403808050 -19800
 #  Fri Jun 27 00:10:50 2014 +0530
 # Node ID 11ddf73017d44933090a8943f4fc5098b231b56d
 # Parent  1b669c33ff3a8d8f6c9bd1e18979c009baed2433
 rc: accumulate mv bits, coeff bits per frame

 diff -r 1b669c33ff3a -r 11ddf73017d4 source/Lib/TLibEncoder/TEncCu.cpp
  --- a/source/Lib/TLibEncoder/TEncCu.cpp Wed Jun 25 22:46:45 2014 +0530
 +++ b/source/Lib/TLibEncoder/TEncCu.cpp Fri Jun 27 00:10:50 2014 +0530
 @@ -1365,12 +1365,14 @@
  m_entropyCoder-encodePredMode(outTempCU, 0);
  m_entropyCoder-encodePartSize(outTempCU, 0, depth);
  m_entropyCoder-encodePredInfo(outTempCU, 0);
  +outTempCU-m_mvBits = m_entropyCoder-getNumberOfWrittenBits();

  // Encode Coefficients
  bool bCodeDQP = getdQPFlag();
  m_entropyCoder-encodeCoeff(outTempCU, 0, depth,
 outTempCU-getCUSize(0), bCodeDQP);
  m_rdGoOnSbacCoder-store(m_rdSbacCoders[depth][CI_TEMP_BEST]);
  outTempCU-m_totalBits = m_entropyCoder-getNumberOfWrittenBits();
 +outTempCU-m_coeffBits = outTempCU-m_totalBits - outTempCU-m_mvBits;

  if (m_rdCost-psyRdEnabled())
  {
 @@ -1411,12 +1413,14 @@
  m_entropyCoder-encodePredMode(outTempCU, 0);
  m_entropyCoder-encodePartSize(outTempCU, 0, depth);
  m_entropyCoder-encodePredInfo(outTempCU, 0);
 +outTempCU-m_mvBits = m_entropyCoder-getNumberOfWrittenBits();

  // Encode Coefficients
  bool bCodeDQP = getdQPFlag();
  m_entropyCoder-encodeCoeff(outTempCU, 0, depth,
 outTempCU-getCUSize(0), bCodeDQP);
  m_rdGoOnSbacCoder-store(m_rdSbacCoders[depth][CI_TEMP_BEST]);
  outTempCU-m_totalBits = m_entropyCoder-getNumberOfWrittenBits();
 +outTempCU-m_coeffBits = outTempCU-m_totalBits - outTempCU-m_mvBits;

  if (m_rdCost-psyRdEnabled())
  {
 diff -r 1b669c33ff3a -r 11ddf73017d4 source/Lib/TLibEncoder/TEncSearch.cpp
  --- a/source/Lib/TLibEncoder/TEncSearch.cpp Wed Jun 25 22:46:45 2014
 +0530
 +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Jun 27 00:10:50 2014 +0530
 @@ -4059,6 +4059,7 @@
  }
  m_entropyCoder-encodeSkipFlag(cu, 0);
  m_entropyCoder-encodeMergeIndex(cu, 0);
 +cu-m_mvBits = m_entropyCoder-getNumberOfWrittenBits();
  return m_entropyCoder-getNumberOfWrittenBits();
  }
  else
 @@ -4073,8 +4074,11 @@
  m_entropyCoder-encodePartSize(cu, 0, cu-getDepth(0));
  m_entropyCoder-encodePredInfo(cu, 0);
  bool bDummy = false;
 +cu-m_mvBits = m_entropyCoder-getNumberOfWrittenBits();
  m_entropyCoder-encodeCoeff(cu, 0, cu-getDepth(0),
 cu-getCUSize(0), bDummy);
 -return m_entropyCoder-getNumberOfWrittenBits();
 +int totalBits = m_entropyCoder-getNumberOfWrittenBits();
 +cu-m_coeffBits = totalBits - cu-m_mvBits;
  +return totalBits;
  }
  }

 diff -r 1b669c33ff3a -r 11ddf73017d4 source/encoder/compress.cpp
 --- a/source/encoder/compress.cpp Wed Jun 25 22:46:45 2014 +0530
 +++ b/source/encoder/compress.cpp Fri Jun 27 00:10:50 2014 +0530
 @@ -63,6 +63,7 @@
  m_entropyCoder-encodePredMode(cu, 0);
  m_entropyCoder-encodePartSize(cu, 0, depth);
  m_entropyCoder-encodePredInfo(cu, 0);
 +cu-m_mvBits += m_entropyCoder-getNumberOfWrittenBits();

  // Encode Coefficients
  bool bCodeDQP = getdQPFlag();
 @@ -71,6 +72,7 @@
  m_rdGoOnSbacCoder-store(m_rdSbacCoders[depth][CI_TEMP_BEST]);

  cu-m_totalBits = m_entropyCoder-getNumberOfWrittenBits();
 +cu-m_coeffBits = cu-m_totalBits - cu-m_mvBits;
  if (m_rdCost-psyRdEnabled())
  {
  int part = g_convertToBit[cu-getCUSize(0)];
 diff -r 1b669c33ff3a -r 11ddf73017d4 source/encoder/frameencoder.cpp
 --- a/source/encoder/frameencoder.cpp Wed Jun 25 22:46:45 2014 +0530
 +++ b/source/encoder/frameencoder.cpp Fri Jun 27 00:10:50 2014 +0530
 @@ -694,6 +694,11 @@
  // Store probabilities of second LCU in line into buffer
  if (col == 1  m_param-bEnableWavefront)
  getBufferSBac(lin)-loadContexts(getSbacCoder(subStrm));
 +
 +// Collect Frame Stats for 2 pass
 +m_frame-m_stats.mvBits += cu-m_mvBits;
 +m_frame-m_stats.coeffBits += cu-m_coeffBits;
 +m_frame-m_stats.miscBits += cu-m_totalBits - (cu-m_mvBits +
 cu-m_coeffBits);
  }

  if (slice-getPPS()-getCabacInitPresentFlag())


 On Thu, Jun 26, 2014 at 4:25 PM, Deepthi Nandakumar 
 deep...@multicorewareinc.com wrote:

 Pls fix extra newlines and whitespace nits.


 On Wed, Jun 25, 2014 at 10:54 PM, aar...@multicorewareinc.com wrote:

 # HG changeset patch
 # User Aarthi Thirumalai <aar...@multicorewareinc.com>
 # Date 1403716735 -19800
 #  Wed Jun 25 22:48:55 2014 +0530
 # Node ID 0995efabd44470c1192994e1aceeb40ae606467f
 # Parent  e71e34d02de228eab43edf1910a71a44417d
 rc: 

[x265] [PATCH] psyrd: fix for inconsistent output

2014-06-27 Thread sumalatha
# HG changeset patch
# User Sumalatha Polureddy <sumala...@multicorewareinc.com>
# Date 1403858781 -19800
# Node ID a789870889fcc9a31deff7fc6961d143b0db86c1
# Parent  1b669c33ff3a8d8f6c9bd1e18979c009baed2433
psyrd: fix for inconsistent output

The maximum buffer size for zeroPel is MAX_CU_SIZE x MAX_CU_SIZE. Because the
stride passed alongside it was wrong, the code accessed memory outside that
buffer; the contents of that memory differed from run to run, which made the
output inconsistent.

diff -r 1b669c33ff3a -r a789870889fc source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Wed Jun 25 22:46:45 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Jun 27 14:16:21 2014 +0530
@@ -3035,7 +3035,7 @@
 {
 int size = g_convertToBit[trSize];
 psyEnergyY = m_rdCost-psyCost(size, 
fencYuv-getLumaAddr(absPartIdx), fencYuv-getStride(),
-   (pixel*)RDCost::zeroPel, 
cu-getPic()-getPicYuvRec()-getStride()); // need to check whether zero 
distortion is similar to psyenergy of fenc
+(pixel*)RDCost::zeroPel, trSize); // need to check whether 
zero distortion is similar to psyenergy of fenc
 }
 int16_t *curResiY = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
 X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, width 
not full CU\n);
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] fix emms: move selectLambda() into xRateDistOptQuant() and issue emms before it

2014-06-27 Thread Satoshi Nakagawa
# HG changeset patch
# User Satoshi Nakagawa <nakagawa...@oki.com>
# Date 1403877807 -32400
#  Fri Jun 27 23:03:27 2014 +0900
# Node ID 77f443fe169ca23969df5d5ee6968543bfa5e794
# Parent  32aa6cc3cf4d108ac92f5d29258b2c38ca888d29
fix emms: move selectLambda() into xRateDistOptQuant() and issue emms before it

diff -r 32aa6cc3cf4d -r 77f443fe169c source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp Thu Jun 26 17:19:08 2014 -0700
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp Fri Jun 27 23:03:27 2014 +0900
@@ -508,6 +508,9 @@
 uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, 
coeff_t* dstCoeff, uint32_t trSize,
 TextType ttype, uint32_t absPartIdx, 
int32_t *lastPos)
 {
+x265_emms();
+selectLambda(ttype);
+
 const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
 uint32_t absSum = 0;
 int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // 
Represents scaling through forward transform
diff -r 32aa6cc3cf4d -r 77f443fe169c source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Jun 26 17:19:08 2014 -0700
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Jun 27 23:03:27 2014 +0900
@@ -428,8 +428,6 @@
 
 int chFmt = cu-getChromaFormat();
 m_trQuant-setQPforQuant(cu-getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);
-m_trQuant-selectLambda(TEXT_LUMA);
-
 absSum = m_trQuant-transformNxN(cu, residual, stride, coeff, tuSize, 
TEXT_LUMA, absPartIdx, lastPos, useTransformSkip);
 
 //--- set coded block flag ---
@@ -515,8 +513,6 @@
 curChromaQpOffset = cu-getSlice()-getPPS()-getChromaCrQpOffset() + 
cu-getSlice()-getSliceQpDeltaCr();
 }
 m_trQuant-setQPforQuant(cu-getQP(0), TEXT_CHROMA, 
cu-getSlice()-getSPS()-getQpBDOffsetC(), curChromaQpOffset, chFmt);
-m_trQuant-selectLambda(ttype);
-
 absSum = m_trQuant-transformNxN(cu, residual, stride, coeff, tuSize, 
ttype, absPartIdx, lastPos, useTransformSkipC);
 
 //--- set coded block flag ---
@@ -905,7 +901,6 @@
 int lastPos = -1;
 
 m_trQuant-setQPforQuant(cu-getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, 
chFmt);
-m_trQuant-selectLambda(TEXT_LUMA);
 absSum = m_trQuant-transformNxN(cu, residual, stride, coeff, tuSize, 
TEXT_LUMA, absPartIdx, lastPos, useTransformSkip);
 
 //--- set coded block flag ---
@@ -1421,8 +1416,6 @@
 curChromaQpOffset = 
cu-getSlice()-getPPS()-getChromaCrQpOffset() + 
cu-getSlice()-getSliceQpDeltaCr();
 }
 m_trQuant-setQPforQuant(cu-getQP(0), TEXT_CHROMA, 
cu-getSlice()-getSPS()-getQpBDOffsetC(), curChromaQpOffset, chFmt);
-m_trQuant-selectLambda(ttype);
-
 absSum = m_trQuant-transformNxN(cu, residual, stride, coeff, 
tuSize, ttype, absPartIdxC, lastPos, useTransformSkipC);
 
 //--- set coded block flag ---
@@ -2702,13 +2695,11 @@
 
 cu-setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
 
-m_trQuant-setQPforQuant(cu-getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, 
chFmt);
-m_trQuant-selectLambda(TEXT_LUMA);
-
 int16_t *curResiY = resiYuv-getLumaAddr(absPartIdx);
 const uint32_t strideResiY = resiYuv-m_width;
 const uint32_t strideResiC = resiYuv-m_cwidth;
 
+m_trQuant-setQPforQuant(cu-getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, 
chFmt);
 absSumY = m_trQuant-transformNxN(cu, curResiY, strideResiY, coeffCurY,
   trSize, TEXT_LUMA, absPartIdx, 
lastPosY, false, curuseRDOQ);
 
@@ -2746,13 +2737,11 @@
 
 int curChromaQpOffset = 
cu-getSlice()-getPPS()-getChromaCbQpOffset() + 
cu-getSlice()-getSliceQpDeltaCb();
 m_trQuant-setQPforQuant(cu-getQP(0), TEXT_CHROMA, 
cu-getSlice()-getSPS()-getQpBDOffsetC(), curChromaQpOffset, chFmt);
-m_trQuant-selectLambda(TEXT_CHROMA_U);
 absSumU = m_trQuant-transformNxN(cu, curResiU, strideResiC, 
coeffCurU + subTUBufferOffset,
   trSizeC, TEXT_CHROMA_U, 
absPartIdxC, lastPosU, false, curuseRDOQ);
 
 curChromaQpOffset = 
cu-getSlice()-getPPS()-getChromaCrQpOffset() + 
cu-getSlice()-getSliceQpDeltaCr();
 m_trQuant-setQPforQuant(cu-getQP(0), TEXT_CHROMA, 
cu-getSlice()-getSPS()-getQpBDOffsetC(), curChromaQpOffset, chFmt);
-m_trQuant-selectLambda(TEXT_CHROMA_V);
 absSumV = m_trQuant-transformNxN(cu, curResiV, strideResiC, 
coeffCurV + subTUBufferOffset,
   trSizeC, TEXT_CHROMA_V, 
absPartIdxC, lastPosV, false, curuseRDOQ);
 
@@ -2915,8 +2904,6 @@
 }
 
 m_trQuant-setQPforQuant(cu-getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, 
chFmt);
-m_trQuant-selectLambda(TEXT_LUMA);
-
 absSum[TEXT_LUMA][0] = m_trQuant-transformNxN(cu, 

Re: [x265] [PATCH 1 of 2] improve count_nonzero by SSSE3

2014-06-27 Thread Derek Buitenhuis
On 6/27/2014 4:05 PM, chen wrote:
 I can't understand what's your means. could you tell me more?
 
 I use some SSSE3 instruction and process 16 pixels every loop.

I meant keep both sse2 and ssse3 variants. Not sure if x86inc.asm macros
help with this or not.

- Derek
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] [PATCH 1 of 2] improve count_nonzero by SSSE3

2014-06-27 Thread Derek Buitenhuis
On 6/27/2014 6:08 PM, chen wrote:
 I use ssse3 instruction PSHUFB to replace 3 SSE2 instructions, the x86inc 
 macro can't handle it.
 
 After patch, this function is faster ~20% and codeCoeffNxN ~7% speedup, so I 
 don't worry about old CPU's performance.

I guess SSSE3 is very prevalent nowadays -- though I am still not a fan
of throwing away variants, I guess it's reasonable in this case.

- Derek
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel