Re: [x265] [PATCH] Search: remove redundant encode coefficients in intra for performance

2014-09-15 Thread Deepthi Nandakumar
Sorry, the output mismatch was due to asm. Pushed.

On Sun, Sep 14, 2014 at 4:35 PM, Deepthi Nandakumar 
deep...@multicorewareinc.com wrote:

 This significantly changes outputs for P and B frames. Higher bitrates and
 higher SSIM. Let's do full regression testing on this - and compare the
 bitrate/SSIM for all combinations to be reasonably sure there are no bugs.

 On Fri, Sep 12, 2014 at 7:47 PM, as...@multicorewareinc.com wrote:

 # HG changeset patch
 # User Ashok Kumar Mishra <as...@multicorewareinc.com>
 # Date 1410341620 -19800
 #  Wed Sep 10 15:03:40 2014 +0530
 # Node ID d8be3c38915d4a628b804522da8946a152041203
 # Parent  cd8fd0afd4e873fc940ae3384fac4deed3ec7b4f
 Search: remove redundant encode coefficients in intra for performance

 diff -r cd8fd0afd4e8 -r d8be3c38915d source/encoder/analysis.cpp
 --- a/source/encoder/analysis.cpp   Thu Sep 11 17:25:40 2014 -0700
 +++ b/source/encoder/analysis.cpp   Wed Sep 10 15:03:40 2014 +0530
 @@ -1840,6 +1840,7 @@
  void Analysis::encodeIntraInInter(TComDataCU* cu, TComYuv* fencYuv,
 TComYuv* predYuv,  ShortYuv* outResiYuv, TComYuv* outReconYuv)
  {
  uint64_t puCost = 0;
 +uint32_t puBits = 0;
  uint32_t depth = cu->getDepth(0);
  uint32_t initTrDepth = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;

 @@ -1851,7 +1852,7 @@
  uint32_t tuDepthRange[2];
  cu->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);

 -uint32_t puDistY = xRecurIntraCodingQT(cu, initTrDepth, 0, fencYuv,
 predYuv, outResiYuv, false, puCost, tuDepthRange);
 +uint32_t puDistY = xRecurIntraCodingQT(cu, initTrDepth, 0, fencYuv,
 predYuv, outResiYuv, false, puCost, puBits, tuDepthRange);
  xSetIntraResultQT(cu, initTrDepth, 0, outReconYuv);

  //=== update PU data 
 diff -r cd8fd0afd4e8 -r d8be3c38915d source/encoder/search.cpp
 --- a/source/encoder/search.cpp Thu Sep 11 17:25:40 2014 -0700
 +++ b/source/encoder/search.cpp Wed Sep 10 15:03:40 2014 +0530
 @@ -111,47 +111,6 @@
  return false;
  }

 -void Search::xEncSubdivCbfQTLuma(TComDataCU* cu, uint32_t trDepth,
 uint32_t absPartIdx, uint32_t depthRange[2])
 -{
 -uint32_t fullDepth  = cu->getDepth(0) + trDepth;
 -uint32_t trMode = cu->getTransformIdx(absPartIdx);
 -uint32_t subdiv = (trMode > trDepth ? 1 : 0);
 -uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 -
 -if (cu->getPredictionMode(0) == MODE_INTRA &&
 cu->getPartitionSize(0) == SIZE_NxN && trDepth == 0)
 -{
 -X265_CHECK(subdiv, "subdivision not present\n");
 -}
 -else if (log2TrSize > *(depthRange + 1))
 -{
 -X265_CHECK(subdiv, "subdivision not present\n");
 -}
 -else if (log2TrSize == cu->m_slice->m_sps->quadtreeTULog2MinSize)
 -{
 -X265_CHECK(!subdiv, "subdivision present\n");
 -}
 -else if (log2TrSize == *depthRange)
 -{
 -X265_CHECK(!subdiv, "subdivision present\n");
 -}
 -else
 -{
 -X265_CHECK(log2TrSize > *depthRange, "transform size too
 small\n");
 -m_entropyCoder->codeTransformSubdivFlag(subdiv, 5 - log2TrSize);
 -}
 -
 -if (subdiv)
 -{
 -uint32_t qtPartNum = cu->m_pic->getNumPartInCU() >> ((fullDepth
 + 1) << 1);
 -for (uint32_t part = 0; part < 4; part++)
 -xEncSubdivCbfQTLuma(cu, trDepth + 1, absPartIdx + part *
 qtPartNum, depthRange);
 -
 -return;
 -}
 -
 -m_entropyCoder->codeQtCbf(cu, absPartIdx, TEXT_LUMA, trMode);
 -}
 -
  void Search::xEncSubdivCbfQTChroma(TComDataCU* cu, uint32_t trDepth,
 uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t
 height)
  {
  uint32_t fullDepth  = cu->getDepth(0) + trDepth;
 @@ -183,32 +142,6 @@
  }
  }

 -void Search::xEncCoeffQTLuma(TComDataCU* cu, uint32_t trDepth, uint32_t
 absPartIdx)
 -{
 -const TextType ttype = TEXT_LUMA;
 -
 -if (!cu->getCbf(absPartIdx, ttype, trDepth))
 -return;
 -
 -uint32_t fullDepth = cu->getDepth(0) + trDepth;
 -uint32_t trMode = cu->getTransformIdx(absPartIdx);
 -
 -if (trMode > trDepth)
 -{
 -uint32_t qtPartNum = cu->m_pic->getNumPartInCU() >> ((fullDepth
 + 1) << 1);
 -for (uint32_t part = 0; part < 4; part++)
 -xEncCoeffQTLuma(cu, trDepth + 1, absPartIdx + part *
 qtPartNum);
 -
 -return;
 -}
 -
 -uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 -uint32_t qtLayer = log2TrSize - 2;
 -uint32_t coeffOffset = absPartIdx << LOG2_UNIT_SIZE * 2;
 -coeff_t* coeff = m_qtTempCoeff[ttype][qtLayer] + coeffOffset;
 -m_entropyCoder->codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize,
 ttype);
 -}
 -
  void Search::xEncCoeffQTChroma(TComDataCU* cu, uint32_t trDepth,
 uint32_t absPartIdx, TextType ttype)
  {
  if (!cu->getCbf(absPartIdx, ttype, trDepth))
 @@ -316,15 +249,6 @@
  }
  }

 -uint32_t Search::xGetIntraBitsQTLuma(TComDataCU* cu, uint32_t trDepth,
 uint32_t absPartIdx, uint32_t depthRange[2])
 -{
 -m_entropyCoder->resetBits();
 -xEncIntraHeaderLuma(cu, trDepth, absPartIdx);
 -

Re: [x265] [PATCH] Search: remove redundant encode coefficients in intra for performance

2014-09-14 Thread Deepthi Nandakumar
This significantly changes outputs for P and B frames. Higher bitrates and
higher SSIM. Let's do full regression testing on this - and compare the
bitrate/SSIM for all combinations to be reasonably sure there are no bugs.

On Fri, Sep 12, 2014 at 7:47 PM, as...@multicorewareinc.com wrote:

 # HG changeset patch
 # User Ashok Kumar Mishra <as...@multicorewareinc.com>
 # Date 1410341620 -19800
 #  Wed Sep 10 15:03:40 2014 +0530
 # Node ID d8be3c38915d4a628b804522da8946a152041203
 # Parent  cd8fd0afd4e873fc940ae3384fac4deed3ec7b4f
 Search: remove redundant encode coefficients in intra for performance

 diff -r cd8fd0afd4e8 -r d8be3c38915d source/encoder/analysis.cpp
 --- a/source/encoder/analysis.cpp   Thu Sep 11 17:25:40 2014 -0700
 +++ b/source/encoder/analysis.cpp   Wed Sep 10 15:03:40 2014 +0530
 @@ -1840,6 +1840,7 @@
  void Analysis::encodeIntraInInter(TComDataCU* cu, TComYuv* fencYuv,
 TComYuv* predYuv,  ShortYuv* outResiYuv, TComYuv* outReconYuv)
  {
  uint64_t puCost = 0;
 +uint32_t puBits = 0;
  uint32_t depth = cu->getDepth(0);
  uint32_t initTrDepth = cu->getPartitionSize(0) == SIZE_2Nx2N ? 0 : 1;

 @@ -1851,7 +1852,7 @@
  uint32_t tuDepthRange[2];
  cu->getQuadtreeTULog2MinSizeInCU(tuDepthRange, 0);

 -uint32_t puDistY = xRecurIntraCodingQT(cu, initTrDepth, 0, fencYuv,
 predYuv, outResiYuv, false, puCost, tuDepthRange);
 +uint32_t puDistY = xRecurIntraCodingQT(cu, initTrDepth, 0, fencYuv,
 predYuv, outResiYuv, false, puCost, puBits, tuDepthRange);
  xSetIntraResultQT(cu, initTrDepth, 0, outReconYuv);

  //=== update PU data 
 diff -r cd8fd0afd4e8 -r d8be3c38915d source/encoder/search.cpp
 --- a/source/encoder/search.cpp Thu Sep 11 17:25:40 2014 -0700
 +++ b/source/encoder/search.cpp Wed Sep 10 15:03:40 2014 +0530
 @@ -111,47 +111,6 @@
  return false;
  }

 -void Search::xEncSubdivCbfQTLuma(TComDataCU* cu, uint32_t trDepth,
 uint32_t absPartIdx, uint32_t depthRange[2])
 -{
 -uint32_t fullDepth  = cu->getDepth(0) + trDepth;
 -uint32_t trMode = cu->getTransformIdx(absPartIdx);
 -uint32_t subdiv = (trMode > trDepth ? 1 : 0);
 -uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 -
 -if (cu->getPredictionMode(0) == MODE_INTRA && cu->getPartitionSize(0)
 == SIZE_NxN && trDepth == 0)
 -{
 -X265_CHECK(subdiv, "subdivision not present\n");
 -}
 -else if (log2TrSize > *(depthRange + 1))
 -{
 -X265_CHECK(subdiv, "subdivision not present\n");
 -}
 -else if (log2TrSize == cu->m_slice->m_sps->quadtreeTULog2MinSize)
 -{
 -X265_CHECK(!subdiv, "subdivision present\n");
 -}
 -else if (log2TrSize == *depthRange)
 -{
 -X265_CHECK(!subdiv, "subdivision present\n");
 -}
 -else
 -{
 -X265_CHECK(log2TrSize > *depthRange, "transform size too
 small\n");
 -m_entropyCoder->codeTransformSubdivFlag(subdiv, 5 - log2TrSize);
 -}
 -
 -if (subdiv)
 -{
 -uint32_t qtPartNum = cu->m_pic->getNumPartInCU() >> ((fullDepth +
 1) << 1);
 -for (uint32_t part = 0; part < 4; part++)
 -xEncSubdivCbfQTLuma(cu, trDepth + 1, absPartIdx + part *
 qtPartNum, depthRange);
 -
 -return;
 -}
 -
 -m_entropyCoder->codeQtCbf(cu, absPartIdx, TEXT_LUMA, trMode);
 -}
 -
  void Search::xEncSubdivCbfQTChroma(TComDataCU* cu, uint32_t trDepth,
 uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t
 height)
  {
  uint32_t fullDepth  = cu->getDepth(0) + trDepth;
 @@ -183,32 +142,6 @@
  }
  }

 -void Search::xEncCoeffQTLuma(TComDataCU* cu, uint32_t trDepth, uint32_t
 absPartIdx)
 -{
 -const TextType ttype = TEXT_LUMA;
 -
 -if (!cu->getCbf(absPartIdx, ttype, trDepth))
 -return;
 -
 -uint32_t fullDepth = cu->getDepth(0) + trDepth;
 -uint32_t trMode = cu->getTransformIdx(absPartIdx);
 -
 -if (trMode > trDepth)
 -{
 -uint32_t qtPartNum = cu->m_pic->getNumPartInCU() >> ((fullDepth +
 1) << 1);
 -for (uint32_t part = 0; part < 4; part++)
 -xEncCoeffQTLuma(cu, trDepth + 1, absPartIdx + part *
 qtPartNum);
 -
 -return;
 -}
 -
 -uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 -uint32_t qtLayer = log2TrSize - 2;
 -uint32_t coeffOffset = absPartIdx << LOG2_UNIT_SIZE * 2;
 -coeff_t* coeff = m_qtTempCoeff[ttype][qtLayer] + coeffOffset;
 -m_entropyCoder->codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize,
 ttype);
 -}
 -
  void Search::xEncCoeffQTChroma(TComDataCU* cu, uint32_t trDepth, uint32_t
 absPartIdx, TextType ttype)
  {
  if (!cu->getCbf(absPartIdx, ttype, trDepth))
 @@ -316,15 +249,6 @@
  }
  }

 -uint32_t Search::xGetIntraBitsQTLuma(TComDataCU* cu, uint32_t trDepth,
 uint32_t absPartIdx, uint32_t depthRange[2])
 -{
 -m_entropyCoder->resetBits();
 -xEncIntraHeaderLuma(cu, trDepth, absPartIdx);
 -xEncSubdivCbfQTLuma(cu, trDepth, absPartIdx, depthRange);
 -xEncCoeffQTLuma(cu, trDepth, absPartIdx);
 -return