[x265] [PATCH] Added fast intra search option to Analysis::checkIntraInInter_rd0_4

2014-08-14 Thread dtyx265
# HG changeset patch
# User David T Yuen dtyx...@gmail.com
# Date 1408026426 25200
# Node ID 81766e60e622f28c12766f277b087cfeccff9cc3
# Parent  6b741cce14acb610a2a17a08f51898ea18b16a35
Added fast intra search option to Analysis::checkIntraInInter_rd0_4

diff -r 6b741cce14ac -r 81766e60e622 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp   Thu Aug 14 12:53:52 2014 +0530
+++ b/source/encoder/analysis.cpp   Thu Aug 14 07:27:06 2014 -0700
@@ -26,6 +26,7 @@
 #include common.h
 #include rdcost.h
 #include encoder.h
+#include predict.h
 #include PPA/ppa.h
 
 using namespace x265;
@@ -1655,6 +1656,7 @@
 }
 
 pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
+int predsize = scaleTuSize * scaleTuSize;
 
 uint32_t preds[3];
 cu-getIntraDirLumaPredictor(partOffset, preds);
@@ -1685,23 +1687,79 @@
 bits = !(mpms  ((uint64_t)1  mode)) ? rbits : xModeBitsIntra(cu, mode, 
partOffset, depth);
 cost = m_rdCost.calcRdSADCost(sad, bits);
 COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
-
+
 // Transpose NxN
 primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
-
 primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, 
leftFiltered, (scaleTuSize = 16));
 
-for (mode = 2; mode  35; mode++)
+bool modeHor;
+pixel *cmp;
+intptr_t srcStride;
+if (m_param-bEnableFastIntra)
 {
-bool modeHor = (mode  18);
-pixel *cmp = (modeHor ? buf_trans : fenc);
-intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
-sad  = sa8d(cmp, srcStride, tmp[(mode - 2) * (scaleTuSize * 
scaleTuSize)], scaleTuSize)  costShift;
+int lowsad, highsad, asad = 0;
+uint32_t lowbits, highbits, amode, lowmode, highmode, abits = 0;
+uint64_t lowcost, highcost = MAX_INT64, acost = MAX_INT64;
+
+for (mode = 4;mode  35; mode += 5)
+{
+modeHor = (mode  18);
+cmp = (modeHor ? buf_trans : fenc);
+srcStride = (modeHor ? scaleTuSize : scaleStride);
+sad = sa8d(cmp, srcStride, tmp[(mode - 2) * predsize], 
scaleTuSize)  costShift;
+bits = !(mpms  ((uint64_t)1  mode)) ? rbits : 
xModeBitsIntra(cu, mode, partOffset, depth);
+cost = m_rdCost.calcRdSADCost(sad, bits);
+COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
+}
+lowmode = amode - 2;
+modeHor = (lowmode  18);
+cmp = (modeHor ? buf_trans : fenc);
+srcStride = (modeHor ? scaleTuSize : scaleStride);
+lowsad = sa8d(cmp, srcStride, tmp[(lowmode - 2) * predsize], 
scaleTuSize)  costShift;
+lowbits = !(mpms  ((uint64_t)1  lowmode)) ? rbits : 
xModeBitsIntra(cu, lowmode, partOffset, depth);
+lowcost = m_rdCost.calcRdSADCost(lowsad, lowbits);
+if (bmode  34)
+{
+highmode = amode + 2;
+modeHor = (highmode  18);
+cmp = (modeHor ? buf_trans : fenc);
+srcStride = (modeHor ? scaleTuSize : scaleStride);
+highsad = sa8d(cmp, srcStride, tmp[(highmode - 2) * predsize], 
scaleTuSize)  costShift;
+highbits = !(mpms  ((uint64_t)1  highmode)) ? rbits : 
xModeBitsIntra(cu, highmode, partOffset, depth);
+highcost = m_rdCost.calcRdSADCost(highsad, highbits);
+}
+if (lowcost = highcost)
+{
+mode = amode - 1;
+COPY4_IF_LT(acost, lowcost, amode, lowmode, asad, lowsad, abits, 
lowbits);
+}
+else
+{
+mode = amode + 1;
+COPY4_IF_LT(acost, highcost, amode, highmode, asad, highsad, 
abits, highbits);
+}
+modeHor = (mode  18);
+cmp = (modeHor ? buf_trans : fenc);
+srcStride = (modeHor ? scaleTuSize : scaleStride);
+sad = sa8d(cmp, srcStride, tmp[(mode - 2) * predsize], scaleTuSize) 
 costShift;
 bits = !(mpms  ((uint64_t)1  mode)) ? rbits : xModeBitsIntra(cu, 
mode, partOffset, depth);
 cost = m_rdCost.calcRdSADCost(sad, bits);
-COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
+COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
 }
-
+else // calculate and search all intra prediction angles for lowest cost
+{
+for (mode = 2; mode  35; mode++)
+{
+modeHor = (mode  18);
+cmp = (modeHor ? buf_trans : fenc);
+srcStride = (modeHor ? scaleTuSize : scaleStride);
+sad = sa8d(cmp, srcStride, tmp[(mode - 2) * predsize], 
scaleTuSize)  costShift;
+bits = !(mpms  ((uint64_t)1  mode)) ? rbits : 
xModeBitsIntra(cu, mode, partOffset, depth);
+cost = m_rdCost.calcRdSADCost(sad, bits);
+COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+}
+}
 cu-m_totalBits = bbits;
 cu-m_totalDistortion = bsad;
 

Re: [x265] [PATCH] Added fast intra search option

2014-08-13 Thread Deepthi Nandakumar
There are a couple of warnings our regression tests caught with this. Can
you take a look?

source\encoder\predict.cpp(78): warning C4800: 'const unsigned char' :
forcing value to bool 'true' or 'false' (performance warning)
(IntraFilterType can be bool, I think?).


C:\users\deepthi\code\x265\source\encoder\slicetype.cpp(1714): warning
C4701: potentially uninitialized local variable 'lowmode' used

Thanks,
Deepthi



On Wed, Aug 13, 2014 at 4:07 AM, dtyx...@gmail.com wrote:

 # HG changeset patch
 # User David T Yuen dtyx...@gmail.com
 # Date 1407882999 25200
 # Node ID 75e4ad481b3668b1e420ede300287aa3ea3fb8d5
 # Parent  8a7f4bb1d1be32fe668d410450c2e320ccae6098
 Added fast intra search option

 This version calls intra_pred_allangs  to create the predictions then the
 faster search with satd

 diff -r 8a7f4bb1d1be -r 75e4ad481b36 source/common/param.cpp
 --- a/source/common/param.cpp   Tue Aug 12 01:11:39 2014 -0500
 +++ b/source/common/param.cpp   Tue Aug 12 15:36:39 2014 -0700
 @@ -132,6 +132,7 @@
  /* Intra Coding Tools */
  param-bEnableConstrainedIntra = 0;
  param-bEnableStrongIntraSmoothing = 1;
 +param-bEnableFastIntra = 0;

  /* Inter Coding tools */
  param-searchMethod = X265_HEX_SEARCH;
 @@ -560,6 +561,7 @@
  OPT(lossless) p-bLossless = atobool(value);
  OPT(cu-lossless) p-bCULossless = atobool(value);
  OPT(constrained-intra) p-bEnableConstrainedIntra = atobool(value);
 +OPT(fast-intra) p-bEnableFastIntra = atobool(value);
  OPT(open-gop) p-bOpenGOP = atobool(value);
  OPT(scenecut)
  {
 @@ -1211,6 +1213,7 @@
  BOOL(p-bLossless, lossless);
  BOOL(p-bCULossless, cu-lossless);
  BOOL(p-bEnableConstrainedIntra, constrained-intra);
 +BOOL(p-bEnableFastIntra, fast-intra);
  BOOL(p-bOpenGOP, open-gop);
  s += sprintf(s,  interlace=%d, p-interlaceMode);
  s += sprintf(s,  keyint=%d, p-keyframeMax);
 diff -r 8a7f4bb1d1be -r 75e4ad481b36 source/encoder/slicetype.cpp
 --- a/source/encoder/slicetype.cpp  Tue Aug 12 01:11:39 2014 -0500
 +++ b/source/encoder/slicetype.cpp  Tue Aug 12 15:36:39 2014 -0700
 @@ -1242,6 +1242,7 @@
  {
  m_rows[i].m_widthInCU = m_widthInCU;
  m_rows[i].m_heightInCU = m_heightInCU;
 +m_rows[i].m_param = m_param;
  }

  if (!WaveFront::init(m_heightInCU))
 @@ -1676,26 +1677,86 @@

  int predsize = cuSize * cuSize;

 -// generate 35 intra predictions into tmp
 +// generate 35 intra predictions into m_predictions
 +pixelcmp_t satd =
 primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
 +int icost = m_me.COST_MAX, cost, highcost, lowcost, acost =
 m_me.COST_MAX;
 +uint32_t  lowmode, mode;
  primitives.intra_pred[sizeIdx][DC_IDX](m_predictions, cuSize,
 left0, above0, 0, (cuSize = 16));
 +cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
 +if (cost  icost)
 +icost = cost;
  pixel *above = (cuSize = 8) ? above1 : above0;
  pixel *left  = (cuSize = 8) ? left1 : left0;
 -primitives.intra_pred[sizeIdx][PLANAR_IDX](m_predictions +
 predsize, cuSize, left, above, 0, 0);
 +primitives.intra_pred[sizeIdx][PLANAR_IDX](m_predictions, cuSize,
 left, above, 0, 0);
 +cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
 +if (cost  icost)
 +icost = cost;
  primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 *
 predsize, above0, left0, above1, left1, (cuSize = 16));

 -// calculate 35 satd costs, keep least cost
 +// calculate satd costs, keep least cost
  ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
  primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE);
 -pixelcmp_t satd =
 primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
 -int icost = m_me.COST_MAX, cost;
 -for (uint32_t mode = 0; mode  35; mode++)
 +// fast-intra angle search
 +if (m_param-bEnableFastIntra)
  {
 -if ((mode = 2)  (mode  18))
 +for (mode = 4;mode  35; mode += 5)
 +{
 +if (mode  18)
 +cost = satd(buf_trans, cuSize, m_predictions[mode *
 predsize], cuSize);
 +else
 +cost = satd(m_me.fenc, FENC_STRIDE,
 m_predictions[mode * predsize], cuSize);
 +if (cost  acost)
 +{
 +lowmode = mode;
 +acost = cost;
 +}
 +}
 +mode = lowmode - 2;
 +if (mode  18)
 +lowcost = satd(buf_trans, cuSize, m_predictions[mode *
 predsize], cuSize);
 +else
 +lowcost = satd(m_me.fenc, FENC_STRIDE,
 m_predictions[mode * predsize], cuSize);
 +highcost = m_me.COST_MAX;
 +if (lowmode  34)
 +{
 +mode = lowmode + 2;
 +if (mode  18)

Re: [x265] [PATCH] Added fast intra search option

2014-08-13 Thread dave

Building with gcc (Debian 4.7.2-5), I get no warnings.
On 08/13/2014 05:46 AM, Deepthi Nandakumar wrote:
There are a couple of warnings our regression tests caught with this. 
Can you take a look?


source\encoder\predict.cpp(78): warning C4800: 'const unsigned char' : 
forcing value to bool 'true' or 'false' (performance warning)

(IntraFilterType can be bool, I think?).

Initially I used bool for the table but unsigned char performed better 
on my old system.


C:\users\deepthi\code\x265\source\encoder\slicetype.cpp(1714): warning 
C4701: potentially uninitialized local variable 'lowmode' used



I'll submit a patch to set lowmode to a default.

Thanks,
Deepthi



On Wed, Aug 13, 2014 at 4:07 AM, dtyx...@gmail.com wrote:


# HG changeset patch
# User David T Yuen dtyx...@gmail.com mailto:dtyx...@gmail.com
# Date 1407882999 25200
# Node ID 75e4ad481b3668b1e420ede300287aa3ea3fb8d5
# Parent  8a7f4bb1d1be32fe668d410450c2e320ccae6098
Added fast intra search option

This version calls intra_pred_allangs  to create the predictions
then the faster search with satd

diff -r 8a7f4bb1d1be -r 75e4ad481b36 source/common/param.cpp
--- a/source/common/param.cpp   Tue Aug 12 01:11:39 2014 -0500
+++ b/source/common/param.cpp   Tue Aug 12 15:36:39 2014 -0700
@@ -132,6 +132,7 @@
 /* Intra Coding Tools */
 param-bEnableConstrainedIntra = 0;
 param-bEnableStrongIntraSmoothing = 1;
+param-bEnableFastIntra = 0;

 /* Inter Coding tools */
 param-searchMethod = X265_HEX_SEARCH;
@@ -560,6 +561,7 @@
 OPT(lossless) p-bLossless = atobool(value);
 OPT(cu-lossless) p-bCULossless = atobool(value);
 OPT(constrained-intra) p-bEnableConstrainedIntra =
atobool(value);
+OPT(fast-intra) p-bEnableFastIntra = atobool(value);
 OPT(open-gop) p-bOpenGOP = atobool(value);
 OPT(scenecut)
 {
@@ -1211,6 +1213,7 @@
 BOOL(p-bLossless, lossless);
 BOOL(p-bCULossless, cu-lossless);
 BOOL(p-bEnableConstrainedIntra, constrained-intra);
+BOOL(p-bEnableFastIntra, fast-intra);
 BOOL(p-bOpenGOP, open-gop);
 s += sprintf(s,  interlace=%d, p-interlaceMode);
 s += sprintf(s,  keyint=%d, p-keyframeMax);
diff -r 8a7f4bb1d1be -r 75e4ad481b36 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp  Tue Aug 12 01:11:39 2014 -0500
+++ b/source/encoder/slicetype.cpp  Tue Aug 12 15:36:39 2014 -0700
@@ -1242,6 +1242,7 @@
 {
 m_rows[i].m_widthInCU = m_widthInCU;
 m_rows[i].m_heightInCU = m_heightInCU;
+m_rows[i].m_param = m_param;
 }

 if (!WaveFront::init(m_heightInCU))
@@ -1676,26 +1677,86 @@

 int predsize = cuSize * cuSize;

-// generate 35 intra predictions into tmp
+// generate 35 intra predictions into m_predictions
+pixelcmp_t satd =
primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
+int icost = m_me.COST_MAX, cost, highcost, lowcost, acost
= m_me.COST_MAX;
+uint32_t  lowmode, mode;
 primitives.intra_pred[sizeIdx][DC_IDX](m_predictions, cuSize,
left0, above0, 0, (cuSize = 16));
+cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+if (cost  icost)
+icost = cost;
 pixel *above = (cuSize = 8) ? above1 : above0;
 pixel *left  = (cuSize = 8) ? left1 : left0;
-  primitives.intra_pred[sizeIdx][PLANAR_IDX](m_predictions +
predsize, cuSize, left, above, 0, 0);
+  primitives.intra_pred[sizeIdx][PLANAR_IDX](m_predictions,
cuSize, left, above, 0, 0);
+cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+if (cost  icost)
+icost = cost;
 primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 *
predsize, above0, left0, above1, left1, (cuSize = 16));

-// calculate 35 satd costs, keep least cost
+// calculate satd costs, keep least cost
 ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
 primitives.transpose[sizeIdx](buf_trans, m_me.fenc,
FENC_STRIDE);
-pixelcmp_t satd =
primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
-int icost = m_me.COST_MAX, cost;
-for (uint32_t mode = 0; mode  35; mode++)
+// fast-intra angle search
+if (m_param-bEnableFastIntra)
 {
-if ((mode = 2)  (mode  18))
+for (mode = 4;mode  35; mode += 5)
+{
+if (mode  18)
+cost = satd(buf_trans, cuSize,
m_predictions[mode * predsize], cuSize);
+else
+cost = satd(m_me.fenc, FENC_STRIDE,
m_predictions[mode * predsize], cuSize);
+if (cost  acost)

Re: [x265] [PATCH] Added fast intra search option

2014-08-13 Thread dave

On 08/12/2014 10:22 PM, Steve Borho wrote:

On 08/12, dtyx...@gmail.com wrote:

# HG changeset patch
# User David T Yuen dtyx...@gmail.com
# Date 1407882999 25200
# Node ID 75e4ad481b3668b1e420ede300287aa3ea3fb8d5
# Parent  8a7f4bb1d1be32fe668d410450c2e320ccae6098
Added fast intra search option

This version calls intra_pred_allangs  to create the predictions then the 
faster search with satd

on my newer CPUs, this version was unambiguously faster; so I've pushed
this version, thanks.

How were you testing it?  I was encoding a 2 minute video with -I 1.

I also have a patch that changes m_predictions from an EstimateRow member 
pointer (to enough dynamically allocated memory to hold all 35 
predictions) into a local array in EstimateRow::estimateCUCost that is big 
enough to hold one prediction. That array can't be used with allangs, so it 
would only be useful with the other fast-intra version.  Again, it didn't 
seem to help much on my system, but if you would like to try it I'll submit 
a patch.



diff -r 8a7f4bb1d1be -r 75e4ad481b36 source/common/param.cpp
--- a/source/common/param.cpp   Tue Aug 12 01:11:39 2014 -0500
+++ b/source/common/param.cpp   Tue Aug 12 15:36:39 2014 -0700
@@ -132,6 +132,7 @@
  /* Intra Coding Tools */
  param-bEnableConstrainedIntra = 0;
  param-bEnableStrongIntraSmoothing = 1;
+param-bEnableFastIntra = 0;
  
  /* Inter Coding tools */

  param-searchMethod = X265_HEX_SEARCH;
@@ -560,6 +561,7 @@
  OPT(lossless) p-bLossless = atobool(value);
  OPT(cu-lossless) p-bCULossless = atobool(value);
  OPT(constrained-intra) p-bEnableConstrainedIntra = atobool(value);
+OPT(fast-intra) p-bEnableFastIntra = atobool(value);
  OPT(open-gop) p-bOpenGOP = atobool(value);
  OPT(scenecut)
  {
@@ -1211,6 +1213,7 @@
  BOOL(p-bLossless, lossless);
  BOOL(p-bCULossless, cu-lossless);
  BOOL(p-bEnableConstrainedIntra, constrained-intra);
+BOOL(p-bEnableFastIntra, fast-intra);
  BOOL(p-bOpenGOP, open-gop);
  s += sprintf(s,  interlace=%d, p-interlaceMode);
  s += sprintf(s,  keyint=%d, p-keyframeMax);
diff -r 8a7f4bb1d1be -r 75e4ad481b36 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp  Tue Aug 12 01:11:39 2014 -0500
+++ b/source/encoder/slicetype.cpp  Tue Aug 12 15:36:39 2014 -0700
@@ -1242,6 +1242,7 @@
  {
  m_rows[i].m_widthInCU = m_widthInCU;
  m_rows[i].m_heightInCU = m_heightInCU;
+m_rows[i].m_param = m_param;
  }
  
  if (!WaveFront::init(m_heightInCU))

@@ -1676,26 +1677,86 @@
  
  int predsize = cuSize * cuSize;
  
-// generate 35 intra predictions into tmp

+// generate 35 intra predictions into m_predictions
+pixelcmp_t satd = 
primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
+int icost = m_me.COST_MAX, cost, highcost, lowcost, acost = 
m_me.COST_MAX;
+uint32_t  lowmode, mode;
  primitives.intra_pred[sizeIdx][DC_IDX](m_predictions, cuSize, left0, 
above0, 0, (cuSize = 16));
+cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+if (cost  icost)
+icost = cost;
  pixel *above = (cuSize = 8) ? above1 : above0;
  pixel *left  = (cuSize = 8) ? left1 : left0;
-primitives.intra_pred[sizeIdx][PLANAR_IDX](m_predictions + predsize, 
cuSize, left, above, 0, 0);
+primitives.intra_pred[sizeIdx][PLANAR_IDX](m_predictions, cuSize, 
left, above, 0, 0);
+cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+if (cost  icost)
+icost = cost;
  primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, 
above0, left0, above1, left1, (cuSize = 16));
  
-// calculate 35 satd costs, keep least cost

+// calculate satd costs, keep least cost
  ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
  primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE);
-pixelcmp_t satd = 
primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
-int icost = m_me.COST_MAX, cost;
-for (uint32_t mode = 0; mode  35; mode++)
+// fast-intra angle search
+if (m_param-bEnableFastIntra)
  {
-if ((mode = 2)  (mode  18))
+for (mode = 4;mode  35; mode += 5)
+{
+if (mode  18)
+cost = satd(buf_trans, cuSize, m_predictions[mode * 
predsize], cuSize);
+else
+cost = satd(m_me.fenc, FENC_STRIDE, m_predictions[mode * 
predsize], cuSize);
+if (cost  acost)
+{
+lowmode = mode;
+acost = cost;
+}
+}
+mode = lowmode - 2;
+if (mode  18)
+lowcost = satd(buf_trans, cuSize, m_predictions[mode * 
predsize], cuSize);
+else
+lowcost = satd(m_me.fenc, FENC_STRIDE, m_predictions[mode * 
predsize], cuSize);
+

[x265] [PATCH] Added Fast intra search option

2014-08-12 Thread dtyx265
# HG changeset patch
# User David T Yuen dtyx...@gmail.com
# Date 1407881349 25200
# Node ID 1e079a117f0f381c97753d74404a6a943ab3ff1d
# Parent  8a7f4bb1d1be32fe668d410450c2e320ccae6098
Added Fast intra search option

This version calls intra_pred and satd for each mode searched
and also uses the IntraFilterType table that was moved from intrapred.cpp to 
predict.h

diff -r 8a7f4bb1d1be -r 1e079a117f0f source/common/param.cpp
--- a/source/common/param.cpp   Tue Aug 12 01:11:39 2014 -0500
+++ b/source/common/param.cpp   Tue Aug 12 15:09:09 2014 -0700
@@ -132,6 +132,7 @@
 /* Intra Coding Tools */
 param-bEnableConstrainedIntra = 0;
 param-bEnableStrongIntraSmoothing = 1;
+param-bEnableFastIntra = 0;
 
 /* Inter Coding tools */
 param-searchMethod = X265_HEX_SEARCH;
@@ -560,6 +561,7 @@
 OPT(lossless) p-bLossless = atobool(value);
 OPT(cu-lossless) p-bCULossless = atobool(value);
 OPT(constrained-intra) p-bEnableConstrainedIntra = atobool(value);
+OPT(fast-intra) p-bEnableFastIntra = atobool(value);
 OPT(open-gop) p-bOpenGOP = atobool(value);
 OPT(scenecut)
 {
@@ -1211,6 +1213,7 @@
 BOOL(p-bLossless, lossless);
 BOOL(p-bCULossless, cu-lossless);
 BOOL(p-bEnableConstrainedIntra, constrained-intra);
+BOOL(p-bEnableFastIntra, fast-intra);
 BOOL(p-bOpenGOP, open-gop);
 s += sprintf(s,  interlace=%d, p-interlaceMode);
 s += sprintf(s,  keyint=%d, p-keyframeMax);
diff -r 8a7f4bb1d1be -r 1e079a117f0f source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp  Tue Aug 12 01:11:39 2014 -0500
+++ b/source/encoder/slicetype.cpp  Tue Aug 12 15:09:09 2014 -0700
@@ -33,6 +33,7 @@
 #include slicetype.h
 #include motion.h
 #include ratecontrol.h
+#include predict.h
 
 #define NUM_CUS (m_widthInCU  2  m_heightInCU  2 ? (m_widthInCU - 2) * 
(m_heightInCU - 2) : m_widthInCU * m_heightInCU)
 
@@ -1242,6 +1243,7 @@
 {
 m_rows[i].m_widthInCU = m_widthInCU;
 m_rows[i].m_heightInCU = m_heightInCU;
+m_rows[i].m_param = m_param;
 }
 
 if (!WaveFront::init(m_heightInCU))
@@ -1675,27 +1677,89 @@
 }
 
 int predsize = cuSize * cuSize;
+int icost = m_me.COST_MAX, cost;
+pixelcmp_t satd = 
primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
 
-// generate 35 intra predictions into tmp
+// generate intra predictions into m_predictions
 primitives.intra_pred[sizeIdx][DC_IDX](m_predictions, cuSize, left0, 
above0, 0, (cuSize = 16));
+cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+if (cost  icost)
+icost = cost;
 pixel *above = (cuSize = 8) ? above1 : above0;
 pixel *left  = (cuSize = 8) ? left1 : left0;
-primitives.intra_pred[sizeIdx][PLANAR_IDX](m_predictions + predsize, 
cuSize, left, above, 0, 0);
-primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, 
above0, left0, above1, left1, (cuSize = 16));
+primitives.intra_pred[sizeIdx][PLANAR_IDX](m_predictions, cuSize, 
left, above, 0, 0);
+cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+if (cost  icost)
+icost = cost;
+uint32_t lowmode, mode;
+// fast intra prediction angle search
+if (m_param-bEnableFastIntra)
+{
+int acost = m_me.COST_MAX;
+for (mode = 4;mode  35; mode += 5)
+{
+left = (IntraFilterType[sizeIdx][mode] ? left1 : left0);
+above = (IntraFilterType[sizeIdx][mode] ? above1 : above0);
+primitives.intra_pred[sizeIdx][mode](m_predictions, cuSize, 
left, above, mode, cuSize = 16);
+cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+if (cost  acost)
+{
+lowmode = mode;
+acost = cost;
+}
+}
+mode = lowmode - 2;
+left = (IntraFilterType[sizeIdx][mode] ? left1 : left0);
+above = (IntraFilterType[sizeIdx][mode] ? above1 : above0);
+primitives.intra_pred[sizeIdx][mode](m_predictions, cuSize, left, 
above, mode, cuSize = 16);
+int lowcost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+int highcost = m_me.COST_MAX;
+if (lowmode  34)
+{
+mode = lowmode + 2;
+left = (IntraFilterType[sizeIdx][mode] ? left1 : left0);
+above = (IntraFilterType[sizeIdx][mode] ? above1 : above0);
+primitives.intra_pred[sizeIdx][mode](m_predictions, cuSize, 
left, above, mode, cuSize = 16);
+highcost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+}
+if (lowcost = highcost)
+{
+mode = lowmode - 1;
+left = (IntraFilterType[sizeIdx][mode] ? left1 : left0);
+above = 

[x265] [PATCH] Added fast intra search option

2014-08-12 Thread dtyx265
# HG changeset patch
# User David T Yuen dtyx...@gmail.com
# Date 1407882999 25200
# Node ID 75e4ad481b3668b1e420ede300287aa3ea3fb8d5
# Parent  8a7f4bb1d1be32fe668d410450c2e320ccae6098
Added fast intra search option

This version calls intra_pred_allangs  to create the predictions then the 
faster search with satd

diff -r 8a7f4bb1d1be -r 75e4ad481b36 source/common/param.cpp
--- a/source/common/param.cpp   Tue Aug 12 01:11:39 2014 -0500
+++ b/source/common/param.cpp   Tue Aug 12 15:36:39 2014 -0700
@@ -132,6 +132,7 @@
 /* Intra Coding Tools */
 param-bEnableConstrainedIntra = 0;
 param-bEnableStrongIntraSmoothing = 1;
+param-bEnableFastIntra = 0;
 
 /* Inter Coding tools */
 param-searchMethod = X265_HEX_SEARCH;
@@ -560,6 +561,7 @@
 OPT(lossless) p-bLossless = atobool(value);
 OPT(cu-lossless) p-bCULossless = atobool(value);
 OPT(constrained-intra) p-bEnableConstrainedIntra = atobool(value);
+OPT(fast-intra) p-bEnableFastIntra = atobool(value);
 OPT(open-gop) p-bOpenGOP = atobool(value);
 OPT(scenecut)
 {
@@ -1211,6 +1213,7 @@
 BOOL(p-bLossless, lossless);
 BOOL(p-bCULossless, cu-lossless);
 BOOL(p-bEnableConstrainedIntra, constrained-intra);
+BOOL(p-bEnableFastIntra, fast-intra);
 BOOL(p-bOpenGOP, open-gop);
 s += sprintf(s,  interlace=%d, p-interlaceMode);
 s += sprintf(s,  keyint=%d, p-keyframeMax);
diff -r 8a7f4bb1d1be -r 75e4ad481b36 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp  Tue Aug 12 01:11:39 2014 -0500
+++ b/source/encoder/slicetype.cpp  Tue Aug 12 15:36:39 2014 -0700
@@ -1242,6 +1242,7 @@
 {
 m_rows[i].m_widthInCU = m_widthInCU;
 m_rows[i].m_heightInCU = m_heightInCU;
+m_rows[i].m_param = m_param;
 }
 
 if (!WaveFront::init(m_heightInCU))
@@ -1676,26 +1677,86 @@
 
 int predsize = cuSize * cuSize;
 
-// generate 35 intra predictions into tmp
+// generate 35 intra predictions into m_predictions
+pixelcmp_t satd = 
primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
+int icost = m_me.COST_MAX, cost, highcost, lowcost, acost = 
m_me.COST_MAX;
+uint32_t  lowmode, mode;
 primitives.intra_pred[sizeIdx][DC_IDX](m_predictions, cuSize, left0, 
above0, 0, (cuSize = 16));
+cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+if (cost  icost)
+icost = cost;
 pixel *above = (cuSize = 8) ? above1 : above0;
 pixel *left  = (cuSize = 8) ? left1 : left0;
-primitives.intra_pred[sizeIdx][PLANAR_IDX](m_predictions + predsize, 
cuSize, left, above, 0, 0);
+primitives.intra_pred[sizeIdx][PLANAR_IDX](m_predictions, cuSize, 
left, above, 0, 0);
+cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+if (cost  icost)
+icost = cost;
 primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, 
above0, left0, above1, left1, (cuSize = 16));
 
-// calculate 35 satd costs, keep least cost
+// calculate satd costs, keep least cost
 ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
 primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE);
-pixelcmp_t satd = 
primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
-int icost = m_me.COST_MAX, cost;
-for (uint32_t mode = 0; mode  35; mode++)
+// fast-intra angle search
+if (m_param-bEnableFastIntra)
 {
-if ((mode = 2)  (mode  18))
+for (mode = 4;mode  35; mode += 5)
+{
+if (mode  18)
+cost = satd(buf_trans, cuSize, m_predictions[mode * 
predsize], cuSize);
+else
+cost = satd(m_me.fenc, FENC_STRIDE, m_predictions[mode * 
predsize], cuSize);
+if (cost  acost)
+{
+lowmode = mode;
+acost = cost;
+}
+}
+mode = lowmode - 2;
+if (mode  18)
+lowcost = satd(buf_trans, cuSize, m_predictions[mode * 
predsize], cuSize);
+else
+lowcost = satd(m_me.fenc, FENC_STRIDE, m_predictions[mode * 
predsize], cuSize);
+highcost = m_me.COST_MAX;
+if (lowmode  34)
+{
+mode = lowmode + 2;
+if (mode  18)
+highcost = satd(buf_trans, cuSize, m_predictions[mode * 
predsize], cuSize);
+else
+highcost = satd(m_me.fenc, FENC_STRIDE, 
m_predictions[mode * predsize], cuSize);
+}
+if (lowcost = highcost)
+{
+mode = lowmode - 1;
+if (lowcost  acost)
+acost = lowcost;
+}
+else
+{
+mode = lowmode + 1;
+if (highcost  acost)
+acost = highcost;
+  

Re: [x265] [PATCH] Added fast intra search option

2014-08-12 Thread Steve Borho
On 08/12, dtyx...@gmail.com wrote:
 # HG changeset patch
 # User David T Yuen dtyx...@gmail.com
 # Date 1407882999 25200
 # Node ID 75e4ad481b3668b1e420ede300287aa3ea3fb8d5
 # Parent  8a7f4bb1d1be32fe668d410450c2e320ccae6098
 Added fast intra search option
 
 This version calls intra_pred_allangs  to create the predictions then the 
 faster search with satd

on my newer CPUs, this version was unambiguously faster; so I've pushed
this version, thanks.

 diff -r 8a7f4bb1d1be -r 75e4ad481b36 source/common/param.cpp
 --- a/source/common/param.cpp Tue Aug 12 01:11:39 2014 -0500
 +++ b/source/common/param.cpp Tue Aug 12 15:36:39 2014 -0700
 @@ -132,6 +132,7 @@
  /* Intra Coding Tools */
  param-bEnableConstrainedIntra = 0;
  param-bEnableStrongIntraSmoothing = 1;
 +param-bEnableFastIntra = 0;
  
  /* Inter Coding tools */
  param-searchMethod = X265_HEX_SEARCH;
 @@ -560,6 +561,7 @@
  OPT(lossless) p-bLossless = atobool(value);
  OPT(cu-lossless) p-bCULossless = atobool(value);
  OPT(constrained-intra) p-bEnableConstrainedIntra = atobool(value);
 +OPT(fast-intra) p-bEnableFastIntra = atobool(value);
  OPT(open-gop) p-bOpenGOP = atobool(value);
  OPT(scenecut)
  {
 @@ -1211,6 +1213,7 @@
  BOOL(p-bLossless, lossless);
  BOOL(p-bCULossless, cu-lossless);
  BOOL(p-bEnableConstrainedIntra, constrained-intra);
 +BOOL(p-bEnableFastIntra, fast-intra);
  BOOL(p-bOpenGOP, open-gop);
  s += sprintf(s,  interlace=%d, p-interlaceMode);
  s += sprintf(s,  keyint=%d, p-keyframeMax);
 diff -r 8a7f4bb1d1be -r 75e4ad481b36 source/encoder/slicetype.cpp
 --- a/source/encoder/slicetype.cppTue Aug 12 01:11:39 2014 -0500
 +++ b/source/encoder/slicetype.cppTue Aug 12 15:36:39 2014 -0700
 @@ -1242,6 +1242,7 @@
  {
  m_rows[i].m_widthInCU = m_widthInCU;
  m_rows[i].m_heightInCU = m_heightInCU;
 +m_rows[i].m_param = m_param;
  }
  
  if (!WaveFront::init(m_heightInCU))
 @@ -1676,26 +1677,86 @@
  
  int predsize = cuSize * cuSize;
  
 -// generate 35 intra predictions into tmp
 +// generate 35 intra predictions into m_predictions
 +pixelcmp_t satd = 
 primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
 +int icost = m_me.COST_MAX, cost, highcost, lowcost, acost = 
 m_me.COST_MAX;
 +uint32_t  lowmode, mode;
  primitives.intra_pred[sizeIdx][DC_IDX](m_predictions, cuSize, left0, 
 above0, 0, (cuSize = 16));
 +cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
 +if (cost  icost)
 +icost = cost;
  pixel *above = (cuSize = 8) ? above1 : above0;
  pixel *left  = (cuSize = 8) ? left1 : left0;
 -primitives.intra_pred[sizeIdx][PLANAR_IDX](m_predictions + predsize, 
 cuSize, left, above, 0, 0);
 +primitives.intra_pred[sizeIdx][PLANAR_IDX](m_predictions, cuSize, 
 left, above, 0, 0);
 +cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
 +if (cost  icost)
 +icost = cost;
  primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, 
 above0, left0, above1, left1, (cuSize = 16));
  
 -// calculate 35 satd costs, keep least cost
 +// calculate satd costs, keep least cost
  ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
  primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE);
 -pixelcmp_t satd = 
 primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
 -int icost = m_me.COST_MAX, cost;
 -for (uint32_t mode = 0; mode  35; mode++)
 +// fast-intra angle search
 +if (m_param-bEnableFastIntra)
  {
 -if ((mode = 2)  (mode  18))
 +for (mode = 4;mode  35; mode += 5)
 +{
 +if (mode  18)
 +cost = satd(buf_trans, cuSize, m_predictions[mode * 
 predsize], cuSize);
 +else
 +cost = satd(m_me.fenc, FENC_STRIDE, m_predictions[mode 
 * predsize], cuSize);
 +if (cost  acost)
 +{
 +lowmode = mode;
 +acost = cost;
 +}
 +}
 +mode = lowmode - 2;
 +if (mode  18)
 +lowcost = satd(buf_trans, cuSize, m_predictions[mode * 
 predsize], cuSize);
 +else
 +lowcost = satd(m_me.fenc, FENC_STRIDE, m_predictions[mode * 
 predsize], cuSize);
 +highcost = m_me.COST_MAX;
 +if (lowmode  34)
 +{
 +mode = lowmode + 2;
 +if (mode  18)
 +highcost = satd(buf_trans, cuSize, m_predictions[mode * 
 predsize], cuSize);
 +else
 +highcost = satd(m_me.fenc, FENC_STRIDE, 
 m_predictions[mode * predsize], cuSize);
 +}
 +if (lowcost = highcost)
 +{
 +mode = lowmode - 1;