from:"praveen"

Re: [x265] [X265][PATCH] Fix: Hang when vbv is used with slices

2021-01-22 Thread Praveen Kumar Karadugattu

Hi Aruna,

I have reviewed the patch internally. Could you please review it as well and
push it to the default branch of x265?

Thanks,
Praveen

On Fri, Jan 22, 2021 at 4:52 PM Niranjan Bala 
wrote:

> From 64a985847ecca1a6937fe1dae00d3db79cf0bcb2 Mon Sep 17 00:00:00 2001
> From: Niranjan 
> Date: Fri, 22 Jan 2021 08:26:58 +0530
> Subject: [PATCH] Fix: Hang when vbv is used with slices
>
> ---
>  source/encoder/frameencoder.cpp | 30 --
>  source/encoder/frameencoder.h   |  4 ++--
>  2 files changed, 18 insertions(+), 16 deletions(-)
>
> diff --git a/source/encoder/frameencoder.cpp
> b/source/encoder/frameencoder.cpp
> index 2086a15a5..efe85282f 100644
> --- a/source/encoder/frameencoder.cpp
> +++ b/source/encoder/frameencoder.cpp
> @@ -47,8 +47,6 @@ FrameEncoder::FrameEncoder()
>  m_slicetypeWaitTime = 0;
>  m_activeWorkerCount = 0;
>  m_completionCount = 0;
> -m_bAllRowsStop = false;
> -m_vbvResetTriggerRow = -1;
>  m_outStreams = NULL;
>  m_backupStreams = NULL;
>  m_substreamSizes = NULL;
> @@ -88,6 +86,8 @@ void FrameEncoder::destroy()
>  delete[] m_outStreams;
>  delete[] m_backupStreams;
>  X265_FREE(m_sliceBaseRow);
> +X265_FREE((void*)m_bAllRowsStop);
> +X265_FREE((void*)m_vbvResetTriggerRow);
>  X265_FREE(m_sliceMaxBlockRow);
>  X265_FREE(m_cuGeoms);
>  X265_FREE(m_ctuGeomMap);
> @@ -118,6 +118,8 @@ bool FrameEncoder::init(Encoder *top, int numRows, int
> numCols)
>  bool ok = !!m_numRows;
>
>  m_sliceBaseRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
> +m_bAllRowsStop = X265_MALLOC(bool, m_param->maxSlices);
> +m_vbvResetTriggerRow = X265_MALLOC(int, m_param->maxSlices);
>  ok &= !!m_sliceBaseRow;
>  m_sliceGroupSize = (uint16_t)(m_numRows + m_param->maxSlices - 1) /
> m_param->maxSlices;
>  uint32_t sliceGroupSizeAccu = (m_numRows << 8) / m_param->maxSlices;
>
> @@ -438,8 +440,8 @@ void FrameEncoder::compressFrame()
>  m_stallStartTime = 0;
>
>  m_completionCount = 0;
> -m_bAllRowsStop = false;
> -m_vbvResetTriggerRow = -1;
> +memset((void*)m_bAllRowsStop, 0, sizeof(bool) * m_param->maxSlices);
> +memset((void*)m_vbvResetTriggerRow, -1, sizeof(int) *
> m_param->maxSlices);
>  m_rowSliceTotalBits[0] = 0;
>  m_rowSliceTotalBits[1] = 0;
>
> @@ -1469,16 +1471,16 @@ void FrameEncoder::processRowEncoder(int intRow,
> ThreadLocalData& tld)
>  curRow.bufferedEntropy.copyState(rowCoder);
>  curRow.bufferedEntropy.loadContexts(rowCoder);
>  }
> -if (bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
>
> +if (bFirstRowInSlice && m_vbvResetTriggerRow[curRow.sliceId]
> != intRow)
>  {
>  curEncData.m_rowStat[row].rowQp = curEncData.m_avgQpRc;
>  curEncData.m_rowStat[row].rowQpScale =
> x265_qp2qScale(curEncData.m_avgQpRc);
>  }
>
>  FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];
> -if (m_param->bEnableWavefront && rowInSlice >= col &&
> !bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
> +if (m_param->bEnableWavefront && rowInSlice >= col &&
> !bFirstRowInSlice && m_vbvResetTriggerRow[curRow.sliceId] != intRow)
>  cuStat.baseQp = curEncData.m_cuStat[cuAddr - numCols +
> 1].baseQp;
> -else if (!m_param->bEnableWavefront && !bFirstRowInSlice &&
> m_vbvResetTriggerRow != intRow)
> +else if (!m_param->bEnableWavefront && !bFirstRowInSlice &&
> m_vbvResetTriggerRow[curRow.sliceId] != intRow)
>  cuStat.baseQp = curEncData.m_rowStat[row - 1].rowQp;
>  else
>  cuStat.baseQp = curEncData.m_rowStat[row].rowQp;
> @@ -1655,7 +1657,7 @@ void FrameEncoder::processRowEncoder(int intRow,
> ThreadLocalData& tld)
>  x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d -
> encode restart required for VBV, to %.2f from %.2f\n",
>  m_frame->m_poc, row, qpBase,
> curEncData.m_cuStat[cuAddr].baseQp);
>
> -m_vbvResetTriggerRow = row;
> +m_vbvResetTriggerRow[curRow.sliceId] = row;
>  m_outStreams[0].copyBits(&m_backupStreams[0]);
>
>  rowCoder.copyState(curRow.bufferedEntropy);
> @@ -1707,8 +1709,8 @@ void FrameEncoder::processRowEncoder(int intRow,
> ThreadLocalData& tld)
>

Re: [x265] [PATCH] fix: help for rskip cli option to avoid make errors

2020-09-16 Thread Praveen Kumar Karadugattu

Looks good to me.

Regards,
Praveen

On Wed, Sep 16, 2020 at 7:06 PM Srikanth Kurapati <
srikanth.kurap...@multicorewareinc.com> wrote:

> From a92bc566e03f473af25db8f78d1eb3f40106a959 Mon Sep 17 00:00:00 2001
> From: Srikanth Kurapati 
> Date: Fri, 4 Sep 2020 11:06:39 +0530
> Subject: [PATCH] fix: help for rskip cli option to avoid make errors
>
> ---
>  source/x265cli.cpp | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/source/x265cli.cpp b/source/x265cli.cpp
> index c28dd7f8c..2575e02cd 100755
> --- a/source/x265cli.cpp
> +++ b/source/x265cli.cpp
> @@ -127,8 +127,7 @@ namespace X265_NS {
>  H0("   --[no-]ssim-rdEnable ssim rate distortion
> optimization, 0 to disable. Default %s\n", OPT(param->bSsimRd));
>  H0("   --[no-]rd-refine  Enable QP based RD
> refinement for rd levels 5 and 6. Default %s\n",
> OPT(param->bEnableRdRefine));
>  H0("   --[no-]early-skip Enable early SKIP detection.
> Default %s\n", OPT(param->bEnableEarlySkip));
> -H0("   --rskip Set mode for early exit from
> recursion. Mode 1: exit using rdcost & CU homogenity. Mode 2: exit using CU
> edge density.\n"
> -" Mode 0: disabled. Default
> %d\n", param->recursionSkipMode);
> +H0("   --rskip  Enable recurison skip for
> early exit. 1: exit using rdcost & CU homogenity. 2: exit using CU edge
> density. 0: disabled. Default %d\n", param->recursionSkipMode);
>  H1("   --rskip-edge-thresholdThreshold in terms of
> percentage (integer of range [0,100]) for minimum edge density in CUs used
> to prun the recursion depth. Applicable only for rskip mode 2. Value is
> preset dependent. Default: %.f\n", param->edgeVarThreshold*100.0f);
>  H1("   --[no-]tskip-fast Enable fast intra transform
> skipping. Default %s\n", OPT(param->bEnableTSkipFast));
>  H1("   --[no-]splitrd-skip   Enable skipping split RD
> analysis when sum of split CU rdCost larger than one split CU rdCost for
> Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
> --
> 2.20.1.windows.1
>
> --
> *With Regards,*
> *Srikanth Kurapati.*
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] Corrected wrong cli in docs wrt --colormatrix & --videoformat

2020-09-01 Thread Praveen Kumar Karadugattu

>From 866e7d77aa113dcfd9596c27e7dda70d8da8220b Mon Sep 17 00:00:00 2001
From: Praveen Kumar Karadugattu 
Date: Tue, 1 Sep 2020 22:48:30 +0530
Subject: [PATCH] Corrected wrong cli in docs wrt --colormatrix &
--videoformat

---
 doc/reST/cli.rst| 104

 source/CMakeLists.txt   |   2 +-
 source/common/param.cpp |  38 ++
 source/x265cli.cpp  |  12 +++---
 4 files changed, 80 insertions(+), 76 deletions(-)
 mode change 100644 => 100755 doc/reST/cli.rst
 mode change 100644 => 100755 source/CMakeLists.txt
 mode change 100644 => 100755 source/common/param.cpp
 mode change 100644 => 100755 source/x265cli.cpp

diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
old mode 100644
new mode 100755
index 02828e3..e6c628c
--- a/doc/reST/cli.rst
+++ b/doc/reST/cli.rst
@@ -141,7 +141,7 @@ Logging/Statistic Options
  **Residual Energy** Average residual energy. SSE is calculated on fenc
  and pred(before quantization).

- **Luma/Chroma Values** minumum, maximum and average(averaged by area)
+ **Luma/Chroma Values** minimum, maximum and average(averaged by area)
  luma and chroma values of source for each frame.

  **PU Statistics** percentage of PU modes at each depth.
@@ -246,7 +246,7 @@ Performance Options

 .. option:: --pools , --numa-pools 

- Comma seperated list of threads per NUMA node. If "none", then no worker
+ Comma separated list of threads per NUMA node. If "none", then no worker
  pools are created and only frame parallelism is possible. If NULL or ""
  (default) x265 will use all available threads on each NUMA node::

@@ -284,7 +284,7 @@ Performance Options
  the last thread pool is spawned only if it has more than 32 threads for
  64-bit machines, or 16 for 32-bit machines. If the total number of threads
  in the system doesn't obey this constraint, we may spawn fewer threads
- than cores which has been emperically shown to be better for performance.
+ than cores which has been empirically shown to be better for performance.

  If the four pool features: :option:`--wpp`, :option:`--pmode`,
  :option:`--pme` and :option:`--lookahead-slices` are all disabled,
@@ -409,7 +409,7 @@ Performance Options

  Allow encoder to copy input x265 pictures to internal frame buffers. When
disabled,
  x265 will not make an internal copy of the input picture and will work
with the
- application's buffers. While this allows for deeper integration, it is
the responsbility
+ application's buffers. While this allows for deeper integration, it is
the responsibility
  of the application to (a) ensure that the allocated picture has extra
space for padding
  that will be done by the library, and (b) the buffers aren't recycled
until the library
  has completed encoding this frame (which can be figured out by tracking
NALs output by x265)
@@ -554,7 +554,7 @@ frame counts) are only applicable to the CLI
application.

 .. option:: --chunk-start 

- First frame of the chunk. Frames preceeding this in display order will
+ First frame of the chunk. Frames preceding this in display order will
  be encoded, however, they will be discarded in the bitstream. This
  feature can be enabled only in closed GOP structures.
  Default 0 (disabled).
@@ -562,7 +562,7 @@ frame counts) are only applicable to the CLI
application.
 .. option:: --chunk-end 

  Last frame of the chunk. Frames following this in display order will be
- used in taking lookahead decisions, but, they will not be encoded.
+ used in taking lookahead decisions, but they will not be encoded.
  This feature can be enabled only in closed GOP structures.
  Default 0 (disabled).

@@ -638,7 +638,7 @@ Profile, Level, Tier
  If :option:`--level-idc` has been specified, --high-tier allows the
  support of high tier at that level. The encoder will first attempt to
encode
  at the specified level, main tier first, turning on high tier only if
- necessary and available at that level.If your requested level does not
+ necessary and available at that level. If your requested level does not
  support a High tier, high tier will not be supported. If --no-high-tier
  has been specified, then the encoder will attempt to encode only at the
main tier.

@@ -647,8 +647,8 @@ Profile, Level, Tier
 .. option:: --ref <1..16>

  Max number of L0 references to be allowed. This number has a linear
- multiplier effect on the amount of work performed in motion search,
- but will generally have a beneficial affect on compression and
+ multiplier effect on the amount of work performed in motion search
+ but will generally have a beneficial effect on compression and
  distortion.

  Note that x265 allows up to 16 L0 references but the HEVC
@@ -668,7 +668,7 @@ Profile, Level, Tier
 .. option:: --allow-non-conformance, --no-allow-non-conformance

  Allow libx265 to generate a bitstream with profile and level NONE.
- By default it will abort any encode which does not meet strict level
+ By defau

[x265] [Test-harness] Added the normalization fix of hist-scenecut to output-changing-commits

2020-07-06 Thread Praveen Kumar Karadugattu

>From 09d42e9e3850d4f6424fb7b4e8620e4eb3ec7389 Mon Sep 17 00:00:00 2001
From: Praveen Kumar Karadugattu 
Date: Mon, 6 Jul 2020 13:21:59 +0530
Subject: [PATCH] Added the normalization fix of hist-scenectu to
 output-changing-commits

---
 output-changing-commits-git.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/output-changing-commits-git.txt
b/output-changing-commits-git.txt
index ead6d85..d60c5bc 100644
--- a/output-changing-commits-git.txt
+++ b/output-changing-commits-git.txt
@@ -18,5 +18,6 @@
 # 2) not required [sao], [weightp], [cutree] etc.., because these are
 # already set in presets so golden outputs will store for that preset.

+38774073d45138b01a6abd0e2cfcecae01038a72 [hist-scenecut] Fixed the
normalization formula to ouput 0 to 1 and considered max chroma histogram
SAD in histogram based scene cut detection
 3f476a384a190bab44a2bdcf94a081ccc58b13e8 Merge with default
 7bd63522910add904aaf878e85c2e7a2fece80cd analysis-save/load: fix crash
during analysis sharing between non-dyadic resolutions
-- 
1.8.3.1


0001-Added-the-normalization-fix-of-hist-scenectu-to-outp.patch
Description: Binary data
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] Fixed the normalization formula to output 0 to 1 and considered max chroma histogram SAD in histogram based scene cut detection

2020-06-22 Thread Praveen Kumar Karadugattu

>From 44704e10a60ae314ecd13dfb84c0c4f82d6c1a55 Mon Sep 17 00:00:00 2001
From: Praveen Kumar Karadugattu 
Date: Wed, 17 Jun 2020 19:28:06 +0530
Subject: [PATCH] Fixed the normalization formula to ouput 0 to 1 and
 considered max chroma histogram SAD in histogram based scene cut detection

---
 doc/reST/cli.rst   |  8 ++---
 source/common/param.cpp|  4 +--
 source/encoder/encoder.cpp | 75
++
 source/encoder/encoder.h   |  6 ++--
 source/x265cli.cpp |  2 +-
 5 files changed, 45 insertions(+), 50 deletions(-)

diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
index eceec40..c9e288e 100644
--- a/doc/reST/cli.rst
+++ b/doc/reST/cli.rst
@@ -1462,13 +1462,13 @@ Slice decision options
 .. option:: --hist-scenecut, --no-hist-scenecut

  Indicates that scenecuts need to be detected using luma edge and chroma
histograms.
- :option: `--hist-scenecut` enables scenecut detection using the
histograms and disables the default scene cut algorithm.
- :option: `--no-hist-scenecut` disables histogram based scenecut algorithm.
+ :option:`--hist-scenecut` enables scenecut detection using the histograms
and disables the default scene cut algorithm.
+ :option:`--no-hist-scenecut` disables histogram based scenecut algorithm.

-.. option:: --hist-threshold <0.0..2.0>
+.. option:: --hist-threshold <0.0..1.0>

  This value represents the threshold for normalized SAD of edge histograms
used in scenecut detection.
- This requires :option: `--hist-scenecut` to be enabled. For example, a
value of 0.2 indicates that a frame with normalized SAD value
+ This requires :option:`--hist-scenecut` to be enabled. For example, a
value of 0.2 indicates that a frame with normalized SAD value
  greater than 0.2 against the previous frame as scenecut.
  Default 0.01.

diff --git a/source/common/param.cpp b/source/common/param.cpp
index fb7244e..925f0c4 100644
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -1688,8 +1688,8 @@ int x265_check_params(x265_param* param)
   "scenecutThreshold must be greater than 0");
 CHECK(param->scenecutBias < 0 || 100 < param->scenecutBias,
 "scenecut-bias must be between 0 and 100");
-CHECK(param->edgeTransitionThreshold < 0.0 || 2.0 <
param->edgeTransitionThreshold,
-"hist-threshold must be between 0.0 and 2.0");
+CHECK(param->edgeTransitionThreshold < 0.0 || 1.0 <
param->edgeTransitionThreshold,
+"hist-threshold must be between 0.0 and 1.0");
 CHECK(param->radl < 0 || param->radl > param->bframes,
   "radl must be between 0 and bframes");
 CHECK(param->rdPenalty < 0 || param->rdPenalty > 2,
diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp
index 752e5b2..f6bc540 100644
--- a/source/encoder/encoder.cpp
+++ b/source/encoder/encoder.cpp
@@ -222,12 +222,9 @@ void Encoder::create()
 uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 2 : 1;
 m_edgePic = X265_MALLOC(pixel, m_planeSizes[0] * pixelbytes);
 m_edgeHistThreshold = m_param->edgeTransitionThreshold;
-m_chromaHistThreshold = m_edgeHistThreshold * 10.0;
-m_chromaHistThreshold = x265_min(m_chromaHistThreshold,
MAX_SCENECUT_THRESHOLD);
-m_scaledEdgeThreshold = m_edgeHistThreshold *
SCENECUT_STRENGTH_FACTOR;
-m_scaledEdgeThreshold = x265_min(m_scaledEdgeThreshold,
MAX_SCENECUT_THRESHOLD);
-m_scaledChromaThreshold = m_chromaHistThreshold *
SCENECUT_STRENGTH_FACTOR;
-m_scaledChromaThreshold = x265_min(m_scaledChromaThreshold,
MAX_SCENECUT_THRESHOLD);
+m_chromaHistThreshold = x265_min(m_edgeHistThreshold * 10.0,
MAX_SCENECUT_THRESHOLD);
+m_scaledEdgeThreshold = x265_min(m_edgeHistThreshold *
SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
+m_scaledChromaThreshold = x265_min(m_chromaHistThreshold *
SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
 if (m_param->sourceBitDepth != m_param->internalBitDepth)
 {
 int size = m_param->sourceWidth * m_param->sourceHeight;
@@ -1450,13 +1447,14 @@ bool Encoder::computeHistograms(x265_picture *pic)
 memset(edgeHist, 0, EDGE_BINS * sizeof(int32_t));
 for (uint32_t i = 0; i < m_planeSizes[0]; i++)
 {
-if (!m_edgePic[i])
-   edgeHist[0]++;
+if (m_edgePic[i])
+edgeHist[1]++;
 else
-   edgeHist[1]++;
+edgeHist[0]++;
 }
+
 /* Y Histogram Calculation */
-int32_t* yHist = m_curYUVHist[0];
+int32_t *yHist = m_curYUVHist[0];
 memset(yHist, 0, HISTOGRAM_BINS * sizeof(int32_t));
 for (uint32_t i = 0; i < m_planeSizes[0]; i++)
 {
@@ -1468,7 +1466,7 @@ bool Encoder::computeHistograms(x265_picture *pic)
 {
 /* U Histogram Calculation */
 int32_t *uHist = m_curYUVHist[1];
-

[x265] [PATCH] Fixed the normalization formula to output 0 to 1 and considered max chroma histogram SAD in histogram based scene cut detection

2020-06-22 Thread Praveen Kumar Karadugattu

>From 44704e10a60ae314ecd13dfb84c0c4f82d6c1a55 Mon Sep 17 00:00:00 2001
From: Praveen Kumar Karadugattu 
Date: Wed, 17 Jun 2020 19:28:06 +0530
Subject: [PATCH] Fixed the normalization formula to ouput 0 to 1 and
 considered max chroma histogram SAD in histogram based scene cut detection

---
 doc/reST/cli.rst   |  8 ++---
 source/common/param.cpp|  4 +--
 source/encoder/encoder.cpp | 75
++
 source/encoder/encoder.h   |  6 ++--
 source/x265cli.cpp |  2 +-
 5 files changed, 45 insertions(+), 50 deletions(-)

diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
index eceec40..c9e288e 100644
--- a/doc/reST/cli.rst
+++ b/doc/reST/cli.rst
@@ -1462,13 +1462,13 @@ Slice decision options
 .. option:: --hist-scenecut, --no-hist-scenecut

  Indicates that scenecuts need to be detected using luma edge and chroma
histograms.
- :option: `--hist-scenecut` enables scenecut detection using the
histograms and disables the default scene cut algorithm.
- :option: `--no-hist-scenecut` disables histogram based scenecut algorithm.
+ :option:`--hist-scenecut` enables scenecut detection using the histograms
and disables the default scene cut algorithm.
+ :option:`--no-hist-scenecut` disables histogram based scenecut algorithm.

-.. option:: --hist-threshold <0.0..2.0>
+.. option:: --hist-threshold <0.0..1.0>

  This value represents the threshold for normalized SAD of edge histograms
used in scenecut detection.
- This requires :option: `--hist-scenecut` to be enabled. For example, a
value of 0.2 indicates that a frame with normalized SAD value
+ This requires :option:`--hist-scenecut` to be enabled. For example, a
value of 0.2 indicates that a frame with normalized SAD value
  greater than 0.2 against the previous frame as scenecut.
  Default 0.01.

diff --git a/source/common/param.cpp b/source/common/param.cpp
index fb7244e..925f0c4 100644
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -1688,8 +1688,8 @@ int x265_check_params(x265_param* param)
   "scenecutThreshold must be greater than 0");
 CHECK(param->scenecutBias < 0 || 100 < param->scenecutBias,
 "scenecut-bias must be between 0 and 100");
-CHECK(param->edgeTransitionThreshold < 0.0 || 2.0 <
param->edgeTransitionThreshold,
-"hist-threshold must be between 0.0 and 2.0");
+CHECK(param->edgeTransitionThreshold < 0.0 || 1.0 <
param->edgeTransitionThreshold,
+"hist-threshold must be between 0.0 and 1.0");
 CHECK(param->radl < 0 || param->radl > param->bframes,
   "radl must be between 0 and bframes");
 CHECK(param->rdPenalty < 0 || param->rdPenalty > 2,
diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp
index 752e5b2..f6bc540 100644
--- a/source/encoder/encoder.cpp
+++ b/source/encoder/encoder.cpp
@@ -222,12 +222,9 @@ void Encoder::create()
 uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 2 : 1;
 m_edgePic = X265_MALLOC(pixel, m_planeSizes[0] * pixelbytes);
 m_edgeHistThreshold = m_param->edgeTransitionThreshold;
-m_chromaHistThreshold = m_edgeHistThreshold * 10.0;
-m_chromaHistThreshold = x265_min(m_chromaHistThreshold,
MAX_SCENECUT_THRESHOLD);
-m_scaledEdgeThreshold = m_edgeHistThreshold *
SCENECUT_STRENGTH_FACTOR;
-m_scaledEdgeThreshold = x265_min(m_scaledEdgeThreshold,
MAX_SCENECUT_THRESHOLD);
-m_scaledChromaThreshold = m_chromaHistThreshold *
SCENECUT_STRENGTH_FACTOR;
-m_scaledChromaThreshold = x265_min(m_scaledChromaThreshold,
MAX_SCENECUT_THRESHOLD);
+m_chromaHistThreshold = x265_min(m_edgeHistThreshold * 10.0,
MAX_SCENECUT_THRESHOLD);
+m_scaledEdgeThreshold = x265_min(m_edgeHistThreshold *
SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
+m_scaledChromaThreshold = x265_min(m_chromaHistThreshold *
SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
 if (m_param->sourceBitDepth != m_param->internalBitDepth)
 {
 int size = m_param->sourceWidth * m_param->sourceHeight;
@@ -1450,13 +1447,14 @@ bool Encoder::computeHistograms(x265_picture *pic)
 memset(edgeHist, 0, EDGE_BINS * sizeof(int32_t));
 for (uint32_t i = 0; i < m_planeSizes[0]; i++)
 {
-if (!m_edgePic[i])
-   edgeHist[0]++;
+if (m_edgePic[i])
+edgeHist[1]++;
 else
-   edgeHist[1]++;
+edgeHist[0]++;
 }
+
 /* Y Histogram Calculation */
-int32_t* yHist = m_curYUVHist[0];
+int32_t *yHist = m_curYUVHist[0];
 memset(yHist, 0, HISTOGRAM_BINS * sizeof(int32_t));
 for (uint32_t i = 0; i < m_planeSizes[0]; i++)
 {
@@ -1468,7 +1466,7 @@ bool Encoder::computeHistograms(x265_picture *pic)
 {
 /* U Histogram Calculation */
 int32_t *uHist = m_curYUVHist[1];
-

[x265] [PATCH]Fixed the --hist-scenecut feature to consider the max variation in chroma histograms along with luma edge histograms. Also fixed the formula for normalizing both the SAD values from 0.0

2020-06-09 Thread Praveen Kumar Karadugattu

>From 0bc864dbc48624902e5a8314d9ec49ce19a84146 Mon Sep 17 00:00:00 2001
From: Praveen Kumar Karadugattu 
Date: Tue, 9 Jun 2020 20:27:48 +0530
Subject: [PATCH] Fixed the --hist-scenecut feature to consider the max
 variation in chroma histograms along with luma edge histograms. Also fixed
 the formula for normalizing both the SAD values from 0.0 to 1.0. This would
 alleviate the false positive scene-cuts observed with this feature.
---
 doc/reST/cli.rst   |  8 ++---
 source/common/param.cpp|  4 +--
 source/encoder/encoder.cpp | 75
++
 source/encoder/encoder.h   |  6 ++--
 source/x265cli.cpp |  4 +--
 5 files changed, 46 insertions(+), 51 deletions(-)
 mode change 100644 => 100755 doc/reST/cli.rst
 mode change 100644 => 100755 source/common/param.cpp
 mode change 100644 => 100755 source/encoder/encoder.cpp
 mode change 100644 => 100755 source/encoder/encoder.h
 mode change 100644 => 100755 source/x265cli.cpp
diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
old mode 100644
new mode 100755
index eceec40..c9e288e
--- a/doc/reST/cli.rst
+++ b/doc/reST/cli.rst
@@ -1462,13 +1462,13 @@ Slice decision options
 .. option:: --hist-scenecut, --no-hist-scenecut

  Indicates that scenecuts need to be detected using luma edge and chroma
histograms.
- :option: `--hist-scenecut` enables scenecut detection using the
histograms and disables the default scene cut algorithm.
- :option: `--no-hist-scenecut` disables histogram based scenecut algorithm.
+ :option:`--hist-scenecut` enables scenecut detection using the histograms
and disables the default scene cut algorithm.
+ :option:`--no-hist-scenecut` disables histogram based scenecut algorithm.

-.. option:: --hist-threshold <0.0..2.0>
+.. option:: --hist-threshold <0.0..1.0>

  This value represents the threshold for normalized SAD of edge histograms
used in scenecut detection.
- This requires :option: `--hist-scenecut` to be enabled. For example, a
value of 0.2 indicates that a frame with normalized SAD value
+ This requires :option:`--hist-scenecut` to be enabled. For example, a
value of 0.2 indicates that a frame with normalized SAD value
  greater than 0.2 against the previous frame as scenecut.
  Default 0.01.

diff --git a/source/common/param.cpp b/source/common/param.cpp
old mode 100644
new mode 100755
index fb7244e..925f0c4
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -1688,8 +1688,8 @@ int x265_check_params(x265_param* param)
   "scenecutThreshold must be greater than 0");
 CHECK(param->scenecutBias < 0 || 100 < param->scenecutBias,
 "scenecut-bias must be between 0 and 100");
-CHECK(param->edgeTransitionThreshold < 0.0 || 2.0 <
param->edgeTransitionThreshold,
-"hist-threshold must be between 0.0 and 2.0");
+CHECK(param->edgeTransitionThreshold < 0.0 || 1.0 <
param->edgeTransitionThreshold,
+"hist-threshold must be between 0.0 and 1.0");
 CHECK(param->radl < 0 || param->radl > param->bframes,
   "radl must be between 0 and bframes");
 CHECK(param->rdPenalty < 0 || param->rdPenalty > 2,
diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp
old mode 100644
new mode 100755
index 752e5b2..f6bc540
--- a/source/encoder/encoder.cpp
+++ b/source/encoder/encoder.cpp
@@ -222,12 +222,9 @@ void Encoder::create()
 uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 2 : 1;
 m_edgePic = X265_MALLOC(pixel, m_planeSizes[0] * pixelbytes);
 m_edgeHistThreshold = m_param->edgeTransitionThreshold;
-m_chromaHistThreshold = m_edgeHistThreshold * 10.0;
-m_chromaHistThreshold = x265_min(m_chromaHistThreshold,
MAX_SCENECUT_THRESHOLD);
-m_scaledEdgeThreshold = m_edgeHistThreshold *
SCENECUT_STRENGTH_FACTOR;
-m_scaledEdgeThreshold = x265_min(m_scaledEdgeThreshold,
MAX_SCENECUT_THRESHOLD);
-m_scaledChromaThreshold = m_chromaHistThreshold *
SCENECUT_STRENGTH_FACTOR;
-m_scaledChromaThreshold = x265_min(m_scaledChromaThreshold,
MAX_SCENECUT_THRESHOLD);
+m_chromaHistThreshold = x265_min(m_edgeHistThreshold * 10.0,
MAX_SCENECUT_THRESHOLD);
+m_scaledEdgeThreshold = x265_min(m_edgeHistThreshold *
SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
+m_scaledChromaThreshold = x265_min(m_chromaHistThreshold *
SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD);
 if (m_param->sourceBitDepth != m_param->internalBitDepth)
 {
 int size = m_param->sourceWidth * m_param->sourceHeight;
@@ -1450,13 +1447,14 @@ bool Encoder::computeHistograms(x265_picture *pic)
 memset(edgeHist, 0, EDGE_BINS * sizeof(int32_t));
 for (uint32_t i = 0; i < m_planeSizes[0]; i++)
 {
-if (!m_edgePic[i])
-   edgeHist[0]++;
+if (m_edgePic[i])
+

[x265] [PATCH]Fixed some of the wrongly represented cli parameters related to --colormatrix and --videoformat

2020-05-26 Thread Praveen Kumar Karadugattu

>From f8664c406ee597b862ca3ee43d6e008bba5d7004 Mon Sep 17 00:00:00 2001
From: Praveen Kumar Karadugattu 
Date: Tue, 26 May 2020 18:12:51 +0530
Subject: [PATCH] Fixed some of the wrongly represented cli parameters
related
 to --colormatrix and --videoformat
---
 doc/reST/cli.rst| 110

 source/CMakeLists.txt   |   2 +-
 source/common/param.cpp |  12 +++---
 source/x265cli.cpp  |  12 +++---
 4 files changed, 68 insertions(+), 68 deletions(-)
diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
index eceec40..6a04100 100644
--- a/doc/reST/cli.rst
+++ b/doc/reST/cli.rst
@@ -141,7 +141,7 @@ Logging/Statistic Options
  **Residual Energy** Average residual energy. SSE is calculated on fenc
  and pred(before quantization).

- **Luma/Chroma Values** minumum, maximum and average(averaged by area)
+ **Luma/Chroma Values** minimum, maximum and average(averaged by area)
  luma and chroma values of source for each frame.

  **PU Statistics** percentage of PU modes at each depth.
@@ -246,7 +246,7 @@ Performance Options

 .. option:: --pools , --numa-pools 

- Comma seperated list of threads per NUMA node. If "none", then no worker
+ Comma separated list of threads per NUMA node. If "none", then no worker
  pools are created and only frame parallelism is possible. If NULL or ""
  (default) x265 will use all available threads on each NUMA node::

@@ -284,7 +284,7 @@ Performance Options
  the last thread pool is spawned only if it has more than 32 threads for
  64-bit machines, or 16 for 32-bit machines. If the total number of threads
  in the system doesn't obey this constraint, we may spawn fewer threads
- than cores which has been emperically shown to be better for performance.
+ than cores which has been empirically shown to be better for performance.

  If the four pool features: :option:`--wpp`, :option:`--pmode`,
  :option:`--pme` and :option:`--lookahead-slices` are all disabled,
@@ -409,7 +409,7 @@ Performance Options

  Allow encoder to copy input x265 pictures to internal frame buffers. When
disabled,
  x265 will not make an internal copy of the input picture and will work
with the
- application's buffers. While this allows for deeper integration, it is
the responsbility
+ application's buffers. While this allows for deeper integration, it is
the responsibility
  of the application to (a) ensure that the allocated picture has extra
space for padding
  that will be done by the library, and (b) the buffers aren't recycled
until the library
  has completed encoding this frame (which can be figured out by tracking
NALs output by x265)
@@ -554,7 +554,7 @@ frame counts) are only applicable to the CLI
application.

 .. option:: --chunk-start 

- First frame of the chunk. Frames preceeding this in display order will
+ First frame of the chunk. Frames preceding this in display order will
  be encoded, however, they will be discarded in the bitstream. This
  feature can be enabled only in closed GOP structures.
  Default 0 (disabled).
@@ -562,7 +562,7 @@ frame counts) are only applicable to the CLI
application.
 .. option:: --chunk-end 

  Last frame of the chunk. Frames following this in display order will be
- used in taking lookahead decisions, but, they will not be encoded.
+ used in taking lookahead decisions, but they will not be encoded.
  This feature can be enabled only in closed GOP structures.
  Default 0 (disabled).

@@ -638,7 +638,7 @@ Profile, Level, Tier
  If :option:`--level-idc` has been specified, --high-tier allows the
  support of high tier at that level. The encoder will first attempt to
encode
  at the specified level, main tier first, turning on high tier only if
- necessary and available at that level.If your requested level does not
+ necessary and available at that level. If your requested level does not
  support a High tier, high tier will not be supported. If --no-high-tier
  has been specified, then the encoder will attempt to encode only at the
main tier.

@@ -647,8 +647,8 @@ Profile, Level, Tier
 .. option:: --ref <1..16>

  Max number of L0 references to be allowed. This number has a linear
- multiplier effect on the amount of work performed in motion search,
- but will generally have a beneficial affect on compression and
+ multiplier effect on the amount of work performed in motion search
+ but will generally have a beneficial effect on compression and
  distortion.

  Note that x265 allows up to 16 L0 references but the HEVC
@@ -668,7 +668,7 @@ Profile, Level, Tier
 .. option:: --allow-non-conformance, --no-allow-non-conformance

  Allow libx265 to generate a bitstream with profile and level NONE.
- By default it will abort any encode which does not meet strict level
+ By default, it will abort any encode which does not meet strict level
  compliance. The two most likely causes for non-conformance are
  :option:`--ctu` being too small, :option:`--ref` being too high,
  or the bitrate o

Re: [x265] [PATCH] Fixed some of the wrongly represented cli parameters in the docs related to --colormatrix and --videoformat.

2020-05-25 Thread Praveen Kumar Karadugattu

Hi Aruna,

Please find attached the updated patch with the review comments
incorporated.

Thanks & Regards,
Praveen

On Mon, May 4, 2020 at 4:56 PM Praveen Kumar Karadugattu <
praveenku...@multicorewareinc.com> wrote:

> Hi Aruna,
>
> I have incorporated the changes and the updated patch is sent in the
> previous email in this thread. Please check and push the same.
>
> Thanks & Regards,
> Praveen
>
> On Mon, May 4, 2020 at 4:55 PM Praveen Kumar Karadugattu <
> praveenku...@multicorewareinc.com> wrote:
>
>> From 951411943ed54043c2111f4a09419cbc77e5f0fd Mon Sep 17 00:00:00 2001
>> From: Praveen Karadugattu 
>> Date: Mon, 4 May 2020 16:50:14 +0530
>> Subject: [PATCH] Fixed some of the wrongly represented cli parameters in
>> the
>>  docs related to --colormatrix and --videoformat.
>> ---
>>  doc/reST/cli.rst| 112
>> 
>>  source/CMakeLists.txt   |   2 +-
>>  source/common/param.cpp |  12 +++---
>>  source/x265cli.cpp  |  12 +++---
>>  4 files changed, 69 insertions(+), 69 deletions(-)
>> diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
>> index 1e2765d..524714a 100644
>> --- a/doc/reST/cli.rst
>> +++ b/doc/reST/cli.rst
>> @@ -141,7 +141,7 @@ Logging/Statistic Options
>>   **Residual Energy** Average residual energy. SSE is calculated on fenc
>>   and pred(before quantization).
>>
>> - **Luma/Chroma Values** minumum, maximum and average(averaged by area)
>> + **Luma/Chroma Values** minimum, maximum and average(averaged by area)
>>   luma and chroma values of source for each frame.
>>
>>   **PU Statistics** percentage of PU modes at each depth.
>> @@ -246,7 +246,7 @@ Performance Options
>>
>>  .. option:: --pools , --numa-pools 
>>
>> - Comma seperated list of threads per NUMA node. If "none", then no worker
>> + Comma separated list of threads per NUMA node. If "none", then no worker
>>   pools are created and only frame parallelism is possible. If NULL or ""
>>   (default) x265 will use all available threads on each NUMA node::
>>
>> @@ -284,7 +284,7 @@ Performance Options
>>   the last thread pool is spawned only if it has more than 32 threads for
>>   64-bit machines, or 16 for 32-bit machines. If the total number of
>> threads
>>   in the system doesn't obey this constraint, we may spawn fewer threads
>> - than cores which has been emperically shown to be better for
>> performance.
>> + than cores which has been empirically shown to be better for
>> performance.
>>
>>   If the four pool features: :option:`--wpp`, :option:`--pmode`,
>>   :option:`--pme` and :option:`--lookahead-slices` are all disabled,
>> @@ -409,7 +409,7 @@ Performance Options
>>
>>   Allow encoder to copy input x265 pictures to internal frame buffers.
>> When disabled,
>>   x265 will not make an internal copy of the input picture and will work
>> with the
>> - application's buffers. While this allows for deeper integration, it is
>> the responsbility
>> + application's buffers. While this allows for deeper integration, it is
>> the responsibility
>>   of the application to (a) ensure that the allocated picture has extra
>> space for padding
>>   that will be done by the library, and (b) the buffers aren't recycled
>> until the library
>>   has completed encoding this frame (which can be figured out by tracking
>> NALs output by x265)
>> @@ -554,7 +554,7 @@ frame counts) are only applicable to the CLI
>> application.
>>
>>  .. option:: --chunk-start 
>>
>> - First frame of the chunk. Frames preceeding this in display order will
>> + First frame of the chunk. Frames preceding this in display order will
>>   be encoded, however, they will be discarded in the bitstream. This
>>   feature can be enabled only in closed GOP structures.
>>   Default 0 (disabled).
>> @@ -562,7 +562,7 @@ frame counts) are only applicable to the CLI
>> application.
>>  .. option:: --chunk-end 
>>
>>   Last frame of the chunk. Frames following this in display order will be
>> - used in taking lookahead decisions, but, they will not be encoded.
>> + used in taking lookahead decisions, but they will not be encoded.
>>   This feature can be enabled only in closed GOP structures.
>>   Default 0 (disabled).
>>
>> @@ -638,7 +638,7 @@ Profile, Level, Tier
>>   If :option:`--level-idc` has been specified, --high-tier allows the
>>   support of high tier at that level. The encoder will first attempt to
>> encode
>>

Re: [x265] [PATCH] Fixed some of the wrongly represented cli parameters in the docs related to --colormatrix and --videoformat.

2020-05-04 Thread Praveen Kumar Karadugattu

>From 951411943ed54043c2111f4a09419cbc77e5f0fd Mon Sep 17 00:00:00 2001
From: Praveen Karadugattu 
Date: Mon, 4 May 2020 16:50:14 +0530
Subject: [PATCH] Fixed some of the wrongly represented cli parameters in the
 docs related to --colormatrix and --videoformat.
---
 doc/reST/cli.rst| 112

 source/CMakeLists.txt   |   2 +-
 source/common/param.cpp |  12 +++---
 source/x265cli.cpp  |  12 +++---
 4 files changed, 69 insertions(+), 69 deletions(-)
diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
index 1e2765d..524714a 100644
--- a/doc/reST/cli.rst
+++ b/doc/reST/cli.rst
@@ -141,7 +141,7 @@ Logging/Statistic Options
  **Residual Energy** Average residual energy. SSE is calculated on fenc
  and pred(before quantization).

- **Luma/Chroma Values** minumum, maximum and average(averaged by area)
+ **Luma/Chroma Values** minimum, maximum and average(averaged by area)
  luma and chroma values of source for each frame.

  **PU Statistics** percentage of PU modes at each depth.
@@ -246,7 +246,7 @@ Performance Options

 .. option:: --pools , --numa-pools 

- Comma seperated list of threads per NUMA node. If "none", then no worker
+ Comma separated list of threads per NUMA node. If "none", then no worker
  pools are created and only frame parallelism is possible. If NULL or ""
  (default) x265 will use all available threads on each NUMA node::

@@ -284,7 +284,7 @@ Performance Options
  the last thread pool is spawned only if it has more than 32 threads for
  64-bit machines, or 16 for 32-bit machines. If the total number of threads
  in the system doesn't obey this constraint, we may spawn fewer threads
- than cores which has been emperically shown to be better for performance.
+ than cores which has been empirically shown to be better for performance.

  If the four pool features: :option:`--wpp`, :option:`--pmode`,
  :option:`--pme` and :option:`--lookahead-slices` are all disabled,
@@ -409,7 +409,7 @@ Performance Options

  Allow encoder to copy input x265 pictures to internal frame buffers. When
disabled,
  x265 will not make an internal copy of the input picture and will work
with the
- application's buffers. While this allows for deeper integration, it is
the responsbility
+ application's buffers. While this allows for deeper integration, it is
the responsibility
  of the application to (a) ensure that the allocated picture has extra
space for padding
  that will be done by the library, and (b) the buffers aren't recycled
until the library
  has completed encoding this frame (which can be figured out by tracking
NALs output by x265)
@@ -554,7 +554,7 @@ frame counts) are only applicable to the CLI
application.

 .. option:: --chunk-start 

- First frame of the chunk. Frames preceeding this in display order will
+ First frame of the chunk. Frames preceding this in display order will
  be encoded, however, they will be discarded in the bitstream. This
  feature can be enabled only in closed GOP structures.
  Default 0 (disabled).
@@ -562,7 +562,7 @@ frame counts) are only applicable to the CLI
application.
 .. option:: --chunk-end 

  Last frame of the chunk. Frames following this in display order will be
- used in taking lookahead decisions, but, they will not be encoded.
+ used in taking lookahead decisions, but they will not be encoded.
  This feature can be enabled only in closed GOP structures.
  Default 0 (disabled).

@@ -638,7 +638,7 @@ Profile, Level, Tier
  If :option:`--level-idc` has been specified, --high-tier allows the
  support of high tier at that level. The encoder will first attempt to
encode
  at the specified level, main tier first, turning on high tier only if
- necessary and available at that level.If your requested level does not
+ necessary and available at that level. If your requested level does not
  support a High tier, high tier will not be supported. If --no-high-tier
  has been specified, then the encoder will attempt to encode only at the
main tier.

@@ -647,8 +647,8 @@ Profile, Level, Tier
 .. option:: --ref <1..16>

  Max number of L0 references to be allowed. This number has a linear
- multiplier effect on the amount of work performed in motion search,
- but will generally have a beneficial affect on compression and
+ multiplier effect on the amount of work performed in motion search
+ but will generally have a beneficial effect on compression and
  distortion.

  Note that x265 allows up to 16 L0 references but the HEVC
@@ -668,7 +668,7 @@ Profile, Level, Tier
 .. option:: --allow-non-conformance, --no-allow-non-conformance

  Allow libx265 to generate a bitstream with profile and level NONE.
- By default it will abort any encode which does not meet strict level
+ By default, it will abort any encode which does not meet strict level
  compliance. The two most likely causes for non-conformance are
  :option:`--ctu` being too small, :option:`--ref` being too high,
  or the bitrate o

Re: [x265] [PATCH] Fixed some of the wrongly represented cli parameters in the docs related to --colormatrix and --videoformat.

2020-04-30 Thread Praveen Kumar Karadugattu

Hi Aruna,

Thanks for the review. Please find my responses inline below. I will
resend the patch incorporating the suggested changes.

Regards,
Praveen
On Thu, Apr 23, 2020 at 4:11 PM Aruna Matheswaran <
ar...@multicorewareinc.com> wrote:

>
>
> On Mon, Apr 20, 2020 at 10:38 AM Praveen Kumar Karadugattu <
> praveenku...@multicorewareinc.com> wrote:
>
>> From 9207e6db602ea218aca7d03075339009429280ef Mon Sep 17 00:00:00 2001
>> From: Praveen Karadugattu 
>> Date: Fri, 17 Apr 2020 19:59:03 +0530
>> Subject: [PATCH] Fixed some of the wrongly represented cli parameters in
>> the
>>  docs related to --colormatrix and --videoformat.
>> ---
>>  doc/reST/cli.rst|  8 
>>  source/common/param.cpp | 12 ++--
>>  source/x265cli.cpp  |  6 +++---
>>  3 files changed, 13 insertions(+), 13 deletions(-)
>> diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
>> index 1e2765d..0fad45b 100644
>> --- a/doc/reST/cli.rst
>> +++ b/doc/reST/cli.rst
>> @@ -2154,7 +2154,7 @@ VUI fields must be manually specified.
>>   2. ntsc
>>   3. secam
>>   4. mac
>> - 5. undefined
>> + 5. unknown
>>
>>  .. option:: --range 
>>
>> @@ -2207,15 +2207,15 @@ VUI fields must be manually specified.
>>   Specify color matrix setting i.e set the matrix coefficients used in
>>   deriving the luma and chroma. Default undefined (not signaled)
>>
>> - 0. GBR
>> + 0. gbr
>>   1. bt709
>> - 2. undef
>> + 2. unknown
>>   3. **reserved**
>>   4. fcc
>>   5. bt470bg
>>   6. smpte170m
>>   7. smpte240m
>> - 8. YCgCo
>> + 8. ycgco
>>   9. bt2020nc
>>   10. bt2020c
>>   11. smpte2085
>> diff --git a/source/common/param.cpp b/source/common/param.cpp
>> index 908400f..b4965ce 100644
>> --- a/source/common/param.cpp
>> +++ b/source/common/param.cpp
>> @@ -1122,7 +1122,7 @@ int x265_param_parse(x265_param* p, const char*
>> name, const char* value)
>>  p->vui.bEnableOverscanInfoPresentFlag = 1;
>>  p->vui.bEnableOverscanAppropriateFlag = 1;
>>  }
>> -else if (!strcmp(value, "undef"))
>> +else if (!strcmp(value, "unknown"))
>>
> [AM] You are changing the API here. Please update X265_BUILD.
> btw, there is no discrepancy between CLI values and the document of the
> "overscan" option.
> Did you modify this to have uniform values across VUI options? If so, can
> we introduce "unknown" as an alias for "undef" and deprecate "undef" in the
> next version?
>
[PK] I will increment X265_BUILD in CMakeLists.txt. "overscan" has an
"undef" option; I have changed it to "unknown" to maintain uniformity. Yes, we
need to deprecate "undef" and use "unknown" everywhere instead.

>  p->vui.bEnableOverscanInfoPresentFlag = 0;
>>  else
>>  bError = true;
>> @@ -1643,23 +1643,23 @@ int x265_check_params(x265_param* param)
>>"Sample Aspect Ratio height must be greater than 0");
>>  CHECK(param->vui.videoFormat < 0 || param->vui.videoFormat > 5,
>>"Video Format must be component,"
>> -  " pal, ntsc, secam, mac or undef");
>> +  " pal, ntsc, secam, mac or unknown");
>>  CHECK(param->vui.colorPrimaries < 0
>>|| param->vui.colorPrimaries > 12
>>|| param->vui.colorPrimaries == 3,
>> -  "Color Primaries must be undef, bt709, bt470m,"
>> +  "Color Primaries must be unknown, bt709, bt470m,"
>>" bt470bg, smpte170m, smpte240m, film, bt2020, smpte-st-428,
>> smpte-rp-431 or smpte-eg-432");
>>  CHECK(param->vui.transferCharacteristics < 0
>>|| param->vui.transferCharacteristics > 18
>>|| param->vui.transferCharacteristics == 3,
>> -  "Transfer Characteristics must be undef, bt709, bt470m,
>> bt470bg,"
>> +  "Transfer Characteristics must be unknown, bt709, bt470m,
>> bt470bg,"
>>" smpte170m, smpte240m, linear, log100, log316, iec61966-2-4,
>> bt1361e,"
>>" iec61966-2-1, bt2020-10, bt2020-12, smpte-st-2084,
>> smpte-st-428 or arib-std-b67");
>>  CHECK(param->vui.matrixCoeffs < 0
>>|| param->vui.matrixCoeffs > 14
>>|| param->vui.matrixCoeffs == 3,
>> -  "Matrix Coeffici

[x265] [PATCH] Fixed some of the wrongly represented cli parameters in the docs related to --colormatrix and --videoformat.

2020-04-19 Thread Praveen Kumar Karadugattu

>From 9207e6db602ea218aca7d03075339009429280ef Mon Sep 17 00:00:00 2001
From: Praveen Karadugattu 
Date: Fri, 17 Apr 2020 19:59:03 +0530
Subject: [PATCH] Fixed some of the wrongly represented cli parameters in the
 docs related to --colormatrix and --videoformat.
---
 doc/reST/cli.rst|  8 
 source/common/param.cpp | 12 ++--
 source/x265cli.cpp  |  6 +++---
 3 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst
index 1e2765d..0fad45b 100644
--- a/doc/reST/cli.rst
+++ b/doc/reST/cli.rst
@@ -2154,7 +2154,7 @@ VUI fields must be manually specified.
  2. ntsc
  3. secam
  4. mac
- 5. undefined
+ 5. unknown

 .. option:: --range 

@@ -2207,15 +2207,15 @@ VUI fields must be manually specified.
  Specify color matrix setting i.e set the matrix coefficients used in
  deriving the luma and chroma. Default undefined (not signaled)

- 0. GBR
+ 0. gbr
  1. bt709
- 2. undef
+ 2. unknown
  3. **reserved**
  4. fcc
  5. bt470bg
  6. smpte170m
  7. smpte240m
- 8. YCgCo
+ 8. ycgco
  9. bt2020nc
  10. bt2020c
  11. smpte2085
diff --git a/source/common/param.cpp b/source/common/param.cpp
index 908400f..b4965ce 100644
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -1122,7 +1122,7 @@ int x265_param_parse(x265_param* p, const char* name,
const char* value)
 p->vui.bEnableOverscanInfoPresentFlag = 1;
 p->vui.bEnableOverscanAppropriateFlag = 1;
 }
-else if (!strcmp(value, "undef"))
+else if (!strcmp(value, "unknown"))
 p->vui.bEnableOverscanInfoPresentFlag = 0;
 else
 bError = true;
@@ -1643,23 +1643,23 @@ int x265_check_params(x265_param* param)
   "Sample Aspect Ratio height must be greater than 0");
 CHECK(param->vui.videoFormat < 0 || param->vui.videoFormat > 5,
   "Video Format must be component,"
-  " pal, ntsc, secam, mac or undef");
+  " pal, ntsc, secam, mac or unknown");
 CHECK(param->vui.colorPrimaries < 0
   || param->vui.colorPrimaries > 12
   || param->vui.colorPrimaries == 3,
-  "Color Primaries must be undef, bt709, bt470m,"
+  "Color Primaries must be unknown, bt709, bt470m,"
   " bt470bg, smpte170m, smpte240m, film, bt2020, smpte-st-428,
smpte-rp-431 or smpte-eg-432");
 CHECK(param->vui.transferCharacteristics < 0
   || param->vui.transferCharacteristics > 18
   || param->vui.transferCharacteristics == 3,
-  "Transfer Characteristics must be undef, bt709, bt470m, bt470bg,"
+  "Transfer Characteristics must be unknown, bt709, bt470m,
bt470bg,"
   " smpte170m, smpte240m, linear, log100, log316, iec61966-2-4,
bt1361e,"
   " iec61966-2-1, bt2020-10, bt2020-12, smpte-st-2084,
smpte-st-428 or arib-std-b67");
 CHECK(param->vui.matrixCoeffs < 0
   || param->vui.matrixCoeffs > 14
   || param->vui.matrixCoeffs == 3,
-  "Matrix Coefficients must be undef, bt709, fcc, bt470bg,
smpte170m,"
-  " smpte240m, GBR, YCgCo, bt2020nc, bt2020c, smpte-st-2085,
chroma-nc, chroma-c or ictcp");
+  "Matrix Coefficients must be unknown, bt709, fcc, bt470bg,
smpte170m,"
+  " smpte240m, gbr, ycgco, bt2020nc, bt2020c, smpte-st-2085,
chroma-nc, chroma-c or ictcp");
 CHECK(param->vui.chromaSampleLocTypeTopField < 0
   || param->vui.chromaSampleLocTypeTopField > 5,
   "Chroma Sample Location Type Top Field must be 0-5");
diff --git a/source/x265cli.cpp b/source/x265cli.cpp
index 05f16b7..4d91b99 100644
--- a/source/x265cli.cpp
+++ b/source/x265cli.cpp
@@ -291,8 +291,8 @@ namespace X265_NS {
 H0(" 5=40:33, 6=24:11, 7=20:11,
8=32:11, 9=80:33, 10=18:11, 11=15:11,\n");
 H0(" 12=64:33, 13=160:99, 14=4:3,
15=3:2, 16=2:1 or custom ratio of . Default %d\n",
param->vui.aspectRatioIdc);
 H1("   --display-window  Describe overscan cropping
region as 'left,top,right,bottom' in pixels\n");
-H1("   --overscanSpecify whether it is
appropriate for decoder to show cropped region: undef, show or crop.
Default undef\n");
-H0("   --videoformat Specify video format from
undef, component, pal, ntsc, secam, mac. Default undef\n");
+H1("   --overscanSpecify whether it is
appropriate for decoder to show cropped region: unknown, show or crop.
Default undef\n");
+H0("   --videoformat Specify video format from
unknown, component, pal, ntsc, secam, mac. Default undef\n");
 H0("   --range

Re: [x265] [PATCH 1 of 1] Feature: Histogram Based Scene Cut Detection

2019-11-25 Thread Praveen Kumar Karadugattu

This patch has been reviewed and looks good to me.

Regards,
Praveen

On Mon, Nov 25, 2019 at 6:53 PM Srikanth Kurapati <
srikanth.kurap...@multicorewareinc.com> wrote:

> # HG changeset patch
> # User Srikanth Kurapati 
> # Date 1573649311 -19800
> #  Wed Nov 13 18:18:31 2019 +0530
> # Node ID 97a9eca413d83cd03ae0fa95957160bdf70c170b
> # Parent  04db2bfee5d628d931d1407355b909ac8ff1c898
> Histogram Based Scene Cut Detection.
>
> This patch does the following.
> 1.Finds scene cuts by thresholding normalized SAD of edge and chroma
> histograms.
> 2.Add option "--hist-scenecut" to enable histogram based scene cut
> detection.
> 3.Add option "--hist-threshold" to provide threshold for determining scene
> cuts.
> 3.Optimizes frame duplication by reusing normalized SAD to mark duplicate
> frames.
>
> diff -r 04db2bfee5d6 -r 97a9eca413d8 doc/reST/cli.rst
> --- a/doc/reST/cli.rst Thu Oct 31 16:23:27 2019 +0530
> +++ b/doc/reST/cli.rst Wed Nov 13 18:18:31 2019 +0530
> @@ -1426,7 +1426,20 @@
>   This value represents the percentage difference between the inter cost
> and
>   intra cost of a frame used in scenecut detection. For example, a value
> of 5 indicates,
>   if the inter cost of a frame is greater than or equal to 95 percent of
> the intra cost of the frame,
> - then detect this frame as scenecut. Values between 5 and 15 are
> recommended. Default 5.
> + then detect this frame as scenecut. Values between 5 and 15 are
> recommended. Default 5.
> +
> +.. option:: --hist-scenecut, --no-hist-scenecut
> +
> + Indicates that scenecuts need to be detected using luma edge and chroma
> histograms.
> + option: `--hist-scenecut` enables scenecut detection using the
> histograms and disables the default scene cut algorithm.
> + option: `--no-hist-scenecut` disables histogram based scenecut algorithm.
> +
> +.. option:: --hist-threshold <0.0..2.0>
> +
> + This value represents the threshold for normalized SAD of edge
> histograms used in scenecut detection.
> + This requires option: `--hist-scenecut` to be enabled. For example, a
> value of 0.2 indicates that a frame with normalized SAD value
> + greater than 0.2 against the previous frame as scenecut.
> + Default 0.01.
>
>  .. option:: --radl 
>
> diff -r 04db2bfee5d6 -r 97a9eca413d8 source/CMakeLists.txt
> --- a/source/CMakeLists.txt Thu Oct 31 16:23:27 2019 +0530
> +++ b/source/CMakeLists.txt Wed Nov 13 18:18:31 2019 +0530
> @@ -29,7 +29,7 @@
>  option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
>  mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
>  # X265_BUILD must be incremented each time the public API is changed
> -set(X265_BUILD 182)
> +set(X265_BUILD 183)
>  configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
> "${PROJECT_BINARY_DIR}/x265.def")
>  configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
> diff -r 04db2bfee5d6 -r 97a9eca413d8 source/common/common.h
> --- a/source/common/common.h Thu Oct 31 16:23:27 2019 +0530
> +++ b/source/common/common.h Wed Nov 13 18:18:31 2019 +0530
> @@ -129,12 +129,16 @@
>  typedef uint64_t sum2_t;
>  typedef uint64_t pixel4;
>  typedef int64_t  ssum2_t;
> +#define HISTOGRAM_BINS 1024
> +#define SHIFT 1
>  #else
>  typedef uint8_t  pixel;
>  typedef uint16_t sum_t;
>  typedef uint32_t sum2_t;
>  typedef uint32_t pixel4;
>  typedef int32_t  ssum2_t; // Signed sum
> +#define HISTOGRAM_BINS 256
> +#define SHIFT 0
>  #endif // if HIGH_BIT_DEPTH
>
>  #if X265_DEPTH < 10
> diff -r 04db2bfee5d6 -r 97a9eca413d8 source/common/param.cpp
> --- a/source/common/param.cpp Thu Oct 31 16:23:27 2019 +0530
> +++ b/source/common/param.cpp Wed Nov 13 18:18:31 2019 +0530
> @@ -167,6 +167,8 @@
>  param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
>  param->bBPyramid = 1;
>  param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
> +param->edgeTransitionThreshold = 0.01;
> +param->bHistBasedSceneCut = 0;
>  param->lookaheadSlices = 8;
>  param->lookaheadThreads = 0;
>  param->scenecutBias = 5.0;
> @@ -572,6 +574,7 @@
>  param->bframes = 0;
>  param->lookaheadDepth = 0;
>  param->scenecutThreshold = 0;
> +param->bHistBasedSceneCut = 0;
>  param->rc.cuTree = 0;
>  param->frameNumThreads = 1;
>  }
> @@ -920,12 +923,13 @@
>  OPT("lookahead-slices") p->lookaheadSlices = atoi(value);
>  OPT("scenecut")
>  {
> -p->scenecutThreshold = atobool(value);
> -if (bError || p->scenecutThr

[x265] [x265 PATCH] Decoupled the non-"medium" presets from the effect of new default parameters committed under 4583000db964

2019-06-14 Thread Praveen Kumar Karadugattu

# HG changeset patch
# User praveen_karadugattu 
# Date 1560507248 -19800
#  Fri Jun 14 15:44:08 2019 +0530
# Node ID 6766973bc652a7a2a550f539f03248ee54e3b312
# Parent  a46ded2c14116af1cafacdc1fb98be43259dc7d6
Decoupled the non-"medium" presets from the effect of new default
parameters committed under commit-id 4583000db964.
diff -r a46ded2c1411 -r 6766973bc652 source/common/param.cpp
--- a/source/common/param.cpp Tue May 28 14:01:54 2019 +0800
+++ b/source/common/param.cpp Fri Jun 14 15:44:08 2019 +0530
@@ -361,6 +361,8 @@

 if (!strcmp(preset, "ultrafast"))
 {
+param->maxNumMergeCand = 2;
+param->bIntraInBFrames = 0;
 param->lookaheadDepth = 5;
 param->scenecutThreshold = 0; // disable lookahead
 param->maxCUSize = 32;
@@ -369,7 +371,6 @@
 param->bFrameAdaptive = 0;
 param->subpelRefine = 0;
 param->searchMethod = X265_DIA_SEARCH;
-param->bEnableEarlySkip = 1;
 param->bEnableSAO = 0;
 param->bEnableSignHiding = 0;
 param->bEnableWeightedPred = 0;
@@ -384,12 +385,13 @@
 }
 else if (!strcmp(preset, "superfast"))
 {
+param->maxNumMergeCand = 2;
+param->bIntraInBFrames = 0;
 param->lookaheadDepth = 10;
 param->maxCUSize = 32;
 param->bframes = 3;
 param->bFrameAdaptive = 0;
 param->subpelRefine = 1;
-param->bEnableEarlySkip = 1;
 param->bEnableWeightedPred = 0;
 param->rdLevel = 2;
 param->maxNumReferences = 1;
@@ -403,10 +405,12 @@
 }
 else if (!strcmp(preset, "veryfast"))
 {
+param->maxNumMergeCand = 2;
+param->limitReferences = 3;
+param->bIntraInBFrames = 0;
 param->lookaheadDepth = 15;
 param->bFrameAdaptive = 0;
 param->subpelRefine = 1;
-param->bEnableEarlySkip = 1;
 param->rdLevel = 2;
 param->maxNumReferences = 2;
 param->rc.qgSize = 32;
@@ -414,15 +418,21 @@
 }
 else if (!strcmp(preset, "faster"))
 {
+param->maxNumMergeCand = 2;
+param->limitReferences = 3;
+param->bIntraInBFrames = 0;
 param->lookaheadDepth = 15;
 param->bFrameAdaptive = 0;
-param->bEnableEarlySkip = 1;
 param->rdLevel = 2;
 param->maxNumReferences = 2;
 param->bEnableFastIntra = 1;
 }
 else if (!strcmp(preset, "fast"))
 {
+param->maxNumMergeCand = 2;
+param->limitReferences = 3;
+param->bEnableEarlySkip = 0;
+param->bIntraInBFrames = 0;
 param->lookaheadDepth = 15;
 param->bFrameAdaptive = 0;
 param->rdLevel = 2;
@@ -435,13 +445,15 @@
 }
 else if (!strcmp(preset, "slow"))
 {
+param->limitReferences = 3;
+param->bEnableEarlySkip = 0;
+param->bIntraInBFrames = 0;
 param->bEnableRectInter = 1;
 param->lookaheadDepth = 25;
 param->rdLevel = 4;
 param->rdoqLevel = 2;
 param->psyRdoq = 1.0;
 param->subpelRefine = 3;
-param->maxNumMergeCand = 3;
 param->searchMethod = X265_STAR_SEARCH;
 param->maxNumReferences = 4;
 param->limitModes = 1;
@@ -449,6 +461,7 @@
 }
 else if (!strcmp(preset, "slower"))
 {
+param->bEnableEarlySkip = 0;
 param->bEnableWeightedBiPred = 1;
 param->bEnableAMP = 1;
 param->bEnableRectInter = 1;
@@ -463,14 +476,13 @@
 param->maxNumMergeCand = 4;
 param->searchMethod = X265_STAR_SEARCH;
 param->maxNumReferences = 5;
-param->limitReferences = 1;
 param->limitModes = 1;
-param->bIntraInBFrames = 1;
 param->lookaheadSlices = 0; // disabled for best quality
 param->limitTU = 4;
 }
 else if (!strcmp(preset, "veryslow"))
 {
+param->bEnableEarlySkip = 0;
 param->bEnableWeightedBiPred = 1;
 param->bEnableAMP = 1;
 param->bEnableRectInter = 1;
@@ -487,12 +499,12 @@
 param->maxNumReferences = 5;
 param->limitReferences = 0;
 param->limitModes = 0;
-param->bIntraInBFrames = 1;
 param->lookaheadSlices = 0; // disabled for best quality
 param->limitTU = 0;
 }
 else if (!strcmp(preset, "placebo"))
 {
+param->bEnableEarlySkip = 0;
 param->bEnableWeightedBiPred = 1;
 param->bEnableAMP = 1;
 param->bEnableRectInter = 1;
@@ -511,7 +523,6 @@
 param->bEnableRecursionSkip = 0;

[x265] [x265 PATCH] Changed the params max-merge to 3, b-intra enabled, limit-ref and early-skip enabled for improved performance for high res

2019-05-27 Thread Praveen Kumar Karadugattu

# HG changeset patch
# User praveen_karadugattu 
# Date 1558937905 -19800
#  Mon May 27 11:48:25 2019 +0530
# Node ID 4583000db964b8b942c55f532216a3696fcf69ea
# Parent  b9bef1a4c34a82ea685ed76ebdd642c266bffcc3
Changed the params max-merge to 3, b-intra enabled, limit-ref and
early-skip enabled for improved performance for high res
diff --git a/source/common/param.cpp b/source/common/param.cpp
--- a/source/common/param.cpp
+++ b/source/common/param.cpp
@@ -185,12 +185,12 @@
 param->searchMethod = X265_HEX_SEARCH;
 param->subpelRefine = 2;
 param->searchRange = 57;
-param->maxNumMergeCand = 2;
-param->limitReferences = 3;
+ param->maxNumMergeCand = 3;
+ param->limitReferences = 1;
 param->limitModes = 0;
 param->bEnableWeightedPred = 1;
 param->bEnableWeightedBiPred = 0;
-param->bEnableEarlySkip = 0;
+ param->bEnableEarlySkip = 1;
 param->bEnableRecursionSkip = 1;
 param->bEnableAMP = 0;
 param->bEnableRectInter = 0;
@@ -225,7 +225,7 @@
 param->analysisReuseFileName = NULL;
 param->analysisSave = NULL;
 param->analysisLoad = NULL;
-param->bIntraInBFrames = 0;
+ param->bIntraInBFrames = 1;
 param->bLossless = 0;
 param->bCULossless = 0;
 param->bEnableTemporalSubLayers = 0;


ModifiedDefaultParams.diff
Description: Binary data
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] ratecontrol.cpp: nits - fix for coding style

2018-11-06 Thread praveen

# HG changeset patch
# User Praveen Tiwari 
# Date 1541569020 -19800
#  Wed Nov 07 11:07:00 2018 +0530
# Branch stable
# Node ID 5177401a9d4c8b577c4502538037e1cd0d2fae68
# Parent  26b4debfab1af7d5e080902b700d6124fafa8ebd
ratecontrol.cpp: nits - fix for coding style

diff -r 26b4debfab1a -r 5177401a9d4c source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cppThu Nov 01 18:47:40 2018 +0530
+++ b/source/encoder/ratecontrol.cppWed Nov 07 11:07:00 2018 +0530
@@ -381,9 +381,9 @@
 
 m_isGrainEnabled = false;
 if(m_param->rc.bEnableGrain) // tune for grainy content OR equal p-b frame 
sizes
-m_isGrainEnabled = true;
+m_isGrainEnabled = true;
 for (int i = 0; i < 3; i++)
-m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == 
X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN);
+m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == 
X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN);
 m_avgPFrameQp = 0 ;
 
 /* 720p videos seem to be a good cutoff for cplxrSum */
@@ -1253,9 +1253,7 @@
 m_isSceneTransition = false;
 
 if (rce->encodeOrder < m_lastPredictorReset + m_param->frameNumThreads)
-{
 rce->rowPreds[0][0].count = 0;
-}
 
 rce->bLastMiniGopBFrame = curFrame->m_lowres.bLastMiniGopBFrame;
 rce->bufferRate = m_bufferRate;
@@ -1458,12 +1456,8 @@
 if (!rce->keptAsRef)
 q *= fabs(m_param->rc.pbFactor);
 }
-else if (rce->sliceType == P_SLICE
- && m_lastNonBPictType == P_SLICE
- && rce->coeffBits == 0)
-{
+else if (rce->sliceType == P_SLICE && m_lastNonBPictType == P_SLICE && 
rce->coeffBits == 0)
 q = lastPqScale;
-}
 
 /* last qscale / qdiff stuff */
 if (m_lastNonBPictType == rce->sliceType &&
@@ -1664,9 +1658,7 @@
 m_movingAvgSum += m_satdCostWindow[addPos];
 }
 else if (m_sliderPos == s_slidingWindowFrames)
-{
 m_movingAvgSum += m_satdCostWindow[addPos];
-}
 else if (m_sliderPos > 0)
 {
 m_movingAvgSum += m_satdCostWindow[addPos];
@@ -1964,9 +1956,7 @@
 }
 }
 else if (m_qCompress != 1 && m_param->rc.rateControlMode == 
X265_RC_CRF)
-{
 q = x265_qp2qScale(CRF_INIT_QP) / fabs(m_param->rc.ipFactor);
-}
 else if (m_framesDone == 0 && !m_isVbv && 
m_param->rc.rateControlMode == X265_RC_ABR)
 {
 /* for ABR alone, clip the first I frame qp */
# HG changeset patch
# User Praveen Tiwari 
# Date 1541569020 -19800
#  Wed Nov 07 11:07:00 2018 +0530
# Branch stable
# Node ID 5177401a9d4c8b577c4502538037e1cd0d2fae68
# Parent  26b4debfab1af7d5e080902b700d6124fafa8ebd
ratecontrol.cpp: nits - fix for coding style

diff -r 26b4debfab1a -r 5177401a9d4c source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cpp	Thu Nov 01 18:47:40 2018 +0530
+++ b/source/encoder/ratecontrol.cpp	Wed Nov 07 11:07:00 2018 +0530
@@ -381,9 +381,9 @@
 
 m_isGrainEnabled = false;
 if(m_param->rc.bEnableGrain) // tune for grainy content OR equal p-b frame sizes
-m_isGrainEnabled = true;
+m_isGrainEnabled = true;
 for (int i = 0; i < 3; i++)
-m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN);
+m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN);
 m_avgPFrameQp = 0 ;
 
 /* 720p videos seem to be a good cutoff for cplxrSum */
@@ -1253,9 +1253,7 @@
 m_isSceneTransition = false;
 
 if (rce->encodeOrder < m_lastPredictorReset + m_param->frameNumThreads)
-{
 rce->rowPreds[0][0].count = 0;
-}
 
 rce->bLastMiniGopBFrame = curFrame->m_lowres.bLastMiniGopBFrame;
 rce->bufferRate = m_bufferRate;
@@ -1458,12 +1456,8 @@
 if (!rce->keptAsRef)
 q *= fabs(m_param->rc.pbFactor);
 }
-else if (rce->sliceType == P_SLICE
- && m_lastNonBPictType == P_SLICE
- && rce->coeffBits == 0)
-{
+else if (rce->sliceType == P_SLICE && m_lastNonBPictType == P_SLICE && rce->coeffBits == 0)
 q = lastPqScale;
-}
 
 /* last qscale / qdiff stuff */
 if (m_lastNonBPictType == rce->sliceType &&
@@ -1664,9 +1658,7 @@
 m_movingAvgSum += m_satdCostWindow[addPos];
 }
 else if (m_sliderPos == s_slidingWindowFrames)
-{
 m_movingAvgSum += m_satdCostWindow[addPos];
-}
 else if (m_sliderPos > 0)
 {
 m_movingAvgSum += m_satdCostWindow[addPos];
@@ -19

Re: [x265] [PATCH] encoder: Do not include CLL SEI message if empty

2018-11-06 Thread Praveen Tiwari

Hello Vittorio,

Sorry for the late reply; all of us were on leave due to the Diwali
festival in India.

Thanks for the patch; we will run some basic tests and push it.

Regards,
Praveen


On Wed, Nov 7, 2018 at 12:35 AM Vittorio Giovara 
wrote:

>
>
> On Thu, Nov 1, 2018 at 5:34 PM Vittorio Giovara <
> vittorio.giov...@gmail.com> wrote:
>
>> Some devices render out-of-luminance pixels incorrectly otherwise.
>>
>> ---
>>  source/encoder/encoder.cpp | 11 +++
>>  1 file changed, 7 insertions(+), 4 deletions(-)
>>
>> diff -r fd517ae68f93 source/encoder/encoder.cpp
>> --- a/source/encoder/encoder.cppTue Sep 25 16:02:31 2018 +0530
>> +++ b/source/encoder/encoder.cppThu Nov 01 17:27:51 2018 -0400
>> @@ -2381,10 +2381,13 @@
>>
>>  if (m_param->bEmitHDRSEI)
>>  {
>> -SEIContentLightLevel cllsei;
>> -cllsei.max_content_light_level = m_param->maxCLL;
>> -cllsei.max_pic_average_light_level = m_param->maxFALL;
>> -cllsei.writeSEImessages(bs, m_sps, NAL_UNIT_PREFIX_SEI, list,
>> m_param->bSingleSeiNal);
>> +if (m_emitCLLSEI)
>> +{
>> +SEIContentLightLevel cllsei;
>> +cllsei.max_content_light_level = m_param->maxCLL;
>> +cllsei.max_pic_average_light_level = m_param->maxFALL;
>> +cllsei.writeSEImessages(bs, m_sps, NAL_UNIT_PREFIX_SEI,
>> list, m_param->bSingleSeiNal);
>> +}
>>
>>  if (m_param->masteringDisplayColorVolume)
>>  {
>> --
>> Vittorio
>>
>
> ping
> --
> Vittorio
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] REPORT_SPEEDUP: correct the description

2018-11-01 Thread praveen

# HG changeset patch
# User Praveen Tiwari 
# Date 1541078260 -19800
#  Thu Nov 01 18:47:40 2018 +0530
# Branch stable
# Node ID 26b4debfab1af7d5e080902b700d6124fafa8ebd
# Parent  471726d3a0462739ff8e3518eb1a1e8a01de4e8d
REPORT_SPEEDUP: correct the description

diff -r 471726d3a046 -r 26b4debfab1a source/test/testharness.h
--- a/source/test/testharness.h Wed Oct 31 16:35:48 2018 +0530
+++ b/source/test/testharness.h Thu Nov 01 18:47:40 2018 +0530
@@ -93,9 +93,9 @@
 
 #define BENCH_RUNS 2000
 
-// Adapted from checkasm.c, runs each optimized primitive four times, measures 
rdtsc
-// and discards invalid times.  Repeats 1000 times to get a good average.  
Then measures
-// the C reference with fewer runs and reports X factor and average cycles.
+/* Adapted from checkasm.c, runs each optimized primitive four times, measures 
rdtsc
+ * and discards invalid times. Repeats BENCH_RUNS times to get a good average.
+ * Then measures the C reference with BENCH_RUNS / 4 runs and reports X factor 
and average cycles.*/
 #define REPORT_SPEEDUP(RUNOPT, RUNREF, ...) \
 { \
 uint32_t cycles = 0; int runs = 0; \
# HG changeset patch
# User Praveen Tiwari 
# Date 1541078260 -19800
#  Thu Nov 01 18:47:40 2018 +0530
# Branch stable
# Node ID 26b4debfab1af7d5e080902b700d6124fafa8ebd
# Parent  471726d3a0462739ff8e3518eb1a1e8a01de4e8d
REPORT_SPEEDUP: correct the description

diff -r 471726d3a046 -r 26b4debfab1a source/test/testharness.h
--- a/source/test/testharness.h	Wed Oct 31 16:35:48 2018 +0530
+++ b/source/test/testharness.h	Thu Nov 01 18:47:40 2018 +0530
@@ -93,9 +93,9 @@
 
 #define BENCH_RUNS 2000
 
-// Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc
-// and discards invalid times.  Repeats 1000 times to get a good average.  Then measures
-// the C reference with fewer runs and reports X factor and average cycles.
+/* Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc
+ * and discards invalid times. Repeats BENCH_RUNS times to get a good average.
+ * Then measures the C reference with BENCH_RUNS / 4 runs and reports X factor and average cycles.*/
 #define REPORT_SPEEDUP(RUNOPT, RUNREF, ...) \
 { \
 uint32_t cycles = 0; int runs = 0; \
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] fix Issue #442: linking issue on non x86 platform

2018-10-31 Thread praveen

# HG changeset patch
# User Praveen Tiwari 
# Date 1540983948 -19800
#  Wed Oct 31 16:35:48 2018 +0530
# Node ID f0d02ca443adf8ff90ed61552d35347ff51c8e90
# Parent  fd517ae68f93dbfdd1bff45a9dd8e626523542b6
fix Issue #442: linking issue on non x86 platform

diff -r fd517ae68f93 -r f0d02ca443ad source/common/cpu.cpp
--- a/source/common/cpu.cpp Tue Sep 25 16:02:31 2018 +0530
+++ b/source/common/cpu.cpp Wed Oct 31 16:35:48 2018 +0530
@@ -127,6 +127,7 @@
 {
 return(enable512);
 }
+
 uint32_t cpu_detect(bool benableavx512 )
 {
 
diff -r fd517ae68f93 -r f0d02ca443ad source/common/quant.cpp
--- a/source/common/quant.cpp   Tue Sep 25 16:02:31 2018 +0530
+++ b/source/common/quant.cpp   Wed Oct 31 16:35:48 2018 +0530
@@ -723,6 +723,7 @@
 X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
 uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
 uint32_t blkPos  = codeParams.scan[scanPosBase];
+#if X265_ARCH_X86
 bool enable512 = detect512();
 if (enable512)
 primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, 
m_fencDctCoeff, costUncoded, , , , 
blkPos);
@@ -731,6 +732,10 @@
 primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff,  
costUncoded, , ,blkPos);
 primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, 
m_fencDctCoeff, costUncoded, , , , 
blkPos);
 }
+#else
+primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, 
costUncoded, , , blkPos);
+primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, 
m_fencDctCoeff, costUncoded, , , , 
blkPos);
+#endif
 }
 }
 else
@@ -805,8 +810,8 @@
 uint32_t blkPos = codeParams.scan[scanPosBase];
 if (usePsyMask)
 {
+#if X265_ARCH_X86
 bool enable512 = detect512();
-
 if (enable512)
 primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, 
m_fencDctCoeff, costUncoded, , , , 
blkPos);
 else
@@ -814,6 +819,10 @@
 primitives.cu[log2TrSize - 
2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , 
blkPos);
 primitives.cu[log2TrSize - 
2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, 
, , , blkPos);
 }
+#else
+primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, 
costUncoded, , , blkPos);
+primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, 
m_fencDctCoeff, costUncoded, , , , 
blkPos);
+#endif
 blkPos = codeParams.scan[scanPosBase];
 for (int y = 0; y < MLS_CG_SIZE; y++)
 {
# HG changeset patch
# User Praveen Tiwari 
# Date 1540983948 -19800
#  Wed Oct 31 16:35:48 2018 +0530
# Node ID f0d02ca443adf8ff90ed61552d35347ff51c8e90
# Parent  fd517ae68f93dbfdd1bff45a9dd8e626523542b6
fix Issue #442: linking issue on non x86 platform

diff -r fd517ae68f93 -r f0d02ca443ad source/common/cpu.cpp
--- a/source/common/cpu.cpp	Tue Sep 25 16:02:31 2018 +0530
+++ b/source/common/cpu.cpp	Wed Oct 31 16:35:48 2018 +0530
@@ -127,6 +127,7 @@
 {
 return(enable512);
 }
+
 uint32_t cpu_detect(bool benableavx512 )
 {
 
diff -r fd517ae68f93 -r f0d02ca443ad source/common/quant.cpp
--- a/source/common/quant.cpp	Tue Sep 25 16:02:31 2018 +0530
+++ b/source/common/quant.cpp	Wed Oct 31 16:35:48 2018 +0530
@@ -723,6 +723,7 @@
 X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
 uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
 uint32_t blkPos  = codeParams.scan[scanPosBase];
+#if X265_ARCH_X86
 bool enable512 = detect512();
 if (enable512)
 primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos);
@@ -731,6 +732,10 @@
 primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff,  costUncoded, , ,blkPos);
 primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos);
 }
+#else
+primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos);
+primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos);
+#endif
 }
 }
 else
@@ -805,8 +810,8 @@
 uint32_t blkPos = codeParams.scan[scanPosBase];
 if (usePsyMask)
 {
+#if X265_ARCH_X86
 bool enable512 = detect512();
-
 if (enable512)
 primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos);
 else
@@ -814,6 +819,10 @@
 primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos);
 primitives.cu[log2TrSize

Re: [x265] [PATCH] fix Issue #442: linking issue on non x86 platform

2018-10-31 Thread Praveen Tiwari

Thanks! I messed up the syntax.

On Wed, Oct 31, 2018 at 5:45 PM Andrey Semashev 
wrote:

> On 10/31/18 2:33 PM, prav...@multicorewareinc.com wrote:
> > # HG changeset patch
> > # User Praveen Tiwari 
> > # Date 1540983948 -19800
> > #  Wed Oct 31 16:35:48 2018 +0530
> > # Node ID 1c878790edea64186edabcd40fb3df121f536311
> > # Parent  fd517ae68f93dbfdd1bff45a9dd8e626523542b6
> > fix Issue #442: linking issue on non x86 platform
> >
> > diff -r fd517ae68f93 -r 1c878790edea source/common/cpu.cpp
> > --- a/source/common/cpu.cpp   Tue Sep 25 16:02:31 2018 +0530
> > +++ b/source/common/cpu.cpp   Wed Oct 31 16:35:48 2018 +0530
> > @@ -127,6 +127,7 @@
> >   {
> >   return(enable512);
> >   }
> > +
> >   uint32_t cpu_detect(bool benableavx512 )
> >   {
> >
> > diff -r fd517ae68f93 -r 1c878790edea source/common/quant.cpp
> > --- a/source/common/quant.cpp Tue Sep 25 16:02:31 2018 +0530
> > +++ b/source/common/quant.cpp Wed Oct 31 16:35:48 2018 +0530
> > @@ -723,6 +723,7 @@
> >   X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff
> failure\n");
> >   uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
> >   uint32_t blkPos  = codeParams.scan[scanPosBase];
> > +#if X265_ARCH_X86
> >   bool enable512 = detect512();
> >   if (enable512)
> >   primitives.cu[log2TrSize -
> 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded,
> , , , blkPos);
> > @@ -731,6 +732,10 @@
> >   primitives.cu[log2TrSize -
> 2].psyRdoQuant_1p(m_resiDctCoeff,  costUncoded, ,
> ,blkPos);
> >   primitives.cu[log2TrSize -
> 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded,
> , , , blkPos);
> >   }
> > +#elif
>
> #else? Everywhere else, too.
>
> > +primitives.cu[log2TrSize -
> 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, ,
> , blkPos);
> > +primitives.cu[log2TrSize -
> 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded,
> , , , blkPos);
> > +#endif
> >   }
> >   }
> >   else
> > @@ -805,8 +810,8 @@
> >   uint32_t blkPos = codeParams.scan[scanPosBase];
> >   if (usePsyMask)
> >   {
> > +#if X265_ARCH_X86
> >   bool enable512 = detect512();
> > -
> >   if (enable512)
> >   primitives.cu[log2TrSize -
> 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded,
> , , , blkPos);
> >   else
> > @@ -814,6 +819,10 @@
> >   primitives.cu[log2TrSize -
> 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, ,
> , blkPos);
> >   primitives.cu[log2TrSize -
> 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded,
> , , , blkPos);
> >   }
> > +#elif
> > +primitives.cu[log2TrSize -
> 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, ,
> , blkPos);
> > +primitives.cu[log2TrSize -
> 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded,
> , , , blkPos);
> > +#endif
> >   blkPos = codeParams.scan[scanPosBase];
> >   for (int y = 0; y < MLS_CG_SIZE; y++)
> >   {
> >
> >
> > ___
> > x265-devel mailing list
> > x265-devel@videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
> >
>
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] fix Issue #442: linking issue on non x86 platform

2018-10-31 Thread praveen

# HG changeset patch
# User Praveen Tiwari 
# Date 1540983948 -19800
#  Wed Oct 31 16:35:48 2018 +0530
# Node ID 1c878790edea64186edabcd40fb3df121f536311
# Parent  fd517ae68f93dbfdd1bff45a9dd8e626523542b6
fix Issue #442: linking issue on non x86 platform

diff -r fd517ae68f93 -r 1c878790edea source/common/cpu.cpp
--- a/source/common/cpu.cpp Tue Sep 25 16:02:31 2018 +0530
+++ b/source/common/cpu.cpp Wed Oct 31 16:35:48 2018 +0530
@@ -127,6 +127,7 @@
 {
 return(enable512);
 }
+
 uint32_t cpu_detect(bool benableavx512 )
 {
 
diff -r fd517ae68f93 -r 1c878790edea source/common/quant.cpp
--- a/source/common/quant.cpp   Tue Sep 25 16:02:31 2018 +0530
+++ b/source/common/quant.cpp   Wed Oct 31 16:35:48 2018 +0530
@@ -723,6 +723,7 @@
 X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
 uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
 uint32_t blkPos  = codeParams.scan[scanPosBase];
+#if X265_ARCH_X86
 bool enable512 = detect512();
 if (enable512)
 primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, 
m_fencDctCoeff, costUncoded, , , , 
blkPos);
@@ -731,6 +732,10 @@
 primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff,  
costUncoded, , ,blkPos);
 primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, 
m_fencDctCoeff, costUncoded, , , , 
blkPos);
 }
+#elif
+primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, 
costUncoded, , , blkPos);
+primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, 
m_fencDctCoeff, costUncoded, , , , 
blkPos);
+#endif
 }
 }
 else
@@ -805,8 +810,8 @@
 uint32_t blkPos = codeParams.scan[scanPosBase];
 if (usePsyMask)
 {
+#if X265_ARCH_X86
 bool enable512 = detect512();
-
 if (enable512)
 primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, 
m_fencDctCoeff, costUncoded, , , , 
blkPos);
 else
@@ -814,6 +819,10 @@
 primitives.cu[log2TrSize - 
2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , 
blkPos);
 primitives.cu[log2TrSize - 
2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, 
, , , blkPos);
 }
+#elif
+primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, 
costUncoded, , , blkPos);
+primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, 
m_fencDctCoeff, costUncoded, , , , 
blkPos);
+#endif
 blkPos = codeParams.scan[scanPosBase];
 for (int y = 0; y < MLS_CG_SIZE; y++)
 {
# HG changeset patch
# User Praveen Tiwari 
# Date 1540983948 -19800
#  Wed Oct 31 16:35:48 2018 +0530
# Node ID 1c878790edea64186edabcd40fb3df121f536311
# Parent  fd517ae68f93dbfdd1bff45a9dd8e626523542b6
fix Issue #442: linking issue on non x86 platform

diff -r fd517ae68f93 -r 1c878790edea source/common/cpu.cpp
--- a/source/common/cpu.cpp	Tue Sep 25 16:02:31 2018 +0530
+++ b/source/common/cpu.cpp	Wed Oct 31 16:35:48 2018 +0530
@@ -127,6 +127,7 @@
 {
 return(enable512);
 }
+
 uint32_t cpu_detect(bool benableavx512 )
 {
 
diff -r fd517ae68f93 -r 1c878790edea source/common/quant.cpp
--- a/source/common/quant.cpp	Tue Sep 25 16:02:31 2018 +0530
+++ b/source/common/quant.cpp	Wed Oct 31 16:35:48 2018 +0530
@@ -723,6 +723,7 @@
 X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
 uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
 uint32_t blkPos  = codeParams.scan[scanPosBase];
+#if X265_ARCH_X86
 bool enable512 = detect512();
 if (enable512)
 primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos);
@@ -731,6 +732,10 @@
 primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff,  costUncoded, , ,blkPos);
 primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos);
 }
+#elif
+primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos);
+primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos);
+#endif
 }
 }
 else
@@ -805,8 +810,8 @@
 uint32_t blkPos = codeParams.scan[scanPosBase];
 if (usePsyMask)
 {
+#if X265_ARCH_X86
 bool enable512 = detect512();
-
 if (enable512)
 primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos);
 else
@@ -814,6 +819,10 @@
 primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos);
 primitives.cu[log2TrSize

Re: [x265] Original C++ code used for sad functions' assembly code in COST_MV?

2018-09-05 Thread Praveen Tiwari

Hello Jeffrey,

You can find all C primitives in source/common folder.

SAD C primitives are in source/common/pixel.cpp.


Thanks,
Praveen

On Wed, Sep 5, 2018 at 12:23 PM, Mario *LigH* Rohkrämer 
wrote:

> Jeffrey Chen schrieb am 04.09.2018 um 23:57:
>
>> Hi, I would like to configure the sad function in COST_MV for another
>> platform. However, the assembly code would not be supported on the other
>> platform. Where can I find the original programming language code that was
>> made into the assembly language code?
>>
>
> Hi Jeffrey.
>
> I'm not a developer, just guessing:
>
> source/encoder/motion.cpp line 234 #defines a loop.
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] nits

2018-08-29 Thread praveen

# HG changeset patch
# User Praveen Tiwari 
# Date 1535537469 -19800
#  Wed Aug 29 15:41:09 2018 +0530
# Node ID c4b7f40d4747c000fafc96c6331aaf312243b586
# Parent  683defcf536ad5d4e5994dc39efb48de5fec8648
nits

diff -r 683defcf536a -r c4b7f40d4747 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cppTue Aug 28 18:57:01 2018 +0530
+++ b/source/encoder/encoder.cppWed Aug 29 15:41:09 2018 +0530
@@ -2516,7 +2516,7 @@
 vui.defaultDisplayWindow.bottomOffset = 
m_param->vui.defDispWinBottomOffset;
 vui.defaultDisplayWindow.leftOffset = m_param->vui.defDispWinLeftOffset;
 
-   vui.frameFieldInfoPresentFlag = !!m_param->interlaceMode || 
(m_param->pictureStructure >= 0);
+vui.frameFieldInfoPresentFlag = !!m_param->interlaceMode || 
(m_param->pictureStructure >= 0);
 vui.fieldSeqFlag = !!m_param->interlaceMode;
 
 vui.hrdParametersPresentFlag = m_param->bEmitHRDSEI;
diff -r 683defcf536a -r c4b7f40d4747 source/encoder/sei.h
--- a/source/encoder/sei.h  Tue Aug 28 18:57:01 2018 +0530
+++ b/source/encoder/sei.h  Wed Aug 29 15:41:09 2018 +0530
@@ -304,15 +304,15 @@
 int m_preferredTransferCharacteristics;
 SEIAlternativeTC()
 {
-   m_payloadType = ALTERNATIVE_TRANSFER_CHARACTERISTICS;
-   m_payloadSize = 0;
-   m_preferredTransferCharacteristics = -1;
-   }   
-   
-   void writeSEI(const SPS&)
-   {
-   WRITE_CODE(m_preferredTransferCharacteristics, 8, "Preferred 
transfer characteristics");
-   }
+m_payloadType = ALTERNATIVE_TRANSFER_CHARACTERISTICS;
+m_payloadSize = 0;
+m_preferredTransferCharacteristics = -1;
+}
+
+void writeSEI(const SPS&)
+{
+WRITE_CODE(m_preferredTransferCharacteristics, 8, "Preferred transfer 
characteristics");
+}
 };
 
 }
# HG changeset patch
# User Praveen Tiwari 
# Date 1535537469 -19800
#  Wed Aug 29 15:41:09 2018 +0530
# Node ID c4b7f40d4747c000fafc96c6331aaf312243b586
# Parent  683defcf536ad5d4e5994dc39efb48de5fec8648
nits

diff -r 683defcf536a -r c4b7f40d4747 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Tue Aug 28 18:57:01 2018 +0530
+++ b/source/encoder/encoder.cpp	Wed Aug 29 15:41:09 2018 +0530
@@ -2516,7 +2516,7 @@
 vui.defaultDisplayWindow.bottomOffset = m_param->vui.defDispWinBottomOffset;
 vui.defaultDisplayWindow.leftOffset = m_param->vui.defDispWinLeftOffset;
 
-	vui.frameFieldInfoPresentFlag = !!m_param->interlaceMode || (m_param->pictureStructure >= 0);
+vui.frameFieldInfoPresentFlag = !!m_param->interlaceMode || (m_param->pictureStructure >= 0);
 vui.fieldSeqFlag = !!m_param->interlaceMode;
 
 vui.hrdParametersPresentFlag = m_param->bEmitHRDSEI;
diff -r 683defcf536a -r c4b7f40d4747 source/encoder/sei.h
--- a/source/encoder/sei.h	Tue Aug 28 18:57:01 2018 +0530
+++ b/source/encoder/sei.h	Wed Aug 29 15:41:09 2018 +0530
@@ -304,15 +304,15 @@
 int m_preferredTransferCharacteristics;
 SEIAlternativeTC()
 {
-	m_payloadType = ALTERNATIVE_TRANSFER_CHARACTERISTICS;
-		m_payloadSize = 0;
-		m_preferredTransferCharacteristics = -1;
-	}	
-	
-	void writeSEI(const SPS&)
-	{
-	WRITE_CODE(m_preferredTransferCharacteristics, 8, "Preferred transfer characteristics");
-	}
+m_payloadType = ALTERNATIVE_TRANSFER_CHARACTERISTICS;
+m_payloadSize = 0;
+m_preferredTransferCharacteristics = -1;
+}
+
+void writeSEI(const SPS&)
+{
+WRITE_CODE(m_preferredTransferCharacteristics, 8, "Preferred transfer characteristics");
+}
 };
 
 }
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] Patch for issue #422 - Credit to Dimitry Andric

2018-08-16 Thread praveen

# HG changeset patch
# User Praveen Tiwari 
# Date 1534424221 -19800
#  Thu Aug 16 18:27:01 2018 +0530
# Node ID 88ee12651e3031dc1fc2f3f6a8bbac5f67839579
# Parent  cbc24109c1c849c027b5f087c6ff5f2087cb7301
Patch for issue #422 - Credit to Dimitry Andric.

This is due to undefined behavior in cuTreeFix8Pack(), where a double
value is cast directly to uint16_t.  If the double value is negative,
the resulting value from the cast is undefined.

diff -r cbc24109c1c8 -r 88ee12651e30 source/common/pixel.cpp
--- a/source/common/pixel.cpp   Tue Aug 14 18:01:51 2018 +0530
+++ b/source/common/pixel.cpp   Thu Aug 16 18:27:01 2018 +0530
@@ -922,7 +922,7 @@
 static void cuTreeFix8Pack(uint16_t *dst, double *src, int count)
 {
 for (int i = 0; i < count; i++)
-dst[i] = (uint16_t)(src[i] * 256.0);
+dst[i] = (uint16_t)(int16_t)(src[i] * 256.0);
 }
 
 static void cuTreeFix8Unpack(double *dst, uint16_t *src, int count)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] Code performance issue

2018-06-04 Thread Praveen Tiwari

Hello Min,

Thanks for the suggestion, we will run some tests and let you know if any
change is required here. Thanks.


Regards,
Praveen Tiwari



On Sat, Jun 2, 2018 at 9:18 AM, chen  wrote:

> There have series performance issues, such as,
>
> uint32_t sum = (uint32_t)pow((outOfBound >> 2), 2);
>
> Are you want to get square value from a small integer?
>
>
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] [PATCH] threadpool.cpp: use WIN system call for popcount

2018-05-03 Thread Praveen Tiwari

It is just counting cpusPerNode, so the 64-bit number is not required. But
yes, I missed the fact that the instruction is supported only on some CPUs.
A lookup-table-based implementation could have been the fastest due to better
caching, but this code is not called frequently, so we can keep it as it is. Thanks.

On Thu, May 3, 2018 at 11:24 PM, Andrey Semashev <andrey.semas...@gmail.com>
wrote:

> On Thu, May 3, 2018 at 7:37 PM, Pradeep Ramachandran
> <prad...@multicorewareinc.com> wrote:
> >
> > On Thu, May 3, 2018 at 2:23 PM, <prav...@multicorewareinc.com> wrote:
> >>
> >> # HG changeset patch
> >> # User Praveen Tiwari <prav...@multicorewareinc.com>
> >> # Date 1525328839 -19800
> >> #  Thu May 03 11:57:19 2018 +0530
> >> # Branch stable
> >> # Node ID 9cbb2aadcca3a2f7a308ea1dc792fb817bcc5b51
> >> # Parent  69aafa6d70ad4e151f4590766c6b125621c5d007
> >> threadpool.cpp: use WIN system call for popcount
> >
> >
> > Unless this fixes a known bug, I don't want to push this directly into
> > stable. Syscalls are notorious especially when working with older
> versions
> > of the OS.
> > I would rather push this into default and allow users to test that this
> > works with all kinds of systems and then merge with stable once the
> answer
> > is known.
> > Does this fix a specific issue on some platform, or improve performance?
>
> The comment is not quite right, __popcnt is not a syscall but an
> MSVC-specific intrinsic.
>
> https://msdn.microsoft.com/en-us/library/bb385231.aspx
>
> The equivalent gcc intrinsic is __builtin_popcount and friends.
>
> I think, the patch is buggy because the relevant field is a 64-bit
> integer on 64-bit Windows and __popcnt is 32-bit.
>
> Note also that the popcount instruction only available in ABM ISA
> extension. In Intel CPUs it is available since Nehalem.
>
> >> diff -r 69aafa6d70ad -r 9cbb2aadcca3 source/common/threadpool.cpp
> >> --- a/source/common/threadpool.cpp  Wed May 02 15:15:05 2018 +0530
> >> +++ b/source/common/threadpool.cpp  Thu May 03 11:57:19 2018 +0530
> >> @@ -71,21 +71,6 @@
> >>  # define strcasecmp _stricmp
> >>  #endif
> >>
> >> -#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
> >> -const uint64_t m1 = 0x5555555555555555; //binary: 0101...
> >> -const uint64_t m2 = 0x3333333333333333; //binary: 00110011..
> >> -const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary:  4 zeros,  4 ones ...
> >> -const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power
> of
> >> 0,1,2,3...
> >> -
> >> -static int popCount(uint64_t x)
> >> -{
> >> -x -= (x >> 1) & m1;
> >> -x = (x & m2) + ((x >> 2) & m2);
> >> -x = (x + (x >> 4)) & m3;
> >> -return (x * h01) >> 56;
> >> -}
> >> -#endif
> >> -
> >>  namespace X265_NS {
> >>  // x265 private namespace
> >>
> >> @@ -274,7 +259,7 @@
> >>  for (int i = 0; i < numNumaNodes; i++)
> >>  {
> >>  GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
> >> -cpusPerNode[i] = popCount(groupAffinityPointer->Mask);
> >> +cpusPerNode[i] = __popcnt(static_cast<unsigned int>(groupAffinityPointer->Mask));
> >>  }
> >>  delete groupAffinityPointer;
> >>  #elif HAVE_LIBNUMA
> >> @@ -623,7 +608,7 @@
> >>  for (int i = 0; i < numNumaNodes; i++)
> >>  {
> >>  GetNumaNodeProcessorMaskEx((UCHAR)i, &groupAffinity);
> >> -cpus += popCount(groupAffinity.Mask);
> >> +cpus += __popcnt(static_cast<unsigned int>(groupAffinity.Mask));
> >>  }
> >>  return cpus;
> >>  #elif _WIN32
> >> ___
> >> x265-devel mailing list
> >> x265-devel@videolan.org
> >> https://mailman.videolan.org/listinfo/x265-devel
> >
> >
> >
> > ___
> > x265-devel mailing list
> > x265-devel@videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
> >
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] threadpool.cpp: use WIN system call for popcount

2018-05-03 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1525328839 -19800
#  Thu May 03 11:57:19 2018 +0530
# Branch stable
# Node ID 9cbb2aadcca3a2f7a308ea1dc792fb817bcc5b51
# Parent  69aafa6d70ad4e151f4590766c6b125621c5d007
threadpool.cpp: use WIN system call for popcount

diff -r 69aafa6d70ad -r 9cbb2aadcca3 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Wed May 02 15:15:05 2018 +0530
+++ b/source/common/threadpool.cpp  Thu May 03 11:57:19 2018 +0530
@@ -71,21 +71,6 @@
 # define strcasecmp _stricmp
 #endif
 
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
-const uint64_t m1 = 0x5555555555555555; //binary: 0101...
-const uint64_t m2 = 0x3333333333333333; //binary: 00110011..
-const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary:  4 zeros,  4 ones ...
-const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of 
0,1,2,3...
-
-static int popCount(uint64_t x)
-{
-x -= (x >> 1) & m1;
-x = (x & m2) + ((x >> 2) & m2);
-x = (x + (x >> 4)) & m3;
-return (x * h01) >> 56;
-}
-#endif
-
 namespace X265_NS {
 // x265 private namespace
 
@@ -274,7 +259,7 @@
 for (int i = 0; i < numNumaNodes; i++)
 {
 GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
-cpusPerNode[i] = popCount(groupAffinityPointer->Mask);
+cpusPerNode[i] = __popcnt(static_cast<unsigned int>(groupAffinityPointer->Mask));
 }
 delete groupAffinityPointer;
 #elif HAVE_LIBNUMA
@@ -623,7 +608,7 @@
 for (int i = 0; i < numNumaNodes; i++)
 {
 GetNumaNodeProcessorMaskEx((UCHAR)i, &groupAffinity);
-cpus += popCount(groupAffinity.Mask);
+cpus += __popcnt(static_cast<unsigned int>(groupAffinity.Mask));
 }
 return cpus;
 #elif _WIN32
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] [PATCH 000 of 307 ] AVX-512 implementataion in x265: breaks 32-bit compilation

2018-04-11 Thread Praveen Tiwari

Thanks for reporting. We are looking into the issue and will send a fix soon.

Regards,
Praveen Tiwari

On Thu, Apr 12, 2018 at 2:31 AM, Mario Rohkrämer <cont...@ligh.de> wrote:

> Am 07.04.2018, 04:29 Uhr, schrieb <mythr...@multicorewareinc.com>:
>
> This series of patches enables AVX-512 in x265. USe CLI option --asm
>> avx512 to enable AVX-512 kernels.
>> ___
>> x265-devel mailing list
>> x265-devel@videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>
>
> Compiling x265 for Win32 target (here in MSYS2/MinGW32) is not possible
> anymore.
>
> Assembler code was still available for 8-bit depth core, at least. But:
>
> +
> [ 13%] Building ASM_NASM object common/CMakeFiles/common.dir/x
> 86/pixel-util8.asm.obj
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1867: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1880: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1880: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1880: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1880: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1941: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> H:/development/media-autobuild_suite-master/build/x265-hg/
> source/common/x86/pixel-util8.asm:1954: error: invalid combination of
> opcode and operands
> make[2]: *** [common/CMakeFiles/common.dir/build.make:159:
> common/CMakeFiles/common.dir/x86/pixel-util8.asm.obj] Error 1
> make[1]: *** [CMakeFiles/Makefile2:449: common/CMakeFiles/common.dir/all]
> Error 2
> make: *** [Makefile:130: all] Error 2
> +
>
> Trying to compile AVX-512 instructions may have to be avoided in 32-bit
> architecture mode (because there is surely no 32-bit only CPU supporting
> this instruction set extension).
>
> --
>
> Fun and success!
>

Re: [x265] [PATCH 000 of 307 ] AVX-512 implementataion in x265

2018-04-06 Thread Praveen Tiwari

We are working on your request and will share the performance-related
details soon. Thanks.

Regards,
Praveen Tiwari

On Fri, Apr 6, 2018 at 9:36 PM, Vittorio Giovara <vittorio.giov...@gmail.com
> wrote:

> just curious, what kind of general speed improvement does this give?
> I could have missed them in the series, but it would be nice to have some
> sort of benchmarks
> thanks
> Vittorio
>
> On Sat, Apr 7, 2018 at 4:29 AM, <mythr...@multicorewareinc.com> wrote:
>
>> This series of patches enables AVX-512 in x265. USe CLI option --asm
>> avx512 to enable AVX-512 kernels.
>> ___
>> x265-devel mailing list
>> x265-devel@videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>
>
>
> --
> Vittorio
>
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] x86: split ipfilter8 kernels into two different source file

2018-02-20 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1516343663 -19800
#  Fri Jan 19 12:04:23 2018 +0530
# Node ID 55a15ecc1110f206199db1b0f997272b5f7ddc82
# Parent  52782aeb20818273cbf749d221647a254b26c4a4
x86: split ipfilter8 kernels into two different source file

This patch implements infrastructure to split the ipfilter8 asm source file into two
different files in order to avoid long build times. It moves the interp_8tap_horizontal
kernels to the newly created file.

diff -r 52782aeb2081 -r 55a15ecc1110 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt  Fri Feb 16 11:40:59 2018 +0530
+++ b/source/common/CMakeLists.txt  Fri Jan 19 12:04:23 2018 +0530
@@ -56,17 +56,15 @@
 endif()
 set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES})
 source_group(Intrinsics FILES ${VEC_PRIMITIVES})
-
-set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h 
loopfilter.h seaintegral.h)
+set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h 
h-ipfilter8.h loopfilter.h seaintegral.h)
 set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm
mc-a2.asm pixel-util8.asm blockcopy8.asm
pixeladd8.asm dct8.asm seaintegral.asm)
 if(HIGH_BIT_DEPTH)
 set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm 
loopfilter.asm)
 else()
-set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm 
ipfilter8.asm loopfilter.asm)
+set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm 
h-ipfilter8.asm ipfilter8.asm loopfilter.asm)
 endif()
-
 if(NOT X64)
 set(A_SRCS ${A_SRCS} pixel-32.asm)
 endif()
diff -r 52782aeb2081 -r 55a15ecc1110 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Feb 16 11:40:59 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Jan 19 12:04:23 2018 +0530
@@ -115,8 +115,8 @@
 #include "intrapred.h"
 #include "dct8.h"
 #include "seaintegral.h"
+#include "h-ipfilter8.h"
 }
-
 #define ALL_LUMA_CU_TYPED(prim, fncdef, fname, cpu) \
 p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
 p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
diff -r 52782aeb2081 -r 55a15ecc1110 source/common/x86/h-ipfilter8.asm
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/h-ipfilter8.asm Fri Jan 19 12:04:23 2018 +0530
@@ -0,0 +1,267 @@
+;*
+;* Copyright (C) 2013-2017 MulticoreWare, Inc
+;*
+;* Authors: Min Chen <chenm...@163.com>
+;*  Nabajit Deka <naba...@multicorewareinc.com>
+;*  Praveen Kumar Tiwari <prav...@multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at license @ x265.com.
+;*/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+
+const h_tabw_LumaCoeff,  dw   0, 0,  0,  64,  0,   0,  0,  0
+   dw  -1, 4, -10, 58,  17, -5,  1,  0
+   dw  -1, 4, -11, 40,  40, -11, 4, -1
+   dw   0, 1, -5,  17,  58, -10, 4, -1
+
+SECTION .text
+
+cextern pw_32
+cextern pw_2000
+
+%macro FILTER_H8_W8_sse2 0
+movhm1, [r0 + x - 3]
+movhm4, [r0 + x - 2]
+punpcklbw   m1, m6
+punpcklbw   m4, m6
+movhm5, [r0 + x - 1]
+movhm0, [r0 + x]
+punpcklbw   m5, m6
+punpcklbw   m0, m6
+pmaddwd m1, m3
+pmaddwd m4, m3
+pmaddwd m5, m3
+pmaddwd m0, m3
+packssdwm1, m4
+packssdwm5, m0
+pshuflw m4, m1, q2301
+pshufhw m4, m4, q2301
+pshuflw m0, m5, q2301
+pshufhw m0, m0, q2301
+paddw   m1, m4
+paddw   m5, m0
+psrldq  m1, 2
+psrldq  m5, 2
+pshufd  m1, m1, q3120
+pshufd  m5, m5, q3120
+punpcklqdq  m1, m5
+movhm7, [r0 + x + 1]
+movhm4, [r0 + x + 2]
+punpcklbw   m7, m6
+punpcklbw   m4, m6
+mo

[x265] [PATCH] dct32 AVX512 Kernel

2017-11-29 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1512003711 28800
#  Wed Nov 29 17:01:51 2017 -0800
# Branch avx-512
# Node ID 96c57dd05464126451ae2100efe4c4b759390311
# Parent  82a58ec0b04a870dac11ae253c30a15a3002419e
dct32 AVX512 Kernel

diff -r 82a58ec0b04a -r 96c57dd05464 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Nov 28 13:51:06 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Nov 29 17:01:51 2017 -0800
@@ -5005,8 +5005,9 @@
 p.pu[LUMA_64x32].luma_vpp = PFX(interp_8tap_vert_pp_64x32_avx512);
 p.pu[LUMA_64x16].luma_vpp = PFX(interp_8tap_vert_pp_64x16_avx512);
 
-p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
-p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
+p.cu[BLOCK_8x8].dct= PFX(dct8_avx512);
+p.cu[BLOCK_32x32].dct  = PFX(dct32_avx512);
+p.cu[BLOCK_8x8].idct   = PFX(idct8_avx512);
 p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
 p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
 p.quant = PFX(quant_avx512);
diff -r 82a58ec0b04a -r 96c57dd05464 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asmTue Nov 28 13:51:06 2017 +0530
+++ b/source/common/x86/dct8.asmWed Nov 29 17:01:51 2017 -0800
@@ -30,16 +30,61 @@
 %include "x86util.asm"
 SECTION_RODATA 64
 
+tab_dct32:  dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 
 4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, 
-90
+dw 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, 
-80, -87, -90, -90, -87, -80, -70, -57, -43, -25, -9,  9, 25, 43, 57, 70, 80, 
87, 90
+dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, 
-61, -38, -13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31,  4, -22, -46, -67, -82, 
-90
+dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 
50, 75, 89, 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 
89
+dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 
85, 61, 22, -22, -61, -85, -90, -73, -38,  4, 46, 78, 90, 82, 54, 13, -31, -67, 
-88
+dw 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, 
-9, -57, -87, -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43,  9, 
57, 87
+dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, 
-90, -78, -31, 31, 78, 90, 61,  4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, 
-85
+dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, 
-36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 
36, 83
+dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67,  4, 
73, 88, 38, -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, 
-82
+dw 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 
70, -9, -80, -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70,  9, 
80
+dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, 
-38, -90, -46, 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82,  
4, -78
+dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, 
-89, -18, 75, 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, 
-18, 75
+dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, 
-4, 85, 54, -54, -85,  4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, 
-73
+dw 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 
87, 43, -70, -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90,  9, -87, -43, 
70
+dw 67, -54, -78, 38, 85, -22, -90,  4, 90, 13, -88, -31, 82, 
46, -73, -61, 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, 
-67
+dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, 
-64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, 
-64, 64
+dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, 
-78, 54, 67, -67, -54, 78, 38, -85, -22, 90,  4, -90, 13, 88, -31, -82, 46, 73, 
-61
+dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 
25, 80, -57, -57, 80, 25, -90,  9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 
57
+dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 
90, -31, -73, 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88,  4, 85, 
-54
+dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 
18, -89, 50, 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, 
-89, 50
+dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, 
-82,  4, 78, -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 
90, -46
+dw 43, -90,

[x265] [PATCH] quant.cpp: use 'rdoQuant_c' primitive to optimize rdoQuant path

2017-11-28 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1511851222 -19800
#  Tue Nov 28 12:10:22 2017 +0530
# Node ID d732ca2095defdbf42748327006083befb30a89e
# Parent  4d242c555d14ca8214d9da89cef41c4418af4dca
quant.cpp: use 'rdoQuant_c' primitive to optimize rdoQuant path

diff -r 4d242c555d14 -r d732ca2095de source/common/quant.cpp
--- a/source/common/quant.cpp   Tue Nov 28 11:43:00 2017 +0530
+++ b/source/common/quant.cpp   Tue Nov 28 12:10:22 2017 +0530
@@ -803,20 +803,14 @@
 
 if (usePsyMask)
 {
-// TODO: we can't SIMD optimize because PSYVALUE need 64-bits 
multiplication, convert to Double can work faster by FMA
+// Expected to work faster by FMA SIMD
+primitives.rdoQuant(m_resiDctCoeff, m_fencDctCoeff, 
costUncoded, &totalUncodedCost, &totalRdCost, psyScale, blkPos, log2TrSize);
+blkPos = codeParams.scan[scanPosBase];
+
 for (int y = 0; y < MLS_CG_SIZE; y++)
 {
 for (int x = 0; x < MLS_CG_SIZE; x++)
 {
-int signCoef = m_resiDctCoeff[blkPos + x]; 
   /* pre-quantization DCT coeff */
-int predictedCoef= m_fencDctCoeff[blkPos + x] - 
signCoef; /* predicted DCT = source DCT - residual DCT*/
-costUncoded[blkPos + x] = 
static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
-/* when no residual coefficient is coded, predicted 
coef == recon coef */
-costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
-
-totalUncodedCost += costUncoded[blkPos + x];
-totalRdCost += costUncoded[blkPos + x];
-
 const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
 const uint32_t ctxSig = 
table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + 
ctxSigOffset;
 X265_CHECK(trSize > 4, "trSize check failure\n");
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] quant.cpp: use 'nonPsyRdoQuant_c' primitive to optimize rdoQuant path

2017-11-28 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1511855234 -19800
#  Tue Nov 28 13:17:14 2017 +0530
# Node ID 85970193df47aa5da685efc27aaef04d9f7f21a0
# Parent  d732ca2095defdbf42748327006083befb30a89e
quant.cpp: use 'nonPsyRdoQuant_c' primitive to optimize rdoQuant path

diff -r d732ca2095de -r 85970193df47 source/common/quant.cpp
--- a/source/common/quant.cpp   Tue Nov 28 12:10:22 2017 +0530
+++ b/source/common/quant.cpp   Tue Nov 28 13:17:14 2017 +0530
@@ -824,16 +824,14 @@
 }
 else
 {
-// non-psy path
+// non-psy path - expected to work faster by FMA SIMD
+primitives.nonPsyRdoQuant(m_resiDctCoeff, costUncoded, 
&totalUncodedCost, &totalRdCost, blkPos, log2TrSize);
+blkPos = codeParams.scan[scanPosBase];
+
 for (int y = 0; y < MLS_CG_SIZE; y++)
 {
 for (int x = 0; x < MLS_CG_SIZE; x++)
 {
-int signCoef = m_resiDctCoeff[blkPos + x];
/* pre-quantization DCT coeff */
-costUncoded[blkPos + x] = 
static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
-totalUncodedCost += costUncoded[blkPos + x];
-totalRdCost += costUncoded[blkPos + x];
-
 const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
 const uint32_t ctxSig = 
table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + 
ctxSigOffset;
 X265_CHECK(trSize > 4, "trSize check failure\n");
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] quant.cpp: 'nonPsyRdoQuant_c' primitive for SIMD optimization

2017-11-27 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1511849580 -19800
#  Tue Nov 28 11:43:00 2017 +0530
# Node ID 4d242c555d14ca8214d9da89cef41c4418af4dca
# Parent  dfd4951a93744f3d732cb4645abd2fd87eded750
quant.cpp: 'nonPsyRdoQuant_c' primitive for SIMD optimization

This particular section of code appears to be a bottleneck in many profiles, as
it involves 64-bit multiplication operations. For SIMD optimization we need to
convert a few buffers/variables to double.

diff -r dfd4951a9374 -r 4d242c555d14 source/common/dct.cpp
--- a/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530
+++ b/source/common/dct.cpp Tue Nov 28 11:43:00 2017 +0530
@@ -1010,6 +1010,26 @@
 }
 }
 
+static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, double *costUncoded, 
double *totalUncodedCost, double *totalRdCost, uint32_t blkPos, uint32_t 
log2TrSize)
+{
+const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; 
/* Represents scaling through forward transform */
+const int scaleBits = SCALE_BITS - 2 * transformShift;
+const uint32_t trSize = 1 << log2TrSize;
+
+for (int y = 0; y < MLS_CG_SIZE; y++)
+{
+for (int x = 0; x < MLS_CG_SIZE; x++)
+{
+int signCoef = m_resiDctCoeff[blkPos + x];/* 
pre-quantization DCT coeff */
+costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * 
signCoef) << scaleBits);
+
+*totalUncodedCost += costUncoded[blkPos + x];
+*totalRdCost += costUncoded[blkPos + x];
+}
+blkPos += trSize;
+}
+}
+
 namespace X265_NS {
 // x265 private namespace
 void setupDCTPrimitives_c(EncoderPrimitives& p)
@@ -1019,6 +1039,7 @@
 p.quant = quant_c;
 p.nquant = nquant_c;
 p.rdoQuant = rdoQuant_c;
+p.nonPsyRdoQuant = nonPsyRdoQuant_c;
 p.dst4x4 = dst4_c;
 p.cu[BLOCK_4x4].dct   = dct4_c;
 p.cu[BLOCK_8x8].dct   = dct8_c;
diff -r dfd4951a9374 -r 4d242c555d14 source/common/primitives.h
--- a/source/common/primitives.hMon Nov 20 14:17:36 2017 +0530
+++ b/source/common/primitives.hTue Nov 28 11:43:00 2017 +0530
@@ -216,6 +216,8 @@
 typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
 typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
 typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, 
double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t 
psyScale, uint32_t blkPos, uint32_t log2TrSize);
+typedef void (*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, double *costUncoded, 
double *totalUncodedCost, double *totalRdCost, uint32_t blkPos, uint32_t 
log2TrSize);
+
 /* Function pointers to optimized encoder primitives. Each pointer can 
reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -303,6 +305,7 @@
 quant_t   quant;
 nquant_t  nquant;
 rdoQuant_trdoQuant;
+nonPsyRdoQuant_t  nonPsyRdoQuant;
 dequant_scaling_t dequant_scaling;
 dequant_normal_t  dequant_normal;
 denoiseDct_t  denoiseDct;
diff -r dfd4951a9374 -r 4d242c555d14 source/common/quant.cpp
--- a/source/common/quant.cpp   Mon Nov 20 14:17:36 2017 +0530
+++ b/source/common/quant.cpp   Tue Nov 28 11:43:00 2017 +0530
@@ -737,17 +737,7 @@
 uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
 uint32_t blkPos  = codeParams.scan[scanPosBase];
 
-for (int y = 0; y < MLS_CG_SIZE; y++)
-{
-for (int x = 0; x < MLS_CG_SIZE; x++)
-{
-int signCoef = m_resiDctCoeff[blkPos + x];/* 
pre-quantization DCT coeff */
-costUncoded[blkPos + x] = 
static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits);
-totalUncodedCost += costUncoded[blkPos + x];
-totalRdCost += costUncoded[blkPos + x];
-}
-blkPos += trSize;
-}
+primitives.nonPsyRdoQuant(m_resiDctCoeff, costUncoded, 
&totalUncodedCost, &totalRdCost, blkPos, log2TrSize);
 }
 }
 
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] quant.cpp: 'rdoQuant_c' primitive for SIMD optimization

2017-11-27 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1511167656 -19800
#  Mon Nov 20 14:17:36 2017 +0530
# Node ID dfd4951a93744f3d732cb4645abd2fd87eded750
# Parent  17bb240012fe990635be621ac261bfd7c9b2d0ba
quant.cpp: 'rdoQuant_c' primitive for SIMD optimization

This particular section of code appears to be a bottleneck in many profiles, as
it involves 64-bit multiplication operations. For SIMD optimization we need to
convert a few buffers/variables to double.

diff -r 17bb240012fe -r dfd4951a9374 source/common/dct.cpp
--- a/source/common/dct.cpp Fri Nov 24 17:23:59 2017 +0100
+++ b/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530
@@ -984,15 +984,41 @@
 return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
 }
 
+static void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, 
double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t 
psyScale, uint32_t blkPos, uint32_t log2TrSize)
+{
+const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; 
/* Represents scaling through forward transform */
+const int scaleBits = SCALE_BITS - 2 * transformShift;
+const uint32_t trSize = 1 << log2TrSize;
+int max = X265_MAX(0, (2 * transformShift + 1));
+
+for (int y = 0; y < MLS_CG_SIZE; y++)
+{
+for (int x = 0; x < MLS_CG_SIZE; x++)
+{
+int64_t signCoef = m_resiDctCoeff[blkPos + x];/* 
pre-quantization DCT coeff */
+int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* 
predicted DCT = source DCT - residual DCT*/
+
+costUncoded[blkPos + x] = static_cast<double>((signCoef * 
signCoef) << scaleBits);
+
+/* when no residual coefficient is coded, predicted coef == recon 
coef */
+costUncoded[blkPos + x] -= static_cast<double>((psyScale * 
(predictedCoef)) >> max);
+
+*totalUncodedCost += costUncoded[blkPos + x];
+*totalRdCost += costUncoded[blkPos + x];
+}
+blkPos += trSize;
+}
+}
+
 namespace X265_NS {
 // x265 private namespace
-
 void setupDCTPrimitives_c(EncoderPrimitives& p)
 {
 p.dequant_scaling = dequant_scaling_c;
 p.dequant_normal = dequant_normal_c;
 p.quant = quant_c;
 p.nquant = nquant_c;
+p.rdoQuant = rdoQuant_c;
 p.dst4x4 = dst4_c;
 p.cu[BLOCK_4x4].dct   = dct4_c;
 p.cu[BLOCK_8x8].dct   = dct8_c;
diff -r 17bb240012fe -r dfd4951a9374 source/common/primitives.h
--- a/source/common/primitives.hFri Nov 24 17:23:59 2017 +0100
+++ b/source/common/primitives.hMon Nov 20 14:17:36 2017 +0530
@@ -213,10 +213,9 @@
 
 typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t 
offset, int32_t tcP, int32_t tcQ);
 typedef void (*pelFilterChroma_t)(pixel* src, intptr_t srcStep, intptr_t 
offset, int32_t tc, int32_t maskP, int32_t maskQ);
-
 typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
 typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
-
+typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, 
double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t 
psyScale, uint32_t blkPos, uint32_t log2TrSize);
 /* Function pointers to optimized encoder primitives. Each pointer can 
reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -301,9 +300,9 @@
  * the CU arrays */
 dct_t dst4x4;
 idct_tidst4x4;
-
 quant_t   quant;
 nquant_t  nquant;
+rdoQuant_trdoQuant;
 dequant_scaling_t dequant_scaling;
 dequant_normal_t  dequant_normal;
 denoiseDct_t  denoiseDct;
diff -r 17bb240012fe -r dfd4951a9374 source/common/quant.cpp
--- a/source/common/quant.cpp   Fri Nov 24 17:23:59 2017 +0100
+++ b/source/common/quant.cpp   Mon Nov 20 14:17:36 2017 +0530
@@ -661,11 +661,9 @@
 #define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
 #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
 #define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift 
+ 1)))
-
 int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
-int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0*/
+double costUncoded[trSize * trSize]; /* d*d + lambda * 0*/
 int64_t costSig[trSize * trSize]; /* lambda * bits   */
-
 int rateIncUp[trSize * trSize];  /* signal overhead of increasing 
level */
 int rateIncDown[trSize * trSize];/* signal overhead of decreasing 
level */
 int sigRateDelta[trSize * trSize];   /* signal difference between zero and 
non-zero */
@@ -675,15 +673,12 @@
 
 const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */
 bool bIsLuma = ttype == TEXT_LUMA;
-
 /* total rate distortion cost of transform block, as

Re: [x265] [PATCH] quant.cpp: 'rdoQuant_c' primitive for SIMD optimization

2017-11-27 Thread Praveen Tiwari

Please ignore this patch; I missed an update. I will resend it soon. Thanks.

On Mon, Nov 27, 2017 at 5:11 PM, <prav...@multicorewareinc.com> wrote:

> # HG changeset patch
> # User Praveen Tiwari <prav...@multicorewareinc.com>
> # Date 1511167656 -19800
> #  Mon Nov 20 14:17:36 2017 +0530
> # Node ID dffb056e5ad0e2298b0dd65d048f4f16d8508566
> # Parent  b24454f3ff6de650aab6835e291837fc4e2a4466
> quant.cpp: 'rdoQuant_c' primitive for SIMD optimization
>
> This particular section of code appears to be a bottleneck in many profiles,
> as it involves 64-bit multiplication operations. For SIMD optimization we
> need to convert a few buffers/variables to double.
>
> diff -r b24454f3ff6d -r dffb056e5ad0 source/common/dct.cpp
> --- a/source/common/dct.cpp Wed Nov 22 22:00:48 2017 +0530
> +++ b/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530
> @@ -984,6 +984,32 @@
>  return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
>  }
>
> +void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double*
> costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t
> psyScale, uint32_t blkPos, uint32_t log2TrSize)
> +{
> +const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH -
> log2TrSize; /* Represents scaling through forward transform */
> +const int scaleBits = SCALE_BITS - 2 * transformShift;
> +const uint32_t trSize = 1 << log2TrSize;
> +int max = X265_MAX(0, (2 * transformShift + 1));
> +
> +for (int y = 0; y < MLS_CG_SIZE; y++)
> +{
> +for (int x = 0; x < MLS_CG_SIZE; x++)
> +{
> +int64_t signCoef = m_resiDctCoeff[blkPos + x];/*
> pre-quantization DCT coeff */
> +int64_t predictedCoef = m_fencDctCoeff[blkPos + x] -
> signCoef; /* predicted DCT = source DCT - residual DCT*/
> +
> +costUncoded[blkPos + x] = static_cast<double>((signCoef *
> signCoef) << scaleBits);
> +
> +/* when no residual coefficient is coded, predicted coef ==
> recon coef */
> +costUncoded[blkPos + x] -= static_cast<double>((psyScale *
> (predictedCoef)) >> max);
> +
> +*totalUncodedCost += costUncoded[blkPos + x];
> +*totalRdCost += costUncoded[blkPos + x];
> +}
> +blkPos += trSize;
> +}
> +}
> +
>  namespace X265_NS {
>  // x265 private namespace
>
> @@ -993,6 +1019,7 @@
>  p.dequant_normal = dequant_normal_c;
>  p.quant = quant_c;
>  p.nquant = nquant_c;
> +p.rdoQuant = rdoQuant_c;
>  p.dst4x4 = dst4_c;
>  p.cu[BLOCK_4x4].dct   = dct4_c;
>  p.cu[BLOCK_8x8].dct   = dct8_c;
> diff -r b24454f3ff6d -r dffb056e5ad0 source/common/primitives.h
> --- a/source/common/primitives.hWed Nov 22 22:00:48 2017 +0530
> +++ b/source/common/primitives.hMon Nov 20 14:17:36 2017 +0530
> @@ -216,6 +216,7 @@
>
>  typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
>  typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
> +typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t*
> m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double*
> totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize);
>
>  /* Function pointers to optimized encoder primitives. Each pointer can
> reference
>   * either an assembly routine, a SIMD intrinsic primitive, or a C
> function */
> @@ -304,6 +305,7 @@
>
>  quant_t   quant;
>  nquant_t  nquant;
> +rdoQuant_trdoQuant;
>  dequant_scaling_t dequant_scaling;
>  dequant_normal_t  dequant_normal;
>  denoiseDct_t  denoiseDct;
> diff -r b24454f3ff6d -r dffb056e5ad0 source/common/quant.cpp
> --- a/source/common/quant.cpp   Wed Nov 22 22:00:48 2017 +0530
> +++ b/source/common/quant.cpp   Mon Nov 20 14:17:36 2017 +0530
> @@ -663,7 +663,7 @@
>  #define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 *
> transformShift + 1)))
>
>  int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
> -int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0*/
> +double costUncoded[trSize * trSize]; /* d*d + lambda * 0*/
>  int64_t costSig[trSize * trSize]; /* lambda * bits   */
>
>  int rateIncUp[trSize * trSize];  /* signal overhead of increasing
> level */
> @@ -677,12 +677,12 @@
>  bool bIsLuma = ttype == TEXT_LUMA;
>
>  /* total rate distortion cost of transform block, as CBF=0 */
> -int64_t totalUncodedCost = 0;
> +double totalUncodedCost = 0;
>
>  /* Total rate distortion cost of this transform block, counting te
> di

[x265] [PATCH] quant.cpp: 'rdoQuant_c' primitive for SIMD optimization

2017-11-27 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1511167656 -19800
#  Mon Nov 20 14:17:36 2017 +0530
# Node ID dffb056e5ad0e2298b0dd65d048f4f16d8508566
# Parent  b24454f3ff6de650aab6835e291837fc4e2a4466
quant.cpp: 'rdoQuant_c' primitive for SIMD optimization

This particular section of code appears to be a bottleneck in many profiles, as
it involves 64-bit multiplication operations. For SIMD optimization we need to
convert a few buffers/variables to double.

diff -r b24454f3ff6d -r dffb056e5ad0 source/common/dct.cpp
--- a/source/common/dct.cpp Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530
@@ -984,6 +984,32 @@
 return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
 }
 
+void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* 
costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, 
uint32_t blkPos, uint32_t log2TrSize)
+{
+const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; 
/* Represents scaling through forward transform */
+const int scaleBits = SCALE_BITS - 2 * transformShift;
+const uint32_t trSize = 1 << log2TrSize;
+int max = X265_MAX(0, (2 * transformShift + 1));
+
+for (int y = 0; y < MLS_CG_SIZE; y++)
+{
+for (int x = 0; x < MLS_CG_SIZE; x++)
+{
+int64_t signCoef = m_resiDctCoeff[blkPos + x];/* 
pre-quantization DCT coeff */
+int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* 
predicted DCT = source DCT - residual DCT*/
+
+costUncoded[blkPos + x] = static_cast<double>((signCoef * 
signCoef) << scaleBits);
+
+/* when no residual coefficient is coded, predicted coef == recon 
coef */
+costUncoded[blkPos + x] -= static_cast<double>((psyScale * 
(predictedCoef)) >> max);
+
+*totalUncodedCost += costUncoded[blkPos + x];
+*totalRdCost += costUncoded[blkPos + x];
+}
+blkPos += trSize;
+}
+}
+
 namespace X265_NS {
 // x265 private namespace
 
@@ -993,6 +1019,7 @@
 p.dequant_normal = dequant_normal_c;
 p.quant = quant_c;
 p.nquant = nquant_c;
+p.rdoQuant = rdoQuant_c;
 p.dst4x4 = dst4_c;
 p.cu[BLOCK_4x4].dct   = dct4_c;
 p.cu[BLOCK_8x8].dct   = dct8_c;
diff -r b24454f3ff6d -r dffb056e5ad0 source/common/primitives.h
--- a/source/common/primitives.hWed Nov 22 22:00:48 2017 +0530
+++ b/source/common/primitives.hMon Nov 20 14:17:36 2017 +0530
@@ -216,6 +216,7 @@
 
 typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
 typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
+typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, 
double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t 
psyScale, uint32_t blkPos, uint32_t log2TrSize);
 
 /* Function pointers to optimized encoder primitives. Each pointer can 
reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
@@ -304,6 +305,7 @@
 
 quant_t   quant;
 nquant_t  nquant;
+rdoQuant_trdoQuant;
 dequant_scaling_t dequant_scaling;
 dequant_normal_t  dequant_normal;
 denoiseDct_t  denoiseDct;
diff -r b24454f3ff6d -r dffb056e5ad0 source/common/quant.cpp
--- a/source/common/quant.cpp   Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/quant.cpp   Mon Nov 20 14:17:36 2017 +0530
@@ -663,7 +663,7 @@
 #define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift 
+ 1)))
 
 int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
-int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0*/
+double costUncoded[trSize * trSize]; /* d*d + lambda * 0*/
 int64_t costSig[trSize * trSize]; /* lambda * bits   */
 
 int rateIncUp[trSize * trSize];  /* signal overhead of increasing 
level */
@@ -677,12 +677,12 @@
 bool bIsLuma = ttype == TEXT_LUMA;
 
 /* total rate distortion cost of transform block, as CBF=0 */
-int64_t totalUncodedCost = 0;
+double totalUncodedCost = 0;
 
 /* Total rate distortion cost of this transform block, counting te 
distortion of uncoded blocks,
  * the distortion and signal cost of coded blocks, and the coding cost of 
significant
  * coefficient and coefficient group bitmaps */
-int64_t totalRdCost = 0;
+double totalRdCost = 0;
 
 TUEntropyCodingParameters codeParams;
 cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, 
bIsLuma);
@@ -729,24 +729,9 @@
 uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
 uint32_t blkPos  = codeParams.scan[scanPosBase];
 
-// TODO: we can't SIMD optimize because PSYVALUE need 64-bits 
multiplication, convert to Double can work faster by FMA
-for (int y = 0; y < MLS_CG

[x265] [PATCH] encoder.cpp: fix encoder crash for --analysis-reuse-level=10

2017-11-22 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1511338280 -19800
#  Wed Nov 22 13:41:20 2017 +0530
# Branch stable
# Node ID dd11aa99f40a1af59065984afa9b699d2eb1162e
# Parent  752ed1108fce1b475e0458b70f92503d6343818b
encoder.cpp: fix encoder crash for --analysis-reuse-level=10

diff -r 752ed1108fce -r dd11aa99f40a source/encoder/encoder.cpp
--- a/source/encoder/encoder.cppTue Nov 21 09:50:45 2017 +0530
+++ b/source/encoder/encoder.cppWed Nov 22 13:41:20 2017 +0530
@@ -1155,6 +1155,8 @@
 pic_out->analysisData.wt = outFrame->m_analysisData.wt;
 pic_out->analysisData.interData = 
outFrame->m_analysisData.interData;
 pic_out->analysisData.intraData = 
outFrame->m_analysisData.intraData;
+pic_out->analysisData.modeFlag[0] = 
outFrame->m_analysisData.modeFlag[0];
+pic_out->analysisData.modeFlag[1] = 
outFrame->m_analysisData.modeFlag[1];
 if (m_param->bDisableLookahead)
 {
 int factor = 1;
@@ -3067,6 +3069,7 @@
 CHECKED_MALLOC(interData->mvpIdx[dir], uint8_t, 
analysis->numPartitions * analysis->numCUsInFrame);
 CHECKED_MALLOC(interData->refIdx[dir], int8_t, 
analysis->numPartitions * analysis->numCUsInFrame);
 CHECKED_MALLOC(interData->mv[dir], MV, analysis->numPartitions 
* analysis->numCUsInFrame);
+CHECKED_MALLOC(analysis->modeFlag[dir], uint8_t, 
analysis->numPartitions * analysis->numCUsInFrame);
 }
 
 /* Allocate intra in inter */
@@ -3146,7 +3149,11 @@
 
X265_FREE(((analysis_inter_data*)analysis->interData)->mvpIdx[dir]);
 
X265_FREE(((analysis_inter_data*)analysis->interData)->refIdx[dir]);
 
X265_FREE(((analysis_inter_data*)analysis->interData)->mv[dir]);
-X265_FREE(analysis->modeFlag[dir]);
+if (analysis->modeFlag[dir] != NULL)
+{
+X265_FREE(analysis->modeFlag[dir]);
+analysis->modeFlag[dir] = NULL;
+}
 }
 }
 else
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] [PATCH 2 of 2] x86: Change assembler from YASM to NASM

2017-11-21 Thread Praveen Tiwari

Yes, that's true. Looking at future prospects, we have decided to move the
support to NASM. It comes with additional advantages, as Andrey mentioned
above, but we understand the concern about changing assembler support; we will
make the transition as smooth as possible. Thanks.

Regards,
Praveen Tiwari
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] Fwd: [PATCH] intra: sse4 version of strong intra smoothing

2017-11-20 Thread Praveen Tiwari

-- Forwarded message --
From: chen 
Date: Tue, Nov 21, 2017 at 10:07 AM
Subject: Re: [x265] [PATCH] intra: sse4 version of strong intra smoothing
To: Development for x265 


>diff -r a7c2f80c18af -r 973560d58dfb source/common/x86/intrapred8.asm
>--- a/source/common/x86/intrapred8.asm Mon Nov 20 14:31:22 2017 +0530
>+++ b/source/common/x86/intrapred8.asm Tue Nov 21 03:10:14 2017 +0800
>@@ -22313,11 +22313,144 @@
> mov [r1 + 64], r3b  ; LeftLast
> RET
>
>-INIT_XMM sse4
>-cglobal intra_filter_32x32, 2,4,6
>-mov r2b, byte [r0 +  64]; topLast
>-mov r3b, byte [r0 + 128]; LeftLast
>-
>+; this function add strong intra filter
>+

INIT_XMM sse4
>+cglobal intra_filter_32x32, 3,8,7
>+xor r3d, r3d ; R9
>+xor r4d, r4d ; R10
>+mov r3b, byte [r0 +  64] ; topLast
>+mov r4b, byte [r0 + 128] ; LeftLast

xor+mov = movzx; the xor (clear to zero) does not spend a cycle, but it
affects the instruction decode rate

>+
>+; strong intra filter is disabled
>+cmp r2m, byte 0
>+jz  .normal_filter32
>+; decide to do strong intra filter
>+xor r5d, r5d ; R11
>+xor r6d, r6d ; RAX
>+xor r7d, r7d ; RDI
>+mov r5b, byte [r0]   ; topLeft
>+mov r6b, byte [r0 + 96]  ; leftMiddle
>+mov r7b, byte [r0 + 32]  ; topMiddle
>+
>+; threshold = 8
>+mov r2d, r3d ; R8
>+add r2d, r5d ; (topLast + topLeft)
>+shl r7d, 1   ; 2 * topMiddle
>+sub r2d, r7d
(A+B) - 2 * C  <==> (A-C) + (B-C)

>+mov r7d, r2d ; backup r2d
>+sar r7d, 31
>+xor r2d, r7d
>+sub r2d, r7d ; abs(r2d)
>+cmp r2d, 8
; how about this or instruction cdq?
; abs(x-y)
mov eax, X
sub eax, Y
sub Y, X
cmovg eax, Y


>+; bilinearAbove is false
>+jns .normal_filter32
>+
>+mov r2d, r5d
>+add r2d, r4d
>+shl r6d, 1
>+sub r2d, r6d
>+mov r6d, r2d
>+sar r6d, 31
>+xor r2d, r6d
>+sub r2d, r6d
>+cmp r2d, 8
>+; bilinearLeft is false
>+jns .normal_filter32
>+
>+; do strong intra filter shift = 6
>+mov r2d, r5d
>+shl r2d, 6
>+add r2d, 32  ; init
>+mov r6d, r4d
>+sub r6w, r5w ; deltaL size is word
a partial register stall may occur here

>+mov r7d, r3d
>+sub r7w, r5w ; deltaR size is word
>+movdxmm0, r2d
>+

vpbroadcastwxmm0, xmm0
SSE4?
This is an AVX2 instruction, so the
initialization on top (INIT_XMM sse4) is wrong. We generally don't prefix xmm or
ymm; the native names m0, m1 would be better.


>+movaxmm4, xmm0
>+



___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] analysis: use AVC CU analysis-info for HEVC mode analysis

2017-11-17 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1510926794 -19800
#  Fri Nov 17 19:23:14 2017 +0530
# Node ID 6b248ccb14169d2b0d5b84d50d94a153bd8f3b4f
# Parent  9723e8812e63ce51e38ede41f7d5edf73cad0849
analysis: use AVC CU analysis-info for HEVC mode analysis

This patch implements the functionality for analysis-reuse-level 7. Here we
want to use AVC analysis-info for HEVC mode decision and use the depth from
offload for AVC sizes.

diff -r 9723e8812e63 -r 6b248ccb1416 source/common/cudata.cpp
--- a/source/common/cudata.cpp  Fri Nov 17 14:16:31 2017 +0530
+++ b/source/common/cudata.cpp  Fri Nov 17 19:23:14 2017 +0530
@@ -201,6 +201,8 @@
 m_cuDepth= charBuf; charBuf += m_numPartitions;
 m_predMode   = charBuf; charBuf += m_numPartitions; /* the 
order up to here is important in initCTU() and initSubCU() */
 m_partSize   = charBuf; charBuf += m_numPartitions;
+m_skipFlag[0]= charBuf; charBuf += m_numPartitions;
+m_skipFlag[1]= charBuf; charBuf += m_numPartitions;
 m_mergeFlag  = charBuf; charBuf += m_numPartitions;
 m_interDir   = charBuf; charBuf += m_numPartitions;
 m_mvpIdx[0]  = charBuf; charBuf += m_numPartitions;
@@ -239,6 +241,8 @@
 m_cuDepth= charBuf; charBuf += m_numPartitions;
 m_predMode   = charBuf; charBuf += m_numPartitions; /* the 
order up to here is important in initCTU() and initSubCU() */
 m_partSize   = charBuf; charBuf += m_numPartitions;
+m_skipFlag[0]= charBuf; charBuf += m_numPartitions;
+m_skipFlag[1]= charBuf; charBuf += m_numPartitions;
 m_mergeFlag  = charBuf; charBuf += m_numPartitions;
 m_interDir   = charBuf; charBuf += m_numPartitions;
 m_mvpIdx[0]  = charBuf; charBuf += m_numPartitions;
diff -r 9723e8812e63 -r 6b248ccb1416 source/common/cudata.h
--- a/source/common/cudata.hFri Nov 17 14:16:31 2017 +0530
+++ b/source/common/cudata.hFri Nov 17 19:23:14 2017 +0530
@@ -199,13 +199,14 @@
 uint8_t*  m_predMode; // array of prediction modes
 uint8_t*  m_partSize; // array of partition sizes
 uint8_t*  m_mergeFlag;// array of merge flags
+uint8_t*  m_skipFlag[2];
 uint8_t*  m_interDir; // array of inter directions
 uint8_t*  m_mvpIdx[2];// array of motion vector predictor 
candidates or merge candidate indices [0]
 uint8_t*  m_tuDepth;  // array of transform indices
 uint8_t*  m_transformSkip[3]; // array of transform skipping flags per 
plane
 uint8_t*  m_cbf[3];   // array of coded block flags (CBF) per 
plane
 uint8_t*  m_chromaIntraDir;   // array of intra directions (chroma)
-enum { BytesPerPartition = 21 };  // combined sizeof() of all per-part data
+enum { BytesPerPartition = 23 };  // combined sizeof() of all per-part data
 
 sse_t*m_distortion;
 coeff_t*  m_trCoeff[3];   // transformed coefficient buffer per 
plane
diff -r 9723e8812e63 -r 6b248ccb1416 source/common/framedata.h
--- a/source/common/framedata.h Fri Nov 17 14:16:31 2017 +0530
+++ b/source/common/framedata.h Fri Nov 17 19:23:14 2017 +0530
@@ -195,6 +195,7 @@
 uint8_t*mvpIdx[2];
 int8_t* refIdx[2];
 MV* mv[2];
+   int64_t* sadCost;
 };
 
 struct analysis2PassFrameData
diff -r 9723e8812e63 -r 6b248ccb1416 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp   Fri Nov 17 14:16:31 2017 +0530
+++ b/source/encoder/analysis.cpp   Fri Nov 17 19:23:14 2017 +0530
@@ -75,6 +75,10 @@
 m_reuseInterDataCTU = NULL;
 m_reuseRef = NULL;
 m_bHD = false;
+m_modeFlag[0] = false;
+m_modeFlag[1] = false;
+m_checkMergeAndSkipOnly[0] = false;
+m_checkMergeAndSkipOnly[1] = false;
 m_evaluateInter = 0;
 }
 
@@ -247,6 +251,9 @@
 memcpy(ctu.m_cuDepth, >depth[posCTU], 
sizeof(uint8_t) * numPartition);
 memcpy(ctu.m_predMode, >modes[posCTU], 
sizeof(uint8_t) * numPartition);
 memcpy(ctu.m_partSize, >partSize[posCTU], 
sizeof(uint8_t) * numPartition);
+for (int list = 0; list < m_slice->isInterB() + 1; list++)
+memcpy(ctu.m_skipFlag[list], 
_frame->m_analysisData.modeFlag[list][posCTU], sizeof(uint8_t) * 
numPartition);
+
 if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) 
&& !m_param->bMVType)
 {
 analysis_intra_data* intraDataCTU = 
(analysis_intra_data*)m_frame->m_analysisData.intraData;
@@ -1162,7 +1169,11 @@
 PicYuv& reconPic = *m_frame->m_reconPic;
 SplitData splitCUData;
 
-if ((m_param->bMVType && cuGeom.numPartitions > 16) || !m_param->bMVType)
+bool bHEVCBlockAnalysis = (m_param-

[x265] [PATCH] analysis: update analysisReuseLevel 7 for analysis sharing

2017-11-16 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1510561483 -19800
#  Mon Nov 13 13:54:43 2017 +0530
# Node ID 4b6af5ba01f8244aec5862dc5fccc0019c44d0c8
# Parent  2fec493b990ee3066e5ffc853d83669955f0ee3c
analysis: update analysisReuseLevel 7 for analysis sharing

diff -r 2fec493b990e -r 4b6af5ba01f8 doc/reST/cli.rst
--- a/doc/reST/cli.rst  Mon Nov 13 12:20:50 2017 +0530
+++ b/doc/reST/cli.rst  Mon Nov 13 13:54:43 2017 +0530
@@ -887,17 +887,19 @@
 
Note that --analysis-reuse-level must be paired with 
analysis-reuse-mode.
 
-   ++-+
-   | Level  | Description |
-   ++=+
-   | 1  | Lookahead information   |
-   ++-+
-   | 2 to 4 | Level 1 + intra/inter modes, ref's  |
-   ++-+
-   | 5 to 9 | Level 2 + rect-amp  |
-   ++-+
-   | 10 | Level 5 + Full CU analysis-info |
-   ++-+
++--+--+
+| Level| Description  |
++==+==+
+| 1| Lookahead information|
++--+--+
+| 2 to 4   | Level 1 + intra/inter modes, ref's   |
++--+--+
+| 5,6,8 and 9  | Level 2 + rect-amp   |
++--+--+
+| 7| Level 5 + AVC size CU refinement |
++--+--+
+| 10   | Level 5 + Full CU analysis-info  |
++--+--+
 
 .. option:: --refine-mv-type 
 
diff -r 2fec493b990e -r 4b6af5ba01f8 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp   Mon Nov 13 12:20:50 2017 +0530
+++ b/source/encoder/analysis.cpp   Mon Nov 13 13:54:43 2017 +0530
@@ -2301,7 +2301,7 @@
 for (uint32_t part = 0; part < numPU; part++)
 {
 PredictionUnit pu(mode.cu, cuGeom, part);
-if (m_param->analysisReuseLevel == 10)
+if (m_param->analysisReuseLevel >= 7)
 {
 analysis_inter_data* interDataCTU = 
(analysis_inter_data*)m_frame->m_analysisData.interData;
 int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) 
+ cuGeom.absPartIdx;
@@ -2407,7 +2407,7 @@
 if (m_slice->m_pps->bUseDQP && nextDepth <= 
m_slice->m_pps->maxCuDQPDepth)
 nextQP = setLambdaFromQP(parentCTU, 
calculateQpforCuSize(parentCTU, childGeom));
 
-int lamdaQP = m_param->analysisReuseLevel == 10 ? nextQP : lqp;
+int lamdaQP = (m_param->analysisReuseLevel >= 7) ? nextQP : 
lqp;
 
 if (split)
 m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, 
childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
diff -r 2fec493b990e -r 4b6af5ba01f8 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cppMon Nov 13 12:20:50 2017 +0530
+++ b/source/encoder/encoder.cppMon Nov 13 13:54:43 2017 +0530
@@ -3036,7 +3036,7 @@
 CHECKED_MALLOC(interData->mergeFlag, uint8_t, 
analysis->numPartitions * analysis->numCUsInFrame);
 }
 
-if (m_param->analysisReuseLevel == 10)
+if (m_param->analysisReuseLevel >= 7)
 {
 CHECKED_MALLOC(interData->interDir, uint8_t, 
analysis->numPartitions * analysis->numCUsInFrame);
 for (int dir = 0; dir < numDir; dir++)
@@ -3113,7 +3113,7 @@
 
X265_FREE(((analysis_inter_data*)analysis->interData)->mergeFlag);
 
X265_FREE(((analysis_inter_data*)analysis->interData)->partSize);
 }
-if (m_param->analysisReuseLevel == 10)
+if (m_param->analysisReuseLevel >= 7)
 {
 
X265_FREE(((analysis_inter_data*)analysis->interData)->interDir);
 int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] API: 'x265_set_analysis_data' to capture analysis information

2017-11-15 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1510555850 -19800
#  Mon Nov 13 12:20:50 2017 +0530
# Node ID 2fec493b990ee3066e5ffc853d83669955f0ee3c
# Parent  dd9772385d152528201d335efbc6f75fdc43b08c
API: 'x265_set_analysis_data' to capture analysis information

diff -r dd9772385d15 -r 2fec493b990e doc/reST/api.rst
--- a/doc/reST/api.rst  Tue Nov 14 11:00:09 2017 +0530
+++ b/doc/reST/api.rst  Mon Nov 13 12:20:50 2017 +0530
@@ -215,6 +215,13 @@
 *the encoder will wait for this copy to complete if enabled.
 */
 
+**x265_set_analysis_data()** may be used to receive analysis information from an 
external application::
+
+/* x265_set_analysis_data:
+ * set the analysis data,
+ * returns negative on error, 0 access unit were output. */
+ int x265_set_analysis_data(x265_encoder *encoder, x265_analysis_data 
*analysis_data, int poc, uint32_t cuBytes);
+
 Pictures
 
 
diff -r dd9772385d15 -r 2fec493b990e source/CMakeLists.txt
--- a/source/CMakeLists.txt Tue Nov 14 11:00:09 2017 +0530
+++ b/source/CMakeLists.txt Mon Nov 13 12:20:50 2017 +0530
@@ -29,7 +29,7 @@
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 145)
+set(X265_BUILD 146)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
"${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
diff -r dd9772385d15 -r 2fec493b990e source/encoder/api.cpp
--- a/source/encoder/api.cppTue Nov 14 11:00:09 2017 +0530
+++ b/source/encoder/api.cppMon Nov 13 12:20:50 2017 +0530
@@ -365,6 +365,18 @@
 return encoder->getRefFrameList((PicYuv**)l0, (PicYuv**)l1, sliceType, 
poc);
 }
 
+int x265_set_analysis_data(x265_encoder *enc, x265_analysis_data 
*analysis_data, int poc, uint32_t cuBytes)
+{
+if (!enc)
+return -1;
+
+Encoder *encoder = static_cast<Encoder*>(enc);
+if (!encoder->setAnalysisData(analysis_data, poc, cuBytes))
+return 0;
+
+return -1;
+}
+
 void x265_cleanup(void)
 {
 BitCost::destroy();
@@ -444,6 +456,7 @@
 _csvlog_frame,
 _csvlog_encode,
 _dither_image,
+_set_analysis_data
 };
 
 typedef const x265_api* (*api_get_func)(int bitDepth);
diff -r dd9772385d15 -r 2fec493b990e source/encoder/encoder.cpp
--- a/source/encoder/encoder.cppTue Nov 14 11:00:09 2017 +0530
+++ b/source/encoder/encoder.cppMon Nov 13 12:20:50 2017 +0530
@@ -574,6 +574,88 @@
 return 0;
 }
 
+int Encoder::setAnalysisData(x265_analysis_data *analysis_data, int poc, 
uint32_t cuBytes)
+{
+uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> 
m_param->maxLog2CUSize;
+uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> 
m_param->maxLog2CUSize;
+
+Frame* curFrame = m_dpb->m_picList.getPOC(poc);
+if (curFrame != NULL)
+{
+curFrame->m_analysisData = (*analysis_data);
+curFrame->m_analysisData.numCUsInFrame = widthInCU * heightInCU;
+curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
+allocAnalysis(>m_analysisData);
+if (m_param->maxCUSize == 16)
+{
+if (analysis_data->sliceType == X265_TYPE_IDR || 
analysis_data->sliceType == X265_TYPE_I)
+{
+curFrame->m_analysisData.sliceType = X265_TYPE_I;
+if (m_param->analysisReuseLevel < 2)
+return -1;
+
+curFrame->m_analysisData.numPartitions = 
m_param->num4x4Partitions;
+size_t count = 0;
+analysis_intra_data * currIntraData = (analysis_intra_data 
*)curFrame->m_analysisData.intraData;
+analysis_intra_data * intraData = (analysis_intra_data 
*)analysis_data->intraData;
+for (uint32_t d = 0; d < cuBytes; d++)
+{
+int bytes = curFrame->m_analysisData.numPartitions >> 
((intraData)->depth[d] * 2);
+memset(&(currIntraData)->depth[count], 
(intraData)->depth[d], bytes);
+memset(&(currIntraData)->chromaModes[count], 
(intraData)->chromaModes[d], bytes);
+memset(&(currIntraData)->partSizes[count], 
(intraData)->partSizes[d], bytes);
+memset(&(currIntraData)->partSizes[count], 
(intraData)->partSizes[d], bytes);
+count += bytes;
+}
+memcpy(&(currIntraData)->modes, (intraData)->modes, 
curFrame->m_analysisData.numPartitions * analysis_data->numCUsInFrame);
+}
+else
+{
+uint

[x265] [PATCH] analysis: update analysisReuseLevel 7 for analysis sharing

2017-11-13 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1510561483 -19800
#  Mon Nov 13 13:54:43 2017 +0530
# Node ID 02f21c4eafe13b52966f3fc1f925bb34070d647d
# Parent  65eff30eb192d836b52edb5c3e2e6deae06dabf7
analysis: update analysisReuseLevel 7 for analysis sharing

diff -r 65eff30eb192 -r 02f21c4eafe1 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp   Mon Nov 13 12:20:50 2017 +0530
+++ b/source/encoder/analysis.cpp   Mon Nov 13 13:54:43 2017 +0530
@@ -2301,7 +2301,7 @@
 for (uint32_t part = 0; part < numPU; part++)
 {
 PredictionUnit pu(mode.cu, cuGeom, part);
-if (m_param->analysisReuseLevel == 10)
+if (m_param->analysisReuseLevel >= 7)
 {
 analysis_inter_data* interDataCTU = 
(analysis_inter_data*)m_frame->m_analysisData.interData;
 int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) 
+ cuGeom.absPartIdx;
@@ -2407,7 +2407,7 @@
 if (m_slice->m_pps->bUseDQP && nextDepth <= 
m_slice->m_pps->maxCuDQPDepth)
 nextQP = setLambdaFromQP(parentCTU, 
calculateQpforCuSize(parentCTU, childGeom));
 
-int lamdaQP = m_param->analysisReuseLevel == 10 ? nextQP : lqp;
+int lamdaQP = m_param->analysisReuseLevel >= 7 ? nextQP : lqp;
 
 if (split)
 m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, 
childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
diff -r 65eff30eb192 -r 02f21c4eafe1 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cppMon Nov 13 12:20:50 2017 +0530
+++ b/source/encoder/encoder.cppMon Nov 13 13:54:43 2017 +0530
@@ -3038,7 +3038,7 @@
 CHECKED_MALLOC(interData->mergeFlag, uint8_t, 
analysis->numPartitions * analysis->numCUsInFrame);
 }
 
-if (m_param->analysisReuseLevel == 10)
+if (m_param->analysisReuseLevel >= 7)
 {
 CHECKED_MALLOC(interData->interDir, uint8_t, 
analysis->numPartitions * analysis->numCUsInFrame);
 for (int dir = 0; dir < numDir; dir++)
@@ -3115,7 +3115,7 @@
 
X265_FREE(((analysis_inter_data*)analysis->interData)->mergeFlag);
 
X265_FREE(((analysis_inter_data*)analysis->interData)->partSize);
 }
-if (m_param->analysisReuseLevel == 10)
+if (m_param->analysisReuseLevel >= 7)
 {
 
X265_FREE(((analysis_inter_data*)analysis->interData)->interDir);
 int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] API: 'x265_set_analysis_data' to capture analysis information

2017-11-13 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1510555850 -19800
#  Mon Nov 13 12:20:50 2017 +0530
# Node ID 65eff30eb192d836b52edb5c3e2e6deae06dabf7
# Parent  563cbe1f4a21dcfe2117ccaa874b713d94434f92
API: 'x265_set_analysis_data' to capture analysis information

diff -r 563cbe1f4a21 -r 65eff30eb192 source/CMakeLists.txt
--- a/source/CMakeLists.txt Wed Nov 08 17:08:18 2017 +0530
+++ b/source/CMakeLists.txt Mon Nov 13 12:20:50 2017 +0530
@@ -29,7 +29,7 @@
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 143)
+set(X265_BUILD 144)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
"${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
diff -r 563cbe1f4a21 -r 65eff30eb192 source/encoder/api.cpp
--- a/source/encoder/api.cppWed Nov 08 17:08:18 2017 +0530
+++ b/source/encoder/api.cppMon Nov 13 12:20:50 2017 +0530
@@ -365,6 +365,18 @@
 return encoder->getRefFrameList((PicYuv**)l0, (PicYuv**)l1, sliceType, 
poc);
 }
 
+int x265_set_analysis_data(x265_encoder *enc, x265_analysis_data 
*analysis_data, int poc, uint32_t cuBytes)
+{
+if (!enc)
+return -1;
+
+Encoder *encoder = static_cast<Encoder*>(enc);
+if (!encoder->setAnalysisData(analysis_data, poc, cuBytes))
+return 0;
+
+return -1;
+}
+
 void x265_cleanup(void)
 {
 BitCost::destroy();
@@ -444,6 +456,7 @@
 _csvlog_frame,
 _csvlog_encode,
 _dither_image,
+_set_analysis_data
 };
 
 typedef const x265_api* (*api_get_func)(int bitDepth);
diff -r 563cbe1f4a21 -r 65eff30eb192 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cppWed Nov 08 17:08:18 2017 +0530
+++ b/source/encoder/encoder.cppMon Nov 13 12:20:50 2017 +0530
@@ -576,6 +576,88 @@
 return 0;
 }
 
+int Encoder::setAnalysisData(x265_analysis_data *analysis_data, int poc, 
uint32_t cuBytes)
+{
+uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> 
m_param->maxLog2CUSize;
+uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> 
m_param->maxLog2CUSize;
+
+Frame* curFrame = m_dpb->m_picList.getPOC(poc);
+if (curFrame != NULL)
+{
+curFrame->m_analysisData = (*analysis_data);
+curFrame->m_analysisData.numCUsInFrame = widthInCU * heightInCU;
+curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
+allocAnalysis(>m_analysisData);
+if (m_param->maxCUSize == 16)
+{
+if (analysis_data->sliceType == X265_TYPE_IDR || 
analysis_data->sliceType == X265_TYPE_I)
+{
+curFrame->m_analysisData.sliceType = X265_TYPE_I;
+if (m_param->analysisReuseLevel < 2)
+return -1;
+
+curFrame->m_analysisData.numPartitions = 
m_param->num4x4Partitions;
+size_t count = 0;
+analysis_intra_data * currIntraData = (analysis_intra_data 
*)curFrame->m_analysisData.intraData;
+analysis_intra_data * intraData = (analysis_intra_data 
*)analysis_data->intraData;
+for (uint32_t d = 0; d < cuBytes; d++)
+{
+int bytes = curFrame->m_analysisData.numPartitions >> 
((intraData)->depth[d] * 2);
+memset(&(currIntraData)->depth[count], 
(intraData)->depth[d], bytes);
+memset(&(currIntraData)->chromaModes[count], 
(intraData)->chromaModes[d], bytes);
+memset(&(currIntraData)->partSizes[count], 
(intraData)->partSizes[d], bytes);
+memset(&(currIntraData)->partSizes[count], 
(intraData)->partSizes[d], bytes);
+count += bytes;
+}
+memcpy(&(currIntraData)->modes, (intraData)->modes, 
curFrame->m_analysisData.numPartitions * analysis_data->numCUsInFrame);
+}
+else
+{
+uint32_t numDir = analysis_data->sliceType == X265_TYPE_P ? 1 
: 2;
+if (m_param->analysisReuseLevel < 2)
+return -1;
+
+curFrame->m_analysisData.numPartitions = 
m_param->num4x4Partitions;
+size_t count = 0;
+analysis_inter_data * currInterData = (analysis_inter_data 
*)curFrame->m_analysisData.interData;
+analysis_inter_data * interData = (analysis_inter_data 
*)analysis_data->interData;
+for (uint32_t d = 0; d < cuBytes; d++)
+{
+int bytes = curFrame->m_analysi

[x265] [PATCH] analysis: update analysisReuseLevel 7 for analysis sharing

2017-11-13 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1510561483 -19800
#  Mon Nov 13 13:54:43 2017 +0530
# Node ID 67f2dd6203ff8a1e2271ef0ac052ac5f1ad99110
# Parent  5ea4fdbdea99a8bdd91d0d7961bcf50764d445b6
analysis: update analysisReuseLevel 7 for analysis sharing

diff -r 5ea4fdbdea99 -r 67f2dd6203ff source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp   Mon Nov 13 12:20:50 2017 +0530
+++ b/source/encoder/analysis.cpp   Mon Nov 13 13:54:43 2017 +0530
@@ -2301,7 +2301,7 @@
 for (uint32_t part = 0; part < numPU; part++)
 {
 PredictionUnit pu(mode.cu, cuGeom, part);
-if (m_param->analysisReuseLevel == 10)
+if (m_param->analysisReuseLevel >= 7)
 {
 analysis_inter_data* interDataCTU = 
(analysis_inter_data*)m_frame->m_analysisData.interData;
 int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) 
+ cuGeom.absPartIdx;
@@ -2407,7 +2407,7 @@
 if (m_slice->m_pps->bUseDQP && nextDepth <= 
m_slice->m_pps->maxCuDQPDepth)
 nextQP = setLambdaFromQP(parentCTU, 
calculateQpforCuSize(parentCTU, childGeom));
 
-int lamdaQP = m_param->analysisReuseLevel == 10 ? nextQP : lqp;
+int lamdaQP = m_param->analysisReuseLevel >= 7 ? nextQP : lqp;
 
 if (split)
 m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, 
childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
diff -r 5ea4fdbdea99 -r 67f2dd6203ff source/encoder/encoder.cpp
--- a/source/encoder/encoder.cppMon Nov 13 12:20:50 2017 +0530
+++ b/source/encoder/encoder.cppMon Nov 13 13:54:43 2017 +0530
@@ -3035,7 +3035,7 @@
 CHECKED_MALLOC(interData->mergeFlag, uint8_t, 
analysis->numPartitions * analysis->numCUsInFrame);
 }
 
-if (m_param->analysisReuseLevel == 10)
+if (m_param->analysisReuseLevel >= 7)
 {
 CHECKED_MALLOC(interData->interDir, uint8_t, 
analysis->numPartitions * analysis->numCUsInFrame);
 for (int dir = 0; dir < numDir; dir++)
@@ -3112,7 +3112,7 @@
 
X265_FREE(((analysis_inter_data*)analysis->interData)->mergeFlag);
 
X265_FREE(((analysis_inter_data*)analysis->interData)->partSize);
 }
-if (m_param->analysisReuseLevel == 10)
+if (m_param->analysisReuseLevel >= 7)
 {
 
X265_FREE(((analysis_inter_data*)analysis->interData)->interDir);
 int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] API: 'x265_set_analysis_data' to capture analysis information

2017-11-12 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1510555850 -19800
#  Mon Nov 13 12:20:50 2017 +0530
# Node ID 5ea4fdbdea99a8bdd91d0d7961bcf50764d445b6
# Parent  563cbe1f4a21dcfe2117ccaa874b713d94434f92
API: 'x265_set_analysis_data' to capture analysis information

diff -r 563cbe1f4a21 -r 5ea4fdbdea99 source/CMakeLists.txt
--- a/source/CMakeLists.txt Wed Nov 08 17:08:18 2017 +0530
+++ b/source/CMakeLists.txt Mon Nov 13 12:20:50 2017 +0530
@@ -29,7 +29,7 @@
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 143)
+set(X265_BUILD 144)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
"${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
diff -r 563cbe1f4a21 -r 5ea4fdbdea99 source/encoder/api.cpp
--- a/source/encoder/api.cppWed Nov 08 17:08:18 2017 +0530
+++ b/source/encoder/api.cppMon Nov 13 12:20:50 2017 +0530
@@ -365,6 +365,18 @@
 return encoder->getRefFrameList((PicYuv**)l0, (PicYuv**)l1, sliceType, 
poc);
 }
 
+int x265_set_analysis_data(x265_encoder *enc, x265_analysis_data 
*analysis_data, int poc, uint32_t cuBytes)
+{
+if (!enc)
+return -1;
+
+Encoder *encoder = static_cast<Encoder*>(enc);
+if (!encoder->setAnalysisData(analysis_data, poc, cuBytes))
+return 0;
+
+return -1;
+}
+
 void x265_cleanup(void)
 {
 BitCost::destroy();
@@ -444,6 +456,7 @@
 _csvlog_frame,
 _csvlog_encode,
 _dither_image,
+_set_analysis_data
 };
 
 typedef const x265_api* (*api_get_func)(int bitDepth);
diff -r 563cbe1f4a21 -r 5ea4fdbdea99 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cppWed Nov 08 17:08:18 2017 +0530
+++ b/source/encoder/encoder.cppMon Nov 13 12:20:50 2017 +0530
@@ -576,6 +576,85 @@
 return 0;
 }
 
+int Encoder::setAnalysisData(x265_analysis_data *analysis_data, int poc, 
uint32_t cuBytes)
+{
+uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> 
m_param->maxLog2CUSize;
+uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> 
m_param->maxLog2CUSize;
+
+Frame* curFrame = m_dpb->m_picList.getPOC(poc);
+if (curFrame != NULL)
+{
+curFrame->m_analysisData = (*analysis_data);
+curFrame->m_analysisData.numCUsInFrame = widthInCU * heightInCU;
+curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
+allocAnalysis(>m_analysisData);
+if (m_param->maxCUSize == 16)
+{
+if (analysis_data->sliceType == X265_TYPE_IDR || 
analysis_data->sliceType == X265_TYPE_I)
+{
+curFrame->m_analysisData.sliceType = X265_TYPE_I;
+if (m_param->analysisReuseLevel < 2)
+return -1;
+
+curFrame->m_analysisData.numPartitions = 
m_param->num4x4Partitions;
+size_t count = 0;
+for (uint32_t d = 0; d < cuBytes; d++)
+{
+int bytes = curFrame->m_analysisData.numPartitions >> 
(((analysis_intra_data *)analysis_data->intraData)->depth[d] * 2);
+memset(&((analysis_intra_data 
*)curFrame->m_analysisData.intraData)->depth[count], ((analysis_intra_data 
*)analysis_data->intraData)->depth[d], bytes);
+memset(&((analysis_intra_data 
*)curFrame->m_analysisData.intraData)->chromaModes[count], 
((analysis_intra_data *)analysis_data->intraData)->chromaModes[d], bytes);
+memset(&((analysis_intra_data 
*)curFrame->m_analysisData.intraData)->partSizes[count], ((analysis_intra_data 
*)analysis_data->intraData)->partSizes[d], bytes);
+memset(&((analysis_intra_data 
*)curFrame->m_analysisData.intraData)->partSizes[count], ((analysis_intra_data 
*)analysis_data->intraData)->partSizes[d], bytes);
+count += bytes;
+}
+memcpy(&((analysis_intra_data 
*)curFrame->m_analysisData.intraData)->modes, ((analysis_intra_data 
*)analysis_data->intraData)->modes, curFrame->m_analysisData.numPartitions * 
analysis_data->numCUsInFrame);
+}
+else
+{
+uint32_t numDir = analysis_data->sliceType == X265_TYPE_P ? 1 
: 2;
+if (m_param->analysisReuseLevel < 2)
+return -1;
+
+curFrame->m_analysisData.numPartitions = 
m_param->num4x4Partitions;
+size_t count = 0;
+for (uint32_t d = 0; d < cuBytes; d++)
+{
+int bytes =

[x265] [PATCH] api: 'x265_get_ref_frame_list' to get forward and backward refrence list

2017-11-03 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1509446629 -19800
#  Tue Oct 31 16:13:49 2017 +0530
# Node ID 6ad93877ffe19cd6cf285f0cc8189f41dce606b8
# Parent  de91aae2db5353e4e548d002e2dce530a6c8078d
api: 'x265_get_ref_frame_list' to get forward and backward refrence list

diff -r de91aae2db53 -r 6ad93877ffe1 doc/reST/api.rst
--- a/doc/reST/api.rst  Tue Oct 31 13:57:37 2017 +0530
+++ b/doc/reST/api.rst  Tue Oct 31 16:13:49 2017 +0530
@@ -201,6 +201,13 @@
  * This API must be called after(poc >= lookaheadDepth + bframes + 2) 
condition check. */
  int x265_get_slicetype_poc_and_scenecut(x265_encoder *encoder, int 
*slicetype, int *poc, int* sceneCut);
 
+**x265_get_ref_frame_list()** may be used to fetch the forward and backward 
reference lists::
+
+/* x265_get_ref_frame_list:
+ * returns negative on error, 0 when access unit were output.
+ * This API must be called after(poc >= lookaheadDepth + bframes + 2) 
condition check */
+ int x265_get_ref_frame_list(x265_encoder *encoder, x265_picyuv**, 
x265_picyuv**, int, int);
+ 
 **x265_encoder_ctu_info**
/* x265_encoder_ctu_info:
 *Copy CTU information such as ctu address and ctu partition 
structure of all
diff -r de91aae2db53 -r 6ad93877ffe1 source/CMakeLists.txt
--- a/source/CMakeLists.txt Tue Oct 31 13:57:37 2017 +0530
+++ b/source/CMakeLists.txt Tue Oct 31 16:13:49 2017 +0530
@@ -29,7 +29,7 @@
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 137)
+set(X265_BUILD 138)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
"${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
diff -r de91aae2db53 -r 6ad93877ffe1 source/common/frame.h
--- a/source/common/frame.h Tue Oct 31 13:57:37 2017 +0530
+++ b/source/common/frame.h Tue Oct 31 16:13:49 2017 +0530
@@ -98,6 +98,7 @@
 
 float* m_quantOffsets;   // points to quantOffsets in 
x265_picture
 x265_sei   m_userSEI;
+Event  m_reconEncoded;
 
 /* Frame Parallelism - notification between FrameEncoders of available 
motion reference rows */
 ThreadSafeInteger* m_reconRowFlag;   // flag of CTU rows 
completely reconstructed and extended for motion reference
diff -r de91aae2db53 -r 6ad93877ffe1 source/common/picyuv.h
--- a/source/common/picyuv.hTue Oct 31 13:57:37 2017 +0530
+++ b/source/common/picyuv.hTue Oct 31 16:13:49 2017 +0530
@@ -27,6 +27,7 @@
 #include "common.h"
 #include "md5.h"
 #include "x265.h"
+struct x265_picyuv {};
 
 namespace X265_NS {
 // private namespace
@@ -34,7 +35,7 @@
 class ShortYuv;
 struct SPS;
 
-class PicYuv
+class PicYuv : public x265_picyuv
 {
 public:
 
diff -r de91aae2db53 -r 6ad93877ffe1 source/encoder/api.cpp
--- a/source/encoder/api.cppTue Oct 31 13:57:37 2017 +0530
+++ b/source/encoder/api.cppTue Oct 31 16:13:49 2017 +0530
@@ -350,6 +350,15 @@
 return -1;
 }
 
+int x265_get_ref_frame_list(x265_encoder *enc, x265_picyuv** l0, x265_picyuv** 
l1, int sliceType, int poc)
+{
+if (!enc)
+return -1;
+
+Encoder *encoder = static_cast<Encoder*>(enc);
+return encoder->getRefFrameList((PicYuv**)l0, (PicYuv**)l1, sliceType, 
poc);
+}
+
 void x265_cleanup(void)
 {
 BitCost::destroy();
@@ -424,6 +433,7 @@
 _encoder_intra_refresh,
 _encoder_ctu_info,
 _get_slicetype_poc_and_scenecut,
+_get_ref_frame_list,
 };
 
 typedef const x265_api* (*api_get_func)(int bitDepth);
diff -r de91aae2db53 -r 6ad93877ffe1 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cppTue Oct 31 13:57:37 2017 +0530
+++ b/source/encoder/encoder.cppTue Oct 31 16:13:49 2017 +0530
@@ -446,6 +446,47 @@
 return 0;
 }
 
+int Encoder::getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc)
+{
+if (!(IS_X265_TYPE_I(sliceType)))
+{
+Frame *framePtr = m_dpb->m_picList.getPOC(poc);
+if (framePtr != NULL)
+{
+for (int j = 0; j < framePtr->m_encData->m_slice->m_numRefIdx[0]; 
j++)// check only for --ref=n number of frames.
+{
+if (framePtr->m_encData->m_slice->m_refFrameList[0][j] && 
framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_reconPic != NULL)
+{
+int l0POC = 
framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_poc;
+Frame* l0Fp = m_dpb->m_picList.getPOC(l0POC);
+if (l0Fp->m_reconPic->m_picOrg[0] == NULL)
+l0Fp->m_reconEncoded.wait(); /* If recon is not ready, 
current frame encoder need to wait. */
+

[x265] [PATCH] api: 'x265_get_slicetype_poc_and_scenecut' to fetch slicetype, poc and scenecut information

2017-11-03 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1509438457 -19800
#  Tue Oct 31 13:57:37 2017 +0530
# Node ID de91aae2db5353e4e548d002e2dce530a6c8078d
# Parent  6a310b24c6a2d831ef08bbda1bdcf9d929daa308
api: 'x265_get_slicetype_poc_and_scenecut' to fetch slicetype, poc and scenecut 
information

diff -r 6a310b24c6a2 -r de91aae2db53 doc/reST/api.rst
--- a/doc/reST/api.rst  Thu Nov 02 12:17:29 2017 +0530
+++ b/doc/reST/api.rst  Tue Oct 31 13:57:37 2017 +0530
@@ -192,6 +192,15 @@
 *  presets is not recommended without a more fine-grained 
breakdown of
 *  parameters to take this into account. */
int x265_encoder_reconfig(x265_encoder *, x265_param *);
+
+**x265_get_slicetype_poc_and_scenecut()** may be used to fetch slice type, poc 
and scene cut information mid-encode::
+
+/* x265_get_slicetype_poc_and_scenecut:
+ * get the slice type, poc and scene cut information for the current 
frame,
+ * returns negative on error, 0 on success.
+ * This API must be called after(poc >= lookaheadDepth + bframes + 2) 
condition check. */
+ int x265_get_slicetype_poc_and_scenecut(x265_encoder *encoder, int 
*slicetype, int *poc, int* sceneCut);
+
 **x265_encoder_ctu_info**
/* x265_encoder_ctu_info:
 *Copy CTU information such as ctu address and ctu partition 
structure of all
diff -r 6a310b24c6a2 -r de91aae2db53 source/CMakeLists.txt
--- a/source/CMakeLists.txt Thu Nov 02 12:17:29 2017 +0530
+++ b/source/CMakeLists.txt Tue Oct 31 13:57:37 2017 +0530
@@ -29,7 +29,7 @@
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 136)
+set(X265_BUILD 137)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
"${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
diff -r 6a310b24c6a2 -r de91aae2db53 source/common/piclist.cpp
--- a/source/common/piclist.cpp Thu Nov 02 12:17:29 2017 +0530
+++ b/source/common/piclist.cpp Tue Oct 31 13:57:37 2017 +0530
@@ -117,6 +117,15 @@
 return NULL;
 }
 
+Frame* PicList::getCurFrame(void)
+{
+Frame *curFrame = m_start;
+if (curFrame != NULL)
+return curFrame;
+else
+return NULL;
+}
+
 void PicList::remove(Frame& curFrame)
 {
 #if _DEBUG
diff -r 6a310b24c6a2 -r de91aae2db53 source/common/piclist.h
--- a/source/common/piclist.h   Thu Nov 02 12:17:29 2017 +0530
+++ b/source/common/piclist.h   Tue Oct 31 13:57:37 2017 +0530
@@ -62,6 +62,9 @@
 /** Find frame with specified POC */
 Frame* getPOC(int poc);
 
+/** Get the current Frame from the list **/
+Frame* getCurFrame(void);
+
 /** Remove picture from list */
 void remove(Frame& pic);
 
diff -r 6a310b24c6a2 -r de91aae2db53 source/encoder/api.cpp
--- a/source/encoder/api.cppThu Nov 02 12:17:29 2017 +0530
+++ b/source/encoder/api.cppTue Oct 31 13:57:37 2017 +0530
@@ -340,6 +340,16 @@
 return 0;
 }
 
+int x265_get_slicetype_poc_and_scenecut(x265_encoder *enc, int *slicetype, int 
*poc, int *sceneCut)
+{
+if (!enc)
+return -1;
+Encoder *encoder = static_cast<Encoder*>(enc);
+if (!encoder->copySlicetypePocAndSceneCut(slicetype, poc, sceneCut))
+return 0;
+return -1;
+}
+
 void x265_cleanup(void)
 {
 BitCost::destroy();
@@ -413,6 +423,7 @@
 sizeof(x265_frame_stats),
 _encoder_intra_refresh,
 _encoder_ctu_info,
+_get_slicetype_poc_and_scenecut,
 };
 
 typedef const x265_api* (*api_get_func)(int bitDepth);
diff -r 6a310b24c6a2 -r de91aae2db53 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cppThu Nov 02 12:17:29 2017 +0530
+++ b/source/encoder/encoder.cppTue Oct 31 13:57:37 2017 +0530
@@ -429,6 +429,23 @@
 }
 }
 
+int Encoder::copySlicetypePocAndSceneCut(int *slicetype, int *poc, int 
*sceneCut)
+{
+Frame *FramePtr = m_dpb->m_picList.getCurFrame();
+if (FramePtr != NULL)
+{
+*slicetype = FramePtr->m_lowres.sliceType;
+*poc = FramePtr->m_encData->m_slice->m_poc;
+*sceneCut = FramePtr->m_lowres.bScenecut;
+}
+else
+{
+x265_log(NULL, X265_LOG_WARNING, "Frame is still in lookahead 
pipeline, this API must be called after (poc >= lookaheadDepth + bframes + 2) 
condition check\n");
+return -1;
+}
+return 0;
+}
+
 void Encoder::destroy()
 {
 #if ENABLE_HDR10_PLUS
diff -r 6a310b24c6a2 -r de91aae2db53 source/encoder/encoder.h
--- a/source/encoder/encoder.h  Thu Nov 02 12:17:29 2017 +0530
+++ b/source/encoder/encoder.h  Tue Oct 31 13:57:37 2017 +0530
@@ -205,6 +205,8 @@
 
 void copyCtuInfo(x265_ctu_info_t** frameCtuInfo, int poc);
 
+int copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *scene

[x265] [PATCH] avx2: integral_init4h -> added 'INTEGRAL_FOUR_HORIZONTAL_4' macro to reduce data movement for '4' element case

2017-06-14 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1497422024 -19800
#  Wed Jun 14 12:03:44 2017 +0530
# Node ID 65e038ecbbf63a2f449ccf52358c5fbbec408b27
# Parent  28bbc349d17035a3c1fcdfbdca3b8e21ae6b669b
avx2: integral_init4h -> added 'INTEGRAL_FOUR_HORIZONTAL_4' macro to reduce 
data movement for '4' element case

diff -r 28bbc349d170 -r 65e038ecbbf6 source/common/x86/seaintegral.asm
--- a/source/common/x86/seaintegral.asm Wed Jun 07 17:06:57 2017 +0530
+++ b/source/common/x86/seaintegral.asm Wed Jun 14 12:03:44 2017 +0530
@@ -148,11 +148,6 @@
 jnz .loop
 RET
 
-;-
-;static void integral_init4h_c(uint32_t *sum, pixel *pix, intptr_t stride)
-;-
-INIT_YMM avx2
-
 %macro INTEGRAL_FOUR_HORIZONTAL_16 0
 pmovzxbw   m0, [r1]
 pmovzxbw   m1, [r1 + 1]
@@ -163,6 +158,24 @@
 paddw  m0, m1
 %endmacro
 
+%macro INTEGRAL_FOUR_HORIZONTAL_4 0
+movd   xm0, [r1]
+movd   xm1, [r1 + 1]
+pmovzxbw   xm0, xm0
+pmovzxbw   xm1, xm1
+paddw  xm0, xm1
+movd   xm1, [r1 + 2]
+pmovzxbw   xm1, xm1
+paddw  xm0, xm1
+movd   xm1, [r1 + 3]
+pmovzxbw   xm1, xm1
+paddw  xm0, xm1
+%endmacro
+
+;-
+;static void integral_init4h(uint32_t *sum, pixel *pix, intptr_t stride)
+;-
+INIT_YMM avx2
 cglobal integral4h, 3, 5, 3
 lear3, [4 * r2]
 subr0, r3
@@ -205,7 +218,7 @@
 jmp .end
 
 .loop_4:
-INTEGRAL_FOUR_HORIZONTAL_16
+INTEGRAL_FOUR_HORIZONTAL_4
 pmovzxwd   xm0, xm0
 movu   xm1, [r0]
 paddd  xm0, xm1
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] avx2: integral_init4h -> added 'INTEGRAL_FOUR_HORIZONTAL_4' macro to reduce data movement for '4' element case

2017-06-13 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1497417160 -19800
#  Wed Jun 14 10:42:40 2017 +0530
# Node ID f6f920ab5be6e00b9c32ac225959fc6b9f68d36b
# Parent  28bbc349d17035a3c1fcdfbdca3b8e21ae6b669b
avx2: integral_init4h -> added 'INTEGRAL_FOUR_HORIZONTAL_4' macro to reduce 
data movement for '4' element case

diff -r 28bbc349d170 -r f6f920ab5be6 source/common/x86/seaintegral.asm
--- a/source/common/x86/seaintegral.asm Wed Jun 07 17:06:57 2017 +0530
+++ b/source/common/x86/seaintegral.asm Wed Jun 14 10:42:40 2017 +0530
@@ -148,11 +148,6 @@
 jnz .loop
 RET
 
-;-
-;static void integral_init4h_c(uint32_t *sum, pixel *pix, intptr_t stride)
-;-
-INIT_YMM avx2
-
 %macro INTEGRAL_FOUR_HORIZONTAL_16 0
 pmovzxbw   m0, [r1]
 pmovzxbw   m1, [r1 + 1]
@@ -163,6 +158,23 @@
 paddw  m0, m1
 %endmacro
 
+%macro INTEGRAL_FOUR_HORIZONTAL_4 0
+movd   xm0, [r1]
+movd   xm1, [r1 + 1]
+pmovzxbw   xm0, xm0
+pmovzxbw   xm1, xm1
+paddw  xm0, xm1
+movd   xm1, [r1 + 2]
+pmovzxbw   xm1, xm1
+paddw  xm0, xm1
+movd   xm1, [r1 + 3]
+paddw  xm0, xm1
+%endmacro
+
+;-
+;static void integral_init4h(uint32_t *sum, pixel *pix, intptr_t stride)
+;-
+INIT_YMM avx2
 cglobal integral4h, 3, 5, 3
 lear3, [4 * r2]
 subr0, r3
@@ -205,7 +217,7 @@
 jmp .end
 
 .loop_4:
-INTEGRAL_FOUR_HORIZONTAL_16
+INTEGRAL_FOUR_HORIZONTAL_4
 pmovzxwd   xm0, xm0
 movu   xm1, [r0]
 paddd  xm0, xm1
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] avx2: 'integral4v' asm code -> 7.48x faster than 'C' version

2017-05-07 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1493905428 -19800
#  Thu May 04 19:13:48 2017 +0530
# Node ID 41611825c2f4661536500e1306db7d8c4bf7fd07
# Parent  48502979a4b21f6982dcdacbf7796bf5d9fb395c
avx2: 'integral4v' asm code -> 7.48x faster than 'C' version

   integral_init4v  7.48x    202.53    1515.14

diff -r 48502979a4b2 -r 41611825c2f4 source/common/x86/seaintegral.asm
--- a/source/common/x86/seaintegral.asm Wed May 03 11:26:26 2017 +0530
+++ b/source/common/x86/seaintegral.asm Thu May 04 19:13:48 2017 +0530
@@ -32,8 +32,19 @@
 ;void integral_init4v_c(uint32_t *sum4, intptr_t stride)
 ;-
 INIT_YMM avx2
-cglobal integral4v, 2, 2, 0
- 
+cglobal integral4v, 2, 3, 2
+mov r2, r1
+shl r2, 4
+
+.loop
+movum0, [r0]
+movum1, [r0 + r2]
+psubd   m1, m0
+movu[r0], m1
+add r0, 32
+sub r1, 8
+cmp r1, 0
+jnz .loop
 RET
 
 ;-
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] Fwd: [PATCH 3 of 3] SEA motion search:integralv functions avx2 implementation

2017-05-02 Thread Praveen Tiwari

-- Forwarded message --
From: 
Date: Tue, May 2, 2017 at 3:16 PM
Subject: [x265] [PATCH 3 of 3] SEA motion search:integralv functions avx2
implementation
To: x265-devel@videolan.org


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1493121121 -19800
#  Tue Apr 25 17:22:01 2017 +0530
# Node ID e5ee88d08fcedee83efa63869a5a346c711a0e3d
# Parent  1afc127e62b4502c8f052ee989843c64b45ffc56
SEA motion search:integralv functions avx2 implementation

diff -r 1afc127e62b4 -r e5ee88d08fce source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt  Fri Apr 28 11:22:29 2017 +0530
+++ b/source/common/CMakeLists.txt  Tue Apr 25 17:22:01 2017 +0530
@@ -57,10 +57,10 @@
 set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES})
 source_group(Intrinsics FILES ${VEC_PRIMITIVES})

-set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h
dct8.h loopfilter.h)
+set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h
dct8.h loopfilter.h seaintegral.h)
 set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm
mc-a2.asm pixel-util8.asm blockcopy8.asm
-   pixeladd8.asm dct8.asm)
+   pixeladd8.asm dct8.asm seaintegral.asm)
 if(HIGH_BIT_DEPTH)
 set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm
loopfilter.asm)
 else()
diff -r 1afc127e62b4 -r e5ee88d08fce source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Apr 28 11:22:29 2017
+0530
+++ b/source/common/x86/asm-primitives.cpp  Tue Apr 25 17:22:01 2017
+0530
@@ -2158,6 +2158,13 @@
 p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
 p.fix8Pack = PFX(cutree_fix8_pack_avx2);

+p.integral_init4v = PFX(integral4v_avx2);
+p.integral_init8v = PFX(integral8v_avx2);
+p.integral_init12v = PFX(integral12v_avx2);
+p.integral_init16v = PFX(integral16v_avx2);
+p.integral_init24v = PFX(integral24v_avx2);
+p.integral_init32v = PFX(integral32v_avx2);
+
 /* TODO: This kernel needs to be modified to work with
HIGH_BIT_DEPTH only
 p.planeClipAndMax = PFX(planeClipAndMax_avx2); */

@@ -2178,6 +2185,7 @@
 p.costCoeffNxN = PFX(costCoeffNxN_avx2_bmi2);
 }
 }
+
 }
 #else // if HIGH_BIT_DEPTH

@@ -3696,6 +3704,13 @@
 p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
 p.fix8Pack = PFX(cutree_fix8_pack_avx2);

+p.integral_init4v = PFX(integral4v_avx2);
+p.integral_init8v = PFX(integral8v_avx2);
+p.integral_init12v = PFX(integral12v_avx2);
+p.integral_init16v = PFX(integral16v_avx2);
+p.integral_init24v = PFX(integral24v_avx2);
+p.integral_init32v = PFX(integral32v_avx2);
+
 }
 #endif
 }
diff -r 1afc127e62b4 -r e5ee88d08fce source/common/x86/seaintegral.asm
--- /dev/null   Thu Jan 01 00:00:00 1970 +
+++ b/source/common/x86/seaintegral.asm Tue Apr 25 17:22:01 2017 +0530
@@ -0,0 +1,155 @@
+;**
***
+;* Copyright (C) 2013-2017 MulticoreWare, Inc
+;*
+;* Authors: Jayashri Murugan 
+;*  Vignesh V Menon 
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111,
USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at license @ x265.com.
+;**
***/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION .text
+
+;--
---
+;void integral_init4v_c(uint32_t *sum4, intptr_t stride)
+;--
---
+INIT_YMM avx2
+cglobal integral4v, 2, 4, 2
+
+mov r2, 0

Using xor (e.g. `xor r2, r2`) would be a faster way of clearing a register.


+mov r3, r1

What are the possible values of stride here? Is it an arbitrary number or a
multiple of a specific number?


+shl r3, 4
+
+.loop:
+movum0, [r0]
+movum1, [r0 + r3]
+psubd   m0, m1, m0
+movu[r0], m0
+add r2, 8
+add r0, 32
+cmp r2, r1
+jl  .loop
+RET
+

[x265] Fwd: [PATCH 2 of 3] SEA motion search:Add testbench for integralv functions

2017-05-02 Thread Praveen Tiwari

-- Forwarded message --
From: 
Date: 2017-05-02 15:16 GMT+05:30
Subject: [x265] [PATCH 2 of 3] SEA motion search:Add testbench for
integralv functions
To: x265-devel@videolan.org


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1493358749 -19800
#  Fri Apr 28 11:22:29 2017 +0530
# Node ID 1afc127e62b4502c8f052ee989843c64b45ffc56
# Parent  cb67dffd0e2a596c8d3c6d042b8e6c532487d427
SEA motion search:Add testbench for integralv functions

diff -r cb67dffd0e2a -r 1afc127e62b4 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp  Tue May 02 09:58:13 2017 +0530
+++ b/source/test/pixelharness.cpp  Fri Apr 28 11:22:29 2017 +0530
@@ -2003,6 +2003,228 @@
 return true;
 }

+bool PixelHarness::check_integral_init4v(integral4v_t ref, integral4v_t
opt)
+{
+intptr_t srcStep = 64;
+int j = 0;
>>
+uint32_t sum_ans[BUFFSIZE] = { 0 };
>>
+uint32_t sum_ans1[BUFFSIZE] = { 0 };

Please use better names; check the existing naming conventions.

+
+for (int i = 0; i < 64; i++)
+{
+sum_ans[i] = pixel_test_buff[0][i];
+sum_ans1[i] = pixel_test_buff[0][i];
+}
+for (int i = 0, k = 0; i < BUFFSIZE; i++)
+{
+if (i % 64 == 0)
+k++;
+sum_ans[i] = sum_ans[i % 64] + k;
+sum_ans1[i] = sum_ans1[i % 64] + k;
+}
+int padx = 4;
+int pady = 4;
+uint32_t *sum_ans_ptr = sum_ans + srcStep * pady + padx;
+uint32_t *sum_ans1_ptr = sum_ans1 + srcStep * pady + padx;
+for (int i = 0; i < ITERS; i++)
+{
+ref(sum_ans_ptr, srcStep);
+checked(opt, sum_ans1_ptr, srcStep);
+
+if (memcmp(sum_ans, sum_ans1, sizeof(uint32_t) * BUFFSIZE))
+return false;
+
+reportfail()
+j += INCR;
+}
+return true;
+}
+
+bool PixelHarness::check_integral_init8v(integral8v_t ref, integral8v_t
opt)
+ {
+intptr_t srcStep = 64;
+int j = 0;
+uint32_t sum_ans[BUFFSIZE] = { 0 };
+uint32_t sum_ans1[BUFFSIZE] = { 0 };
+
+for (int i = 0; i < 64; i++)
+{
+sum_ans[i] = pixel_test_buff[0][i];
+sum_ans1[i] = pixel_test_buff[0][i];
+}
+for (int i = 0, k = 0; i < BUFFSIZE; i++)
+{
+if (i % 64 == 0)
+k++;
+sum_ans[i] = sum_ans[i % 64] + k;
+sum_ans1[i] = sum_ans1[i % 64] + k;
+}
+int padx = 4;
+int pady = 4;
+uint32_t *sum_ans_ptr = sum_ans + srcStep * pady + padx;
+uint32_t *sum_ans1_ptr = sum_ans1 + srcStep * pady + padx;
+for (int i = 0; i < ITERS; i++)
+{
+ref(sum_ans_ptr, srcStep);
+checked(opt, sum_ans1_ptr, srcStep);
+
+if (memcmp(sum_ans, sum_ans1, sizeof(uint32_t) * BUFFSIZE))
+return false;
+
+reportfail()
+j += INCR;
+}
+return true;
+}
+
+bool PixelHarness::check_integral_init12v(integral12v_t ref, integral12v_t
opt)
+ {
+intptr_t srcStep = 64;
+int j = 0;
+uint32_t sum_ans[BUFFSIZE] = { 0 };
+uint32_t sum_ans1[BUFFSIZE] = { 0 };
+
+for (int i = 0; i < 64; i++)
+{
+sum_ans[i] = pixel_test_buff[0][i];
+sum_ans1[i] = pixel_test_buff[0][i];
+}
+for (int i = 0, k = 0; i < BUFFSIZE; i++)
+{
+if (i % 64 == 0)
+k++;
+sum_ans[i] = sum_ans[i % 64] + k;
+sum_ans1[i] = sum_ans1[i % 64] + k;
+}
+int padx = 4;
+int pady = 4;
+uint32_t *sum_ans_ptr = sum_ans + srcStep * pady + padx;
+uint32_t *sum_ans1_ptr = sum_ans1 + srcStep * pady + padx;
+for (int i = 0; i < ITERS; i++)
+{
+ref(sum_ans_ptr, srcStep);
+checked(opt, sum_ans1_ptr, srcStep);
+
+if (memcmp(sum_ans, sum_ans1, sizeof(uint32_t) * BUFFSIZE))
+return false;
+
+reportfail()
+j += INCR;
+}
+return true;
+}
+
+bool PixelHarness::check_integral_init16v(integral16v_t ref, integral16v_t
opt)
+{
+intptr_t srcStep = 64;
+int j = 0;
+uint32_t sum_ans[BUFFSIZE] = { 0 };
+uint32_t sum_ans1[BUFFSIZE] = { 0 };
+
+for (int i = 0; i < 64; i++)
+{
+sum_ans[i] = pixel_test_buff[0][i];
+sum_ans1[i] = pixel_test_buff[0][i];
+}
+for (int i = 0, k = 0; i < BUFFSIZE; i++)
+{
+if (i % 64 == 0)
+k++;
+sum_ans[i] = sum_ans[i % 64] + k;
+sum_ans1[i] = sum_ans1[i % 64] + k;
+}
+int padx = 4;
+int pady = 4;
+uint32_t *sum_ans_ptr = sum_ans + srcStep * pady + padx;
+uint32_t *sum_ans1_ptr = sum_ans1 + srcStep * pady + padx;
+for (int i = 0; i < ITERS; i++)
+{
+ref(sum_ans_ptr, srcStep);
+checked(opt, sum_ans1_ptr, srcStep);
+
+if (memcmp(sum_ans, sum_ans1, sizeof(uint32_t) * BUFFSIZE))
+return false;
+
+reportfail()
+j += INCR;
+}
+return true;
+}
+
+bool PixelHarness::check_integral_init24v(integral24v_t ref, integral24v_t
opt)
+{
+intptr_t srcStep = 64;
+int j = 0;
+uint32_t

[x265] Fwd: [PATCH 1 of 3] SEA motion search:Setup asm primitives for integral calculation

2017-05-02 Thread Praveen Tiwari

-- Forwarded message --
From: 
Date: Tue, May 2, 2017 at 3:16 PM
Subject: [x265] [PATCH 1 of 3] SEA motion search:Setup asm primitives for
integral calculation
To: x265-devel@videolan.org


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1493699293 -19800
#  Tue May 02 09:58:13 2017 +0530
# Node ID cb67dffd0e2a596c8d3c6d042b8e6c532487d427
# Parent  5bc5e73760cdb61d2674e74cc52149fa0603af8a
SEA motion search:Setup asm primitives for integral calculation

diff -r 5bc5e73760cd -r cb67dffd0e2a source/common/primitives.cpp
--- a/source/common/primitives.cpp  Sat Apr 22 17:00:28 2017 -0700
+++ b/source/common/primitives.cpp  Tue May 02 09:58:13 2017 +0530
@@ -57,6 +57,7 @@
 void setupIntraPrimitives_c(EncoderPrimitives );
 void setupLoopFilterPrimitives_c(EncoderPrimitives );
 void setupSaoPrimitives_c(EncoderPrimitives );
+void setupSeaIntegralPrimitives_c(EncoderPrimitives );

 void setupCPrimitives(EncoderPrimitives )
 {
@@ -66,6 +67,7 @@
 setupIntraPrimitives_c(p);  // intrapred.cpp
 setupLoopFilterPrimitives_c(p); // loopfilter.cpp
 setupSaoPrimitives_c(p);// sao.cpp
+setupSeaIntegralPrimitives_c(p);  // framefilter.cpp
 }

 void setupAliasPrimitives(EncoderPrimitives )
diff -r 5bc5e73760cd -r cb67dffd0e2a source/common/primitives.h
--- a/source/common/primitives.hSat Apr 22 17:00:28 2017 -0700
+++ b/source/common/primitives.hTue May 02 09:58:13 2017 +0530
@@ -202,6 +202,18 @@

 typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep,
intptr_t offset, int32_t tcP, int32_t tcQ);
 typedef void (*pelFilterChroma_t)(pixel* src, intptr_t srcStep, intptr_t
offset, int32_t tc, int32_t maskP, int32_t maskQ);
>>
+

typedef void(*integral4h_t)(uint32_t *sum, pixel *pix, intptr_t stride);
>>
+typedef void(*integral8h_t)(uint32_t *sum, pixel *pix, intptr_t stride);
>>
+typedef void(*integral12h_t)(uint32_t *sum, pixel *pix, intptr_t stride);
>>
+typedef void(*integral16h_t)(uint32_t *sum, pixel *pix, intptr_t stride);
>>
+typedef void(*integral24h_t)(uint32_t *sum, pixel *pix, intptr_t stride);
>>
+typedef void(*integral32h_t)(uint32_t *sum, pixel *pix, intptr_t stride);
>>
+


typedef void(*integral4v_t)(uint32_t *sum, intptr_t stride);
>>
+typedef void(*integral8v_t)(uint32_t *sum, intptr_t stride);
>>
+typedef void(*integral12v_t)(uint32_t *sum, intptr_t stride);
>>
+typedef void(*integral16v_t)(uint32_t *sum, intptr_t stride);
>>
+typedef void(*integral24v_t)(uint32_t *sum, intptr_t stride);
>>
+typedef void(*integral32v_t)(uint32_t *sum, intptr_t stride);

Only two typedefs are needed here, one for horizontal and one for vertical;
the rest of the typedefs are redundant.

 /* Function pointers to optimized encoder primitives. Each pointer can
reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function
*/
@@ -342,6 +354,19 @@
 pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0,
EDGE_HOR = 1
 pelFilterChroma_t pelFilterChroma[2]; // EDGE_VER = 0,
EDGE_HOR = 1

>>
+integral4h_tintegral_init4h;
>>
+integral8h_tintegral_init8h;
>>
+integral12h_tintegral_init12h;
>>
+integral16h_tintegral_init16h;
>>
+integral24h_tintegral_init24h;
>>
+integral32h_tintegral_init32h;
>>
+integral4v_tintegral_init4v;
>>
+integral8v_tintegral_init8v;
>>
+integral12v_tintegral_init12v;
>>
+integral16v_tintegral_init16v;
>>
+integral24v_tintegral_init24v;
>>
+integral32v_tintegral_init32v;
>>
+

Use an array of appropriate size for horizontal and another for vertical.


 /* There is one set of chroma primitives per color space. An encoder
will
  * have just a single color space and thus it will only ever use one
entry
  * in this array. However we always fill all entries in the array in
case
diff -r 5bc5e73760cd -r cb67dffd0e2a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Sat Apr 22 17:00:28 2017
-0700
+++ b/source/common/x86/asm-primitives.cpp  Tue May 02 09:58:13 2017
+0530
@@ -114,6 +114,7 @@
 #include "blockcopy8.h"
 #include "intrapred.h"
 #include "dct8.h"
+#include "seaintegral.h"
 }

 #define ALL_LUMA_CU_TYPED(prim, fncdef, fname, cpu) \
diff -r 5bc5e73760cd -r cb67dffd0e2a source/common/x86/seaintegral.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +
+++ b/source/common/x86/seaintegral.h   Tue May 02 09:58:13 2017 +0530
@@ -0,0 +1,41 @@
+/**
***
+* Copyright (C) 2013-2017 MulticoreWare, Inc
+*
+* Authors: Vignesh V Menon 
+*  Jayashri Murugan 
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the

Re: [x265] Interested in fast popcnt substitute below SSE4.2?

2017-03-01 Thread Praveen Tiwari

Hi Mario,

Sorry for the late reply; you have shared interesting and useful
information. Currently we are doing some experimental refactoring of the
ASM code base, so it might take some time. Hoping to receive more posts like
this.

Regards,
Praveen Tiwari

On Wed, Mar 1, 2017 at 8:21 PM, Mario *LigH* Rohkrämer <cont...@ligh.de>
wrote:

> Apparently not interesting...
>
>
>
> Am 23.02.2017, 10:05 Uhr, schrieb Mario *LigH* Rohkrämer <cont...@ligh.de
> >:
>
> Another point of view on this matter:
>>
>> http://danluu.com/assembly-intrinsics/
>>
>> Seems to relativate the impact.
>>
>> I don't know if you already knew about all this before...
>>
>>
>> Am 22.02.2017, 13:39 Uhr, schrieb Mario *LigH* Rohkrämer <cont...@ligh.de
>> >:
>>
>> http://wm.ite.pl/articles/sse-popcount.html
>>>
>>> May even be faster than the popcnt instruction implemented in a
>>> supporting CPU!
>>>
>>> Found via a German "conspiracy news" blog (no, that's not at all meant
>>> seriously) which sometimes also mentions computer security issues and
>>> interesting programming challenges: https://blog.fefe.de/?ts=a653b91f
>>>
>>>
>>
>>
>
> --
>
> Fun and success!
> Mario *LigH* Rohkrämer
> mailto:cont...@ligh.de
>
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] Threadpool-fix: correctly detect 'mac' os

2017-02-06 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1486449333 -19800
#  Tue Feb 07 12:05:33 2017 +0530
# Node ID 816af3e011a6194ca62bd24f03b514feffc3493c
# Parent  20141129537b00b09d66f50082059a91b0b7e7f1
Threadpool-fix: correctly detect 'mac' os

diff -r 20141129537b -r 816af3e011a6 source/CMakeLists.txt
--- a/source/CMakeLists.txt Fri Feb 03 17:30:27 2017 +0530
+++ b/source/CMakeLists.txt Tue Feb 07 12:05:33 2017 +0530
@@ -122,7 +122,7 @@
   set(XCODE 1)
 endif()
 if(APPLE)
-  add_definitions(-DMACOS)
+  add_definitions(-DMACOS=1)
 endif()
 
 if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
diff -r 20141129537b -r 816af3e011a6 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Fri Feb 03 17:30:27 2017 +0530
+++ b/source/common/threadpool.cpp  Tue Feb 07 12:05:33 2017 +0530
@@ -57,7 +57,10 @@
 
 #endif
 
-#if MACOS
+/* TODO FIX: Macro __MACH__ ideally should be part of MACOS definition, but 
adding to Cmake
+   behaving is not as expected, need to fix this. */
+
+#if MACOS && __MACH__
 #include 
 #include 
 #endif
@@ -617,7 +620,7 @@
 return sysconf(_SC_NPROCESSORS_CONF);
 #elif __unix__
 return sysconf(_SC_NPROCESSORS_ONLN);
-#elif MACOS
+#elif MACOS && __MACH__
 int nm[2];
 size_t len = 4;
 uint32_t count;
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH 1 of 9] pcs: update design to have 'm_achivedFps' for every PCS Instance

2016-11-17 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1479128885 -19800
#  Mon Nov 14 18:38:05 2016 +0530
# Branch stable
# Node ID 8defd4e7b2e4875247e4ec95e0dd3b9630983526
# Parent  bdf273f9521784ceeda868222d415303a0bcf58b
pcs: update design to have 'm_achivedFps' for every PCS Instance

diff -r bdf273f95217 -r 8defd4e7b2e4 source/api-uhdkit.cpp
--- a/source/api-uhdkit.cpp Tue Nov 08 14:20:24 2016 +0530
+++ b/source/api-uhdkit.cpp Mon Nov 14 18:38:05 2016 +0530
@@ -206,8 +206,6 @@
 return -1;
 if (numEncoded > 0)
 {
-uhdkitEnc->m_achievedFps = numEncoded * 100.0 / 
(double)(endTime - startTime);
-uhdkitEnc->m_achievedFps = uhdkitEnc->m_achievedFps / 
uhdkitEnc->m_param->gops; // Achieved fps for each gop encoder
 uhdkitEnc->m_encodedFrameCount += numEncoded;
 controllerIndex = ((uhdkitEnc->m_encodedFrameCount - 1) / 
uhdkitEnc->m_param->x265Param->keyframeMax) % uhdkitEnc->m_param->gops;
 X265_CHECK(controllerIndex >= 0 && controllerIndex < 
uhdkitEnc->m_param->gops, "Invalid controllerIndex: %d, must be between 0 and 
%d\n", controllerIndex, uhdkitEnc->m_param->gops);
diff -r bdf273f95217 -r 8defd4e7b2e4 source/pcs/api-pcs.cpp
--- a/source/pcs/api-pcs.cppTue Nov 08 14:20:24 2016 +0530
+++ b/source/pcs/api-pcs.cppMon Nov 14 18:38:05 2016 +0530
@@ -211,6 +211,7 @@
 m_pcsParam->statusPrintInterval  = param->statusPrintInterval;
 m_curTimeStamp = m_lastTimeStamp = X265_NS::x265_mdate();
 m_framesWindow = 1;
+m_achievedFps = 0.0;
 m_outFrameCountOfLastAccumulatorReset = 0;
 time(_lastStatusOutputTime);
 
@@ -289,11 +290,11 @@
 int64_t elapsedEncTime = m_curTimeStamp - m_lastTimeStamp;
 int controllerIndex = ((uhdkitEnc->m_encodedFrameCount - 1) / 
uhdkitEnc->m_param->x265Param->keyframeMax) % uhdkitEnc->m_param->gops;
 X265_CHECK(controllerIndex >= 0 && controllerIndex < 
uhdkitEnc->m_param->gops, "Invalid controllerIndex: %d, must be between 0 and 
%d\n", controllerIndex, uhdkitEnc->m_param->gops);
-if (((m_bScenecut == 1) && elapsedEncTime > 0) || elapsedEncTime >= 
30 || uhdkitEnc->m_achievedFps < m_pcsParam->fpsSetPoint)
+if (((m_bScenecut == 1) && elapsedEncTime > 0) || elapsedEncTime >= 
30 || m_achievedFps < m_pcsParam->fpsSetPoint)
 {
 // Don't allow outrageously high frame rate measurements to skew 
the controller.
-uhdkitEnc->m_achievedFps = X265_MIN(uhdkitEnc->m_achievedFps, 4 * 
m_pcsParam->fpsSetPoint);
-error = (m_pcsParam->fpsSetPoint - uhdkitEnc->m_achievedFps) / 
m_pcsParam->fpsSetPoint;
+m_achievedFps = X265_MIN(m_achievedFps, 4 * 
m_pcsParam->fpsSetPoint);
+error = (m_pcsParam->fpsSetPoint - m_achievedFps) / 
m_pcsParam->fpsSetPoint;
 
 if (m_pcsParam->integralReset > 0)
 {
@@ -308,7 +309,7 @@
 {
 double lowerBound = (m_pcsParam->fpsSetPoint * 
SATURATION_RANGE_MIN) / 100.0;   /* Lower bound, 3% of set-point */
 double upperBound = (m_pcsParam->fpsSetPoint * 
SATURATION_RANGE_MAX) / 100.0;   /* Upper bound, 10% of set-point */
-double fpsDiff = (uhdkitEnc->m_achievedFps - 
m_pcsParam->fpsSetPoint);
+double fpsDiff =(m_achievedFps - m_pcsParam->fpsSetPoint);
 resetErrorAccumulater = (fpsDiff >= lowerBound && fpsDiff <= 
upperBound) || m_bScenecut; /* Steady state, or scenecut */
 }
 
@@ -388,7 +389,7 @@
 m_outFrameCountOfLastAccumulatorReset = uhdkitEnc->m_encodedFrameCount;
 m_lastTimeStamp = m_curTimeStamp;
 if (uhdkitEnc->m_reconfigParam->logLevel == UHDKIT_LOG_INFO)
-
uhdkit_pcs_printStatus(>m_reconfigParam[controllerIndex], 
uhdkitEnc->m_achievedFps);
+
uhdkit_pcs_printStatus(>m_reconfigParam[controllerIndex], 
m_achievedFps);
 }
 return true;
 }
@@ -398,6 +399,11 @@
 m_bScenecut = pic->frameData.bScenecut;
 }
 
+void pcs::uhdkit_pcs_update_fps(int64_t startTime, int64_t endTime, int 
numEncoded)
+{
+m_achievedFps = numEncoded * 100.0 / (double)(endTime - startTime);
+}
+
 int pcs::uhdkit_pcs_getControlParamValue(const x265_param *param, int index)
 {
 int controlParamValue[NUM_CONTROLLER] = { param->bEnableFastIntra, 
param->bEnableEarlySkip, param->bEnableRectInter,
diff -r bdf273f95217 -r 8defd4e7b2e4 source/pcs/pcs.h
--- a/source/pcs/pcs.h  Tue Nov 08 14:20:24 2016 +0530
+++ b/source/pcs/pcs.h  Mon Nov 14 18:38:05 2016 +0530
@@ -32,6 +32,7 @@
 /* variables handled by the PCS Instance */
 pcs_param*  m_pcsParam;

Re: [x265] [PATCH 1 of 9] pcs: update design to have 'm_achivedFps' for every PCS Instance

2016-11-17 Thread Praveen Tiwari

Please, ignore this patch. Thanks.


On Thu, Nov 17, 2016 at 8:51 PM, <prav...@multicorewareinc.com> wrote:

> # HG changeset patch
> # User Praveen Tiwari <prav...@multicorewareinc.com>
> # Date 1479128885 -19800
> #  Mon Nov 14 18:38:05 2016 +0530
> # Branch stable
> # Node ID 8defd4e7b2e4875247e4ec95e0dd3b9630983526
> # Parent  bdf273f9521784ceeda868222d415303a0bcf58b
> pcs: update design to have 'm_achivedFps' for every PCS Instance
>
> diff -r bdf273f95217 -r 8defd4e7b2e4 source/api-uhdkit.cpp
> --- a/source/api-uhdkit.cpp Tue Nov 08 14:20:24 2016 +0530
> +++ b/source/api-uhdkit.cpp Mon Nov 14 18:38:05 2016 +0530
> @@ -206,8 +206,6 @@
>  return -1;
>  if (numEncoded > 0)
>  {
> -uhdkitEnc->m_achievedFps = numEncoded * 100.0 /
> (double)(endTime - startTime);
> -uhdkitEnc->m_achievedFps = uhdkitEnc->m_achievedFps /
> uhdkitEnc->m_param->gops; // Achieved fps for each gop encoder
>  uhdkitEnc->m_encodedFrameCount += numEncoded;
>  controllerIndex = ((uhdkitEnc->m_encodedFrameCount - 1) /
> uhdkitEnc->m_param->x265Param->keyframeMax) % uhdkitEnc->m_param->gops;
>  X265_CHECK(controllerIndex >= 0 && controllerIndex <
> uhdkitEnc->m_param->gops, "Invalid controllerIndex: %d, must be between 0
> and %d\n", controllerIndex, uhdkitEnc->m_param->gops);
> diff -r bdf273f95217 -r 8defd4e7b2e4 source/pcs/api-pcs.cpp
> --- a/source/pcs/api-pcs.cppTue Nov 08 14:20:24 2016 +0530
> +++ b/source/pcs/api-pcs.cppMon Nov 14 18:38:05 2016 +0530
> @@ -211,6 +211,7 @@
>  m_pcsParam->statusPrintInterval  = param->statusPrintInterval;
>  m_curTimeStamp = m_lastTimeStamp = X265_NS::x265_mdate();
>  m_framesWindow = 1;
> +m_achievedFps = 0.0;
>  m_outFrameCountOfLastAccumulatorReset = 0;
>  time(_lastStatusOutputTime);
>
> @@ -289,11 +290,11 @@
>  int64_t elapsedEncTime = m_curTimeStamp - m_lastTimeStamp;
>  int controllerIndex = ((uhdkitEnc->m_encodedFrameCount - 1) /
> uhdkitEnc->m_param->x265Param->keyframeMax) % uhdkitEnc->m_param->gops;
>  X265_CHECK(controllerIndex >= 0 && controllerIndex <
> uhdkitEnc->m_param->gops, "Invalid controllerIndex: %d, must be between 0
> and %d\n", controllerIndex, uhdkitEnc->m_param->gops);
> -if (((m_bScenecut == 1) && elapsedEncTime > 0) || elapsedEncTime
> >= 30 || uhdkitEnc->m_achievedFps < m_pcsParam->fpsSetPoint)
> +if (((m_bScenecut == 1) && elapsedEncTime > 0) || elapsedEncTime
> >= 30 || m_achievedFps < m_pcsParam->fpsSetPoint)
>  {
>  // Don't allow outrageously high frame rate measurements to
> skew the controller.
> -uhdkitEnc->m_achievedFps = X265_MIN(uhdkitEnc->m_achievedFps,
> 4 * m_pcsParam->fpsSetPoint);
> -error = (m_pcsParam->fpsSetPoint - uhdkitEnc->m_achievedFps)
> / m_pcsParam->fpsSetPoint;
> +m_achievedFps = X265_MIN(m_achievedFps, 4 *
> m_pcsParam->fpsSetPoint);
> +error = (m_pcsParam->fpsSetPoint - m_achievedFps) /
> m_pcsParam->fpsSetPoint;
>
>  if (m_pcsParam->integralReset > 0)
>  {
> @@ -308,7 +309,7 @@
>  {
>  double lowerBound = (m_pcsParam->fpsSetPoint *
> SATURATION_RANGE_MIN) / 100.0;   /* Lower bound, 3% of set-point */
>  double upperBound = (m_pcsParam->fpsSetPoint *
> SATURATION_RANGE_MAX) / 100.0;   /* Upper bound, 10% of set-point */
> -double fpsDiff = (uhdkitEnc->m_achievedFps -
> m_pcsParam->fpsSetPoint);
> +double fpsDiff =(m_achievedFps -
> m_pcsParam->fpsSetPoint);
>  resetErrorAccumulater = (fpsDiff >= lowerBound && fpsDiff
> <= upperBound) || m_bScenecut; /* Steady state, or scenecut */
>  }
>
> @@ -388,7 +389,7 @@
>  m_outFrameCountOfLastAccumulatorReset = uhdkitEnc->m_
> encodedFrameCount;
>  m_lastTimeStamp = m_curTimeStamp;
>  if (uhdkitEnc->m_reconfigParam->logLevel == UHDKIT_LOG_INFO)
> -
> uhdkit_pcs_printStatus(>m_reconfigParam[controllerIndex],
> uhdkitEnc->m_achievedFps);
> +
> uhdkit_pcs_printStatus(>m_reconfigParam[controllerIndex],
> m_achievedFps);
>  }
>  return true;
>  }
> @@ -398,6 +399,11 @@
>  m_bScenecut = pic->frameData.bScenecut;
>  }
>
> +void pcs::uhdkit_pcs_update_fps(int64_t startTime, int64_t endTime, int
&

[x265] [PATCH] encoder.cpp: print reconfigure params for debug purpose

2016-11-17 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1479392046 -19800
#  Thu Nov 17 19:44:06 2016 +0530
# Node ID 64dc12e9aae9acacaaab5f7875d01fb09d4156d6
# Parent  4c1652f3884fba9fab4c589dd057b12e6bf33d5b
encoder.cpp: print reconfigure params for debug purpose

diff -r 4c1652f3884f -r 64dc12e9aae9 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cppTue Nov 15 11:16:04 2016 +0530
+++ b/source/encoder/encoder.cppThu Nov 17 19:44:06 2016 +0530
@@ -2433,10 +2433,10 @@
 x265_param* oldParam = m_param;
 x265_param* newParam = m_latestParam;
 
-x265_log(newParam, X265_LOG_INFO, "Reconfigured param options, input 
Frame: %d\n", m_pocLast + 1);
+x265_log(newParam, X265_LOG_DEBUG, "Reconfigured param options, input 
Frame: %d\n", m_pocLast + 1);
 
 char tmp[40];
-#define TOOLCMP(COND1, COND2, STR)  if (COND1 != COND2) { sprintf(tmp, STR, 
COND1, COND2); x265_log(newParam, X265_LOG_INFO, tmp); }
+#define TOOLCMP(COND1, COND2, STR)  if (COND1 != COND2) { sprintf(tmp, STR, 
COND1, COND2); x265_log(newParam, X265_LOG_DEBUG, tmp); }
 TOOLCMP(oldParam->maxNumReferences, newParam->maxNumReferences, "ref=%d to 
%d\n");
 TOOLCMP(oldParam->bEnableFastIntra, newParam->bEnableFastIntra, 
"fast-intra=%d to %d\n");
 TOOLCMP(oldParam->bEnableEarlySkip, newParam->bEnableEarlySkip, 
"early-skip=%d to %d\n");
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] [PATCH] [multi-lib] Support 8+10+12 bits in single DLL (Workaround)

2016-09-23 Thread Praveen Tiwari

Hi Min,
 Can you please verify this with VC12? I double-checked, and I am
getting different output with this patch. The 8-bit encoded file has the
same size but a different binary (compared using Beyond Compare); for 10-bit
and 12-bit, both the size and the binary are different. I applied your patch,
built once (as for an 8-bit build), collected the outputs for all depths
(8, 10 and 12), and compared them with three separate builds of x265, i.e.
8-bit, 10-bit and 12-bit.

Regards,
Praveen


On Fri, Sep 23, 2016 at 2:47 AM, chen <chenm...@163.com> wrote:

> Hi Praveen,
>
> I tested your command line on my VS2008 build.
> I built the three-bit-depth version and compared it with the single-bit-depth
> versions, but the outputs still match for both 10-bit and 12-bit.
>
> Regards,
> Min
>
> At 2016-09-22 14:39:50,"Praveen Tiwari" <prav...@multicorewareinc.com>
> wrote:
>
> Hi Min,
>
>  After this patch outputs are changing, tested for following command line
> for 10-bit and 12-bit outputs.
>
> --input=NebutaFestival_2560x1600_60_10bit_crop.yuv --input-res=2560x1600
> --fps=60  --numa-pools="NULL" --output-depth=12 --hash=1 -o  NFOut12.hevc
>
>
>
>
> Regards,
> Praveen
>
> On Thu, Sep 15, 2016 at 1:55 AM, chen <chenm...@163.com> wrote:
>
>> From ea50e494473623ed0dbff2907194aaf268dc449a Mon Sep 17 00:00:00 2001
>> From: Min Chen <min.c...@multicorewareinc.com>
>> Date: Wed, 14 Sep 2016 15:23:38 -0500
>> Subject: [PATCH] [multi-lib] Support 8+10+12 bits in single DLL
>> (Workaround)
>>
>> ---
>>  source/CMakeLists.txt |   40 +++-
>>  1 files changed, 39 insertions(+), 1 deletions(-)
>>
>> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
>> index dd19d28..c2c2f7f 100644
>> --- a/source/CMakeLists.txt
>> +++ b/source/CMakeLists.txt
>> @@ -36,6 +36,7 @@ configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
>>  configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
>> "${PROJECT_BINARY_DIR}/x265_config.h")
>>
>> +
>>  SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake"
>> "${CMAKE_MODULE_PATH}")
>>
>>  # System architecture detection
>> @@ -396,6 +397,39 @@ if(WIN32)
>>  endif(WINXP_SUPPORT)
>>  endif()
>>
>> +
>> +if(ENABLE_SHARED AND LINKED_10BIT AND LINKED_12BIT)
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?setParamAspectRatio@x265
>> @@YAXPEAUx265_param@@HH@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?getParamAspectRatio@x265
>> @@YAXPEAUx265_param@@AEAH1@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?general_log_file@x265
>> @@YAXPEBUx265_param@@PEBDH1ZZ\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?general_log@x265
>> @@YAXPEBUx265_param@@PEBDH1ZZ\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def
>> "?x265_api_get_94@x265_10bit@@YAPEBUx265_api@@H@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def
>> "?x265_api_get_94@x265_12bit@@YAPEBUx265_api@@H@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def
>> "?x265_api_query@x265_10bit@@YAPEBUx265_api@@HHPEAH@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def
>> "?x265_api_query@x265_12bit@@YAPEBUx265_api@@HHPEAH@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_mdate@x265
>> @@YA_JXZ\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def
>> "?x265_picturePlaneSize@x265@@YAI@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_ssim2dB@x265
>> @@YANN@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_ssim2dB@x265
>> @@YANN@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_report_simd@x265
>> @@YAXPEAUx265_param@@@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_fopen@x265
>> @@YAPEAU_iobuf@@PEBD0@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_malloc@x265
>> @@YAPEAX_K@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_free@x265
>> @@YAXPEAX@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_atoi@x265
>> @@YAHPEBDAEA_N@Z\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?start@Thread@x265@
>> @QEAA_NXZ\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?stop@Thread@x265@
>> @QEAAXXZ\n")
>> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "??0Thre

Re: [x265] [PATCH] [multi-lib] Support 8+10+12 bits in single DLL (Workaround)

2016-09-22 Thread Praveen Tiwari

Hi Min,

 After this patch the outputs change; I tested the following command line
for 10-bit and 12-bit outputs.

--input=NebutaFestival_2560x1600_60_10bit_crop.yuv --input-res=2560x1600
--fps=60  --numa-pools="NULL" --output-depth=12 --hash=1 -o  NFOut12.hevc




Regards,
Praveen

On Thu, Sep 15, 2016 at 1:55 AM, chen <chenm...@163.com> wrote:

> From ea50e494473623ed0dbff2907194aaf268dc449a Mon Sep 17 00:00:00 2001
> From: Min Chen <min.c...@multicorewareinc.com>
> Date: Wed, 14 Sep 2016 15:23:38 -0500
> Subject: [PATCH] [multi-lib] Support 8+10+12 bits in single DLL
> (Workaround)
>
> ---
>  source/CMakeLists.txt |   40 +++-
>  1 files changed, 39 insertions(+), 1 deletions(-)
>
> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
> index dd19d28..c2c2f7f 100644
> --- a/source/CMakeLists.txt
> +++ b/source/CMakeLists.txt
> @@ -36,6 +36,7 @@ configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
>  configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
> "${PROJECT_BINARY_DIR}/x265_config.h")
>
> +
>  SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake"
> "${CMAKE_MODULE_PATH}")
>
>  # System architecture detection
> @@ -396,6 +397,39 @@ if(WIN32)
>  endif(WINXP_SUPPORT)
>  endif()
>
> +
> +if(ENABLE_SHARED AND LINKED_10BIT AND LINKED_12BIT)
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?setParamAspectRatio@x265
> @@YAXPEAUx265_param@@HH@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?getParamAspectRatio@x265
> @@YAXPEAUx265_param@@AEAH1@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?general_log_file@x265@@
> YAXPEBUx265_param@@PEBDH1ZZ\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?general_log@x265@@
> YAXPEBUx265_param@@PEBDH1ZZ\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def
> "?x265_api_get_94@x265_10bit@@YAPEBUx265_api@@H@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def
> "?x265_api_get_94@x265_12bit@@YAPEBUx265_api@@H@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_api_query@x265_10bit
> @@YAPEBUx265_api@@HHPEAH@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_api_query@x265_12bit
> @@YAPEBUx265_api@@HHPEAH@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_mdate@x265
> @@YA_JXZ\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def
> "?x265_picturePlaneSize@x265@@YAI@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_ssim2dB@x265
> @@YANN@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_ssim2dB@x265
> @@YANN@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_report_simd@x265@@
> YAXPEAUx265_param@@@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_fopen@x265@@YAPEAU_
> iobuf@@PEBD0@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_malloc@x265
> @@YAPEAX_K@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_free@x265
> @@YAXPEAX@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_atoi@x265
> @@YAHPEBDAEA_N@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?start@Thread@x265@
> @QEAA_NXZ\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?stop@Thread@x265@
> @QEAAXXZ\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "??0Thread@x265@@QEAA@XZ
> \n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "??1Thread@x265@@UEAA@XZ
> \n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?g_maxCUDepth@x265
> @@3IA\n")
> +if(WINXP_SUPPORT)
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?cond_init@x265@@
> YAHPEAUConditionVariable@1@@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?cond_wait@x265@@
> YAHPEAUConditionVariable@1@PEAU_RTL_CRITICAL_SECTION@@K@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?cond_destroy@x265@@
> YAXPEAUConditionVariable@1@@Z\n")
> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?cond_broadcast@x265
> @@YAXPEAUConditionVariable@1@@Z\n")
> +endif()
> +endif()
> +
>  include(version) # determine X265_VERSION and X265_LATEST_TAG
>  include_directories(. common encoder "${PROJECT_BINARY_DIR}")
>
> @@ -608,7 +642,11 @@ if(ENABLE_CLI)
>  if(WIN32 OR NOT ENABLE_SHARED OR INTEL_CXX)
>  # The CLI cannot link to the shared library on Windows, it
>  # requires internal APIs not exported fr

[x265] [PATCH] threadpool.cpp: fix default pool param behaviour for window systems, if NULL or "*" or "" (default) x265 will use all available threads on each NUMA node

2016-09-08 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1473370883 25200
#  Thu Sep 08 14:41:23 2016 -0700
# Branch stable
# Node ID 6e301b63952bc77f9e4710b6f46a6409e093d9c7
# Parent  6a9b6a828f791902a02fbf700caee2d3f32129c0
threadpool.cpp: fix default pool param behaviour for window systems, if NULL or 
"*" or "" (default) x265 will use all available threads on each NUMA node

diff -r 6a9b6a828f79 -r 6e301b63952b source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Wed Jul 13 19:24:23 2016 +0530
+++ b/source/common/threadpool.cpp  Thu Sep 08 14:41:23 2016 -0700
@@ -299,8 +299,8 @@
  * For windows because threads can't be allocated to live across sockets
  * changing the default behavior to be per-socket pools -- FIXME */
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
- if (!p->numaPools)
- {
+if (!p->numaPools || (strcmp(p->numaPools, "NULL") == 0 || 
strcmp(p->numaPools, "*") == 0 || strcmp(p->numaPools, "") == 0))
+{
  char poolString[50] = "";
  for (int i = 0; i < numNumaNodes; i++)
  {
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] [PATCH] threadpool.cpp: fix default pool param behaviour, if NULL or “” (default) x265 will use all available threads on each NUMA node

2016-09-08 Thread Praveen Tiwari

Please ignore this patch; this behaviour is not required for Linux systems.
Thanks.

Regards,
Praveen

On Wed, Sep 7, 2016 at 5:19 PM, <prav...@multicorewareinc.com> wrote:

> # HG changeset patch
> # User Praveen Tiwari <prav...@multicorewareinc.com>
> # Date 1473246754 -19800
> #  Wed Sep 07 16:42:34 2016 +0530
> # Node ID 9587a394ba58a2c3a579db5fb3f7531daf49559b
> # Parent  df559450949bd085b0fc5e01332aa8458af2fa43
> threadpool.cpp: fix default pool param behaviour, if NULL or "" (default)
> x265 will use all available threads on each NUMA node
>
> diff -r df559450949b -r 9587a394ba58 source/common/threadpool.cpp
> --- a/source/common/threadpool.cpp  Wed Aug 10 13:26:18 2016 +0530
> +++ b/source/common/threadpool.cpp  Wed Sep 07 16:42:34 2016 +0530
> @@ -330,8 +330,8 @@
>  {
>  for (int j = i; j < numNumaNodes; j++)
>  {
> -threadsPerPool[numNumaNodes] += cpusPerNode[j];
> -nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << j);
> +threadsPerPool[j] += cpusPerNode[j];
> +nodeMaskPerPool[j] |= ((uint64_t)1 << j);
>  }
>  break;
>  }
> @@ -366,8 +366,8 @@
>  {
>  for (int i = 0; i < numNumaNodes; i++)
>  {
> -threadsPerPool[numNumaNodes]  += cpusPerNode[i];
> -nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
> +threadsPerPool[i]  += cpusPerNode[i];
> +nodeMaskPerPool[i] |= ((uint64_t)1 << i);
>  }
>  }
>
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] threadpool.cpp: fix default pool param behaviour, if NULL or “” (default) x265 will use all available threads on each NUMA node

2016-09-07 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1473246754 -19800
#  Wed Sep 07 16:42:34 2016 +0530
# Node ID 9587a394ba58a2c3a579db5fb3f7531daf49559b
# Parent  df559450949bd085b0fc5e01332aa8458af2fa43
threadpool.cpp: fix default pool param behaviour, if NULL or "" (default) x265 
will use all available threads on each NUMA node

diff -r df559450949b -r 9587a394ba58 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Wed Aug 10 13:26:18 2016 +0530
+++ b/source/common/threadpool.cpp  Wed Sep 07 16:42:34 2016 +0530
@@ -330,8 +330,8 @@
 {
 for (int j = i; j < numNumaNodes; j++)
 {
-threadsPerPool[numNumaNodes] += cpusPerNode[j];
-nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << j);
+threadsPerPool[j] += cpusPerNode[j];
+nodeMaskPerPool[j] |= ((uint64_t)1 << j);
 }
 break;
 }
@@ -366,8 +366,8 @@
 {
 for (int i = 0; i < numNumaNodes; i++)
 {
-threadsPerPool[numNumaNodes]  += cpusPerNode[i];
-nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
+threadsPerPool[i]  += cpusPerNode[i];
+nodeMaskPerPool[i] |= ((uint64_t)1 << i);
 }
 }
  
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] asm: replace mova by movu to avoid crashing x265_denoise_dct_sse4() 32-bit version

2016-08-23 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1471952630 -19800
#  Tue Aug 23 17:13:50 2016 +0530
# Node ID 1c646739b6628a3a8e308a22c948a4495a157140
# Parent  49a0d1176aef5bc6330fcfd39b4589616c174f0a
asm: replace mova by movu to avoid crashing x265_denoise_dct_sse4() 32-bit 
version

diff -r 49a0d1176aef -r 1c646739b662 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asmWed Jul 27 21:47:20 2016 +0200
+++ b/source/common/x86/dct8.asmTue Aug 23 17:13:50 2016 +0530
@@ -2112,7 +2112,7 @@
 pxor m5,  m5
 shr  r3d, 3
 .loop:
-mova m0, [r0]
+movu m0, [r0]
 pabswm1, m0
 
 movu m2, [r1]
@@ -2130,7 +2130,7 @@
 pcmpgtw  m4, m1, m5
 pand m1, m4
 psignw   m1, m0
-mova [r0], m1
+movu [r0], m1
 add  r0, 16
 add  r1, 32
 add  r2, 16
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] [PATCH] threadpool: fix warning: ‘int popCount(uint64_t)’ defined but not used [-Wunused-function]

2016-05-30 Thread Praveen Tiwari

As I recall, some NUMA functionality requires at least Windows 7; it is not
supported on earlier versions of Windows.

Regards,
Praveen

On Mon, May 30, 2016 at 6:43 PM, Mateusz <mateu...@poczta.onet.pl> wrote:

> There is a serious bug in threadpool code that prevent working in Windows
> XP/Vista.
> VS 2015 error when compiling for 32-bit Windows XP:
> (ClCompile target) ->
>   I:\vs\x265\source\common\threadpool.cpp(590): error C3861:
> 'GetNumaNodeProcessorMaskEx': identifier not found [I:\vs\x265\ma\
> 8-b\common\common.vcxproj]
>
> Did you see patch https://patches.videolan.org/patch/13495/ (it fixes
> also this warning)?
>
>
> W dniu 2016-05-30 o 14:45, prav...@multicorewareinc.com pisze:
> > # HG changeset patch
> > # User Praveen Tiwari <prav...@multicorewareinc.com>
> > # Date 1464585837 -19800
> > #  Mon May 30 10:53:57 2016 +0530
> > # Node ID b8dbe8d7c09e7fc0b7cce236569fc5df2eb70b1e
> > # Parent  aeade2e8d8688ebffb8455b8948d89d6a72e2c38
> > threadpool: fix warning: ‘int popCount(uint64_t)’ defined but not used
> [-Wunused-function]
> >  static int popCount(uint64_t x)
> >
> > diff -r aeade2e8d868 -r b8dbe8d7c09e source/common/threadpool.cpp
> > --- a/source/common/threadpool.cppThu May 26 16:45:09 2016 +0530
> > +++ b/source/common/threadpool.cppMon May 30 10:53:57 2016 +0530
> > @@ -68,6 +68,7 @@
> >  # define strcasecmp _stricmp
> >  #endif
> >
> > +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
> >  const uint64_t m1 = 0x5555555555555555; //binary: 0101...
> >  const uint64_t m2 = 0x3333333333333333; //binary: 00110011..
> >  const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary:  4 zeros,  4 ones ...
> > @@ -80,6 +81,7 @@
> >  x = (x + (x >> 4)) & m3;
> >  return (x * h01) >> 56;
> >  }
> > +#endif
> >
> >  namespace X265_NS {
> >  // x265 private namespace
> >
> >
> >
> > ___
> > x265-devel mailing list
> > x265-devel@videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
> >
>
>
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] threadpool: fix warning: ‘int popCount(uint64_t)’ defined but not used [-Wunused-function]

2016-05-30 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1464585837 -19800
#  Mon May 30 10:53:57 2016 +0530
# Node ID b8dbe8d7c09e7fc0b7cce236569fc5df2eb70b1e
# Parent  aeade2e8d8688ebffb8455b8948d89d6a72e2c38
threadpool: fix warning: 'int popCount(uint64_t)' defined but not used 
[-Wunused-function]
 static int popCount(uint64_t x)

diff -r aeade2e8d868 -r b8dbe8d7c09e source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Thu May 26 16:45:09 2016 +0530
+++ b/source/common/threadpool.cpp  Mon May 30 10:53:57 2016 +0530
@@ -68,6 +68,7 @@
 # define strcasecmp _stricmp
 #endif
 
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
 const uint64_t m1 = 0x5555555555555555; //binary: 0101...
 const uint64_t m2 = 0x3333333333333333; //binary: 00110011..
 const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary:  4 zeros,  4 ones ...
@@ -80,6 +81,7 @@
 x = (x + (x >> 4)) & m3;
 return (x * h01) >> 56;
 }
+#endif
 
 namespace X265_NS {
 // x265 private namespace
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] threadpool: fix memory leak

2016-05-23 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1464004220 -19800
#  Mon May 23 17:20:20 2016 +0530
# Node ID 75d8cadc3f1acbffbbbc651d26c597a96007167b
# Parent  5af929bc0ed0827ae0be018c1c8edc10d8650406
threadpool: fix memory leak

diff -r 5af929bc0ed0 -r 75d8cadc3f1a source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Mon May 23 15:47:38 2016 +0530
+++ b/source/common/threadpool.cpp  Mon May 23 17:20:20 2016 +0530
@@ -3,6 +3,7 @@
  *
  * Authors: Steve Borho <st...@borho.org>
  *  Min Chen <chenm...@163.com>
+ *  Praveen Kumar Tiwari <prav...@multicorewareinc.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -420,6 +421,7 @@
 if ((nodeMaskPerPool[node] >> j) & 1)
 len += sprintf(nodesstr + len, ",%d", j);
 x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on 
numa nodes %s\n", i, numThreads, nodesstr + 1);
+delete[] nodesstr;
 }
 else
 x265_log(p, X265_LOG_INFO, "Thread pool created using %d 
threads\n", numThreads);
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH 1 of 2] threadpool-fix: utilize all available NUMA nodes for threadpool distribution for windows system,

2016-05-23 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1463997405 -19800
#  Mon May 23 15:26:45 2016 +0530
# Node ID 2f8a373347649f29953ca9f434eec329e1339aca
# Parent  4723933fdec920debefe606d50a9a312f7bc7f6b
threadpool-fix: utilize all available NUMA nodes for threadpool distribution 
for windows system,
linux threadpool configuration info, match with windows -> clean logic

diff -r 4723933fdec9 -r 2f8a37334764 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Fri May 13 09:32:11 2016 +0530
+++ b/source/common/threadpool.cpp  Mon May 23 15:26:45 2016 +0530
@@ -28,6 +28,10 @@
 
 #include 
 
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+#include 
+#endif
+
 #if X86_64
 
 #ifdef __GNUC__
@@ -64,6 +68,19 @@
 # define strcasecmp _stricmp
 #endif
 
+const uint64_t m1 = 0x5555555555555555; //binary: 0101...
+const uint64_t m2 = 0x3333333333333333; //binary: 00110011..
+const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary:  4 zeros,  4 ones ...
+const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of 
0,1,2,3...
+
+int popCount(uint64_t x)
+{
+x -= (x >> 1) & m1;
+x = (x & m2) + ((x >> 2) & m2);
+x = (x + (x >> 4)) & m3;
+return (x * h01) >> 56;
+}
+
 namespace X265_NS {
 // x265 private namespace
 
@@ -238,7 +255,6 @@
 memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
 
 int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
-int cpuCount = getCpuCount();
 bool bNumaSupport = false;
 
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
@@ -248,26 +264,54 @@
 #endif
 
 
-for (int i = 0; i < cpuCount; i++)
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
+for (int i = 0; i < numNumaNodes; i++)
 {
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-UCHAR node;
-if (GetNumaProcessorNode((UCHAR)i, ))
-cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++;
-else
+GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
+cpusPerNode[i] = popCount(groupAffinityPointer->Mask);
+}
+delete groupAffinityPointer;
 #elif HAVE_LIBNUMA
-if (bNumaSupport >= 0)
-cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++;
-else
+if (bNumaSupport >= 0)
+{
+for (int i = 0; i < numNumaNodes; i++)
+{
+struct bitmask* bitMask = numa_allocate_cpumask();
+int ret = numa_node_to_cpus(i, bitMask);
+if (!ret)
+cpusPerNode[i] = numa_num_possible_cpus();
+else
+x265_log(p, X265_LOG_ERROR, "Failed to genrate CPU mask\n");
+numa_free_cpumask(bitMask);
+}
+}
+#else // NUMA not supported
+cpusPerNode[0] = getCpuCount();
 #endif
-cpusPerNode[0]++;
-}
 
 if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG)
-for (int i = 0; i < numNumaNodes; i++)
-x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical 
cores\n", i, cpusPerNode[i]);
-
-/* limit threads based on param->numaPools */
+for (int i = 0; i < numNumaNodes; i++)
+x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical 
cores\n", i, cpusPerNode[i]);
+/* limit threads based on param->numaPools
+ * For windows because threads can't be allocated to live across sockets
+ * changing the default behavior to be per-socket pools -- FIXME */
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+ if (!p->numaPools)
+ {
+ char poolString[50] = "";
+ for (int i = 0; i < numNumaNodes; i++)
+ {
+ char nextCount[10] = "";
+ if (i)
+ sprintf(nextCount, ",%d", cpusPerNode[i]);
+ else
+   sprintf(nextCount, "%d", cpusPerNode[i]);
+ strcat(poolString, nextCount);
+ }
+ x265_param_parse(p, "pools", poolString);
+ }
+#endif
 if (p->numaPools && *p->numaPools)
 {
 const char *nodeStr = p->numaPools;
@@ -389,16 +433,15 @@
 X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot 
have more than MAX_POOL_THREADS threads\n");
 
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-m_winCpuMask = 0x0;
-GROUP_AFFINITY groupAffinity;
+memset(_groupAffinity, 0, sizeof(GROUP_AFFINITY));
 for (int i = 0; i < getNumaNodeCount(); i++)
 {
 int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1;
 if (numaNode != -1)
-if (GetNumaNodeProcessorMaskEx((USHORT)

Re: [x265] [PATCH 1 of 7] threadpool.cpp: get correct CPU count for multisocket machines -> windows system fix

2016-05-23 Thread Praveen Tiwari

Hi,
I am combining these patches into a single patch along with some
updates, so please ignore these patches. On top of this I will update
Mateusz patch (CLI: new logic for '--pools ' option ) to avoid
merge conflicts. Thanks.


.

Regards,
Praveen

On Fri, May 20, 2016 at 4:31 PM, <prav...@multicorewareinc.com> wrote:

> # HG changeset patch
> # User Praveen Tiwari <prav...@multicorewareinc.com>
> # Date 1463655478 -19800
> #  Thu May 19 16:27:58 2016 +0530
> # Node ID 9a6ab28b736e1167ac26977d7da8ab2d23cc296f
> # Parent  aca781339b4c8dae94ff7da73f18cd4439757e87
> threadpool.cpp: get correct CPU count for multisocket machines -> windows
> system fix
>
> diff -r aca781339b4c -r 9a6ab28b736e source/common/threadpool.cpp
> --- a/source/common/threadpool.cpp  Tue May 10 15:33:17 2016 +0530
> +++ b/source/common/threadpool.cpp  Thu May 19 16:27:58 2016 +0530
> @@ -64,6 +64,19 @@
>  # define strcasecmp _stricmp
>  #endif
>
> +const uint64_t m1 = 0x5555555555555555; //binary: 0101...
> +const uint64_t m2 = 0x3333333333333333; //binary: 00110011..
> +const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary:  4 zeros,  4 ones ...
> +const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of
> 0,1,2,3...
> +
> +int popCount(uint64_t x)
> +{
> +x -= (x >> 1) & m1;
> +x = (x & m2) + ((x >> 2) & m2);
> +x = (x + (x >> 4)) & m3;
> +return (x * h01) >> 56;
> +}
> +
>  namespace X265_NS {
>  // x265 private namespace
>
> @@ -525,9 +538,17 @@
>  int ThreadPool::getCpuCount()
>  {
>  #if _WIN32
> -SYSTEM_INFO sysinfo;
> -GetSystemInfo();
> -return sysinfo.dwNumberOfProcessors;
> +enum { MAX_NODE_NUM = 127 };
> +int cpus = 0;
> +int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
> +PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
> +for (int i = 0; i < numNumaNodes; i++)
> +{
> +GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
> +cpus += popCount(groupAffinityPointer->Mask);
> +}
> +delete groupAffinityPointer;
> +return cpus;
>  #elif __unix__ && X265_ARCH_ARM
>  /* Return the number of processors configured by OS. Because, most
> embedded linux distributions
>   * uses only one processor as the scheduler doesn't have enough work
> to utilize all processors */
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH 1 of 7] threadpool.cpp: get correct CPU count for multisocket machines -> windows system fix

2016-05-20 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1463655478 -19800
#  Thu May 19 16:27:58 2016 +0530
# Node ID 9a6ab28b736e1167ac26977d7da8ab2d23cc296f
# Parent  aca781339b4c8dae94ff7da73f18cd4439757e87
threadpool.cpp: get correct CPU count for multisocket machines -> windows 
system fix

diff -r aca781339b4c -r 9a6ab28b736e source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Tue May 10 15:33:17 2016 +0530
+++ b/source/common/threadpool.cpp  Thu May 19 16:27:58 2016 +0530
@@ -64,6 +64,19 @@
 # define strcasecmp _stricmp
 #endif
 
+const uint64_t m1 = 0x5555555555555555; //binary: 0101...
+const uint64_t m2 = 0x3333333333333333; //binary: 00110011..
+const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary:  4 zeros,  4 ones ...
+const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of 
0,1,2,3...
+
+int popCount(uint64_t x)
+{
+x -= (x >> 1) & m1;
+x = (x & m2) + ((x >> 2) & m2);
+x = (x + (x >> 4)) & m3;
+return (x * h01) >> 56;
+}
+
 namespace X265_NS {
 // x265 private namespace
 
@@ -525,9 +538,17 @@
 int ThreadPool::getCpuCount()
 {
 #if _WIN32
-SYSTEM_INFO sysinfo;
-GetSystemInfo(&sysinfo);
-return sysinfo.dwNumberOfProcessors;
+enum { MAX_NODE_NUM = 127 };
+int cpus = 0;
+int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
+PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
+for (int i = 0; i < numNumaNodes; i++)
+{
+GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
+cpus += popCount(groupAffinityPointer->Mask);
+}
+delete groupAffinityPointer;
+return cpus;
 #elif __unix__ && X265_ARCH_ARM
 /* Return the number of processors configured by OS. Because, most 
embedded linux distributions
  * uses only one processor as the scheduler doesn't have enough work to 
utilize all processors */
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH 2 of 7] threadpool.cpp: nicely populate detected NUMA node along with logical cores per node -> windows system fix

2016-05-20 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1463738737 -19800
#  Fri May 20 15:35:37 2016 +0530
# Node ID e988eee2f0dc962b9b94c8cef6f739522bce9afb
# Parent  9a6ab28b736e1167ac26977d7da8ab2d23cc296f
threadpool.cpp: nicely populate detected NUMA node along with logical cores per 
node -> windows system fix

diff -r 9a6ab28b736e -r e988eee2f0dc source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Thu May 19 16:27:58 2016 +0530
+++ b/source/common/threadpool.cpp  Fri May 20 15:35:37 2016 +0530
@@ -251,7 +251,6 @@
 memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
 
 int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
-int cpuCount = getCpuCount();
 bool bNumaSupport = false;
 
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
@@ -261,20 +260,24 @@
 #endif
 
 
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
+for (int i = 0; i < numNumaNodes; i++)
+{
+GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
+cpusPerNode[i] = popCount(groupAffinityPointer->Mask);
+}
+delete groupAffinityPointer;
+#elif HAVE_LIBNUMA
+int cpuCount = getCpuCount();
 for (int i = 0; i < cpuCount; i++)
 {
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-UCHAR node;
-if (GetNumaProcessorNode((UCHAR)i, &node))
-cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++;
-else
-#elif HAVE_LIBNUMA
 if (bNumaSupport >= 0)
 cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++;
-else
+}
+#elif
+cpusPerNode[0] = getCpuCount();
 #endif
-cpusPerNode[0]++;
-}
 
 if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG)
 for (int i = 0; i < numNumaNodes; i++)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH 7 of 7] threadpool.cpp: correctly set pool string -> windows system

2016-05-20 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1463740562 -19800
#  Fri May 20 16:06:02 2016 +0530
# Node ID 7be1c425db3030d901382ceb6c837f5e93014ca8
# Parent  521874f0eea07c26b6a9580302fe61fc7f223e02
threadpool.cpp: correctly set pool string -> windows system

diff -r 521874f0eea0 -r 7be1c425db30 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Fri May 20 15:44:40 2016 +0530
+++ b/source/common/threadpool.cpp  Fri May 20 16:06:02 2016 +0530
@@ -284,10 +284,25 @@
 #endif
 
 if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG)
-for (int i = 0; i < numNumaNodes; i++)
-x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical 
cores\n", i, cpusPerNode[i]);
-
-/* limit threads based on param->numaPools */
+/* limit threads based on param->numaPools
+ * For windows because threads can't be allocated to live across sockets
+ * changing the default behavior to be per-socket pools -- FIXME */
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+ if (!p->numaPools)
+ {
+ char poolString[50] = "";
+ for (int i = 0; i < numNumaNodes; i++)
+ {
+ char nextCount[10] = "";
+ if (i)
+ sprintf(nextCount, ",%d", cpusPerNode[i]);
+ else
+   sprintf(nextCount, "%d", cpusPerNode[i]);
+ strcat(poolString, nextCount);
+ }
+ x265_param_parse(p, "pools", poolString);
+ }
+#endif
 if (p->numaPools && *p->numaPools)
 {
 const char *nodeStr = p->numaPools;
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH 6 of 7] threadpool.cpp: ensure for minimum window version -> _WIN32_WINNT_WIN7

2016-05-20 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1463739280 -19800
#  Fri May 20 15:44:40 2016 +0530
# Node ID 521874f0eea07c26b6a9580302fe61fc7f223e02
# Parent  e96bb0aaec630d44ea318222ae221fe116f4f11a
threadpool.cpp: ensure for minimum window version -> _WIN32_WINNT_WIN7

diff -r e96bb0aaec63 -r 521874f0eea0 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Fri May 20 15:35:37 2016 +0530
+++ b/source/common/threadpool.cpp  Fri May 20 15:44:40 2016 +0530
@@ -28,9 +28,9 @@
 
 #include 
 
-#if _WIN32
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
 #include <winnt.h>
-#endif // _WIN32
+#endif
 
 #if X86_64
 
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH 3 of 7] threadpool.cpp: utilize all available NUMA nodes for threadpool distribution -> windows system fix

2016-05-20 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1463738737 -19800
#  Fri May 20 15:35:37 2016 +0530
# Node ID ab1b27a1712d581c32b007f0f72e482f4a83905d
# Parent  e988eee2f0dc962b9b94c8cef6f739522bce9afb
threadpool.cpp: utilize all available NUMA nodes for threadpool distribution -> 
windows system fix

diff -r e988eee2f0dc -r ab1b27a1712d source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Fri May 20 15:35:37 2016 +0530
+++ b/source/common/threadpool.cpp  Fri May 20 15:35:37 2016 +0530
@@ -27,6 +27,7 @@
 #include "threading.h"
 
 #include <new>
+#include <winnt.h>
 
 #if X86_64
 
@@ -405,16 +406,15 @@
 X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot 
have more than MAX_POOL_THREADS threads\n");
 
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-m_winCpuMask = 0x0;
-GROUP_AFFINITY groupAffinity;
+memset(&m_groupAffinity, 0, sizeof(GROUP_AFFINITY));
 for (int i = 0; i < getNumaNodeCount(); i++)
 {
 int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1;
 if (numaNode != -1)
-if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
-m_winCpuMask |= groupAffinity.Mask;
+if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &m_groupAffinity))
+break;
 }
-m_numaMask = &m_winCpuMask;
+m_numaMask = &m_groupAffinity.Mask;
 #elif HAVE_LIBNUMA
 if (numa_available() >= 0)
 {
@@ -496,11 +496,16 @@
 setThreadNodeAffinity(m_numaMask);
 }
 
-/* static */
 void ThreadPool::setThreadNodeAffinity(void *numaMask)
 {
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-if (SetThreadAffinityMask(GetCurrentThread(), *((DWORD_PTR*)numaMask)))
+UNREFERENCED_PARAMETER(numaMask);
+GROUP_AFFINITY groupAffinity;
+memset(&groupAffinity, 0, sizeof(GROUP_AFFINITY));
+groupAffinity.Group = m_groupAffinity.Group;
+groupAffinity.Mask = m_groupAffinity.Mask;
+const PGROUP_AFFINITY affinityPointer = &groupAffinity;
+if (SetThreadGroupAffinity(GetCurrentThread(), affinityPointer, NULL))
 return;
 else
 x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity for NUMA 
node mask\n");
diff -r e988eee2f0dc -r ab1b27a1712d source/common/threadpool.h
--- a/source/common/threadpool.hFri May 20 15:35:37 2016 +0530
+++ b/source/common/threadpool.hFri May 20 15:35:37 2016 +0530
@@ -85,7 +85,7 @@
 int   m_numWorkers;
 void* m_numaMask; // node mask in linux, cpu mask in windows
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-DWORD_PTR m_winCpuMask;
+GROUP_AFFINITY m_groupAffinity;
 #endif
 bool  m_isActive;
 
@@ -99,6 +99,7 @@
 bool start();
 void stopWorkers();
 void setCurrentThreadAffinity();
+void setThreadNodeAffinity(void *numaMask);
 int  tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t 
secondTryBitmap);
 int  tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& 
master);
 
@@ -106,7 +107,6 @@
 
 static int  getCpuCount();
 static int  getNumaNodeCount();
-static void setThreadNodeAffinity(void *numaMask);
 };
 
 /* Any worker thread may enlist the help of idle worker threads from the same
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH 4 of 7] threadpool.cpp: window specific header 'winnt.h'

2016-05-20 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1463738737 -19800
#  Fri May 20 15:35:37 2016 +0530
# Node ID 598afbba907e06563ac08c0058abdbd7ba50d5d8
# Parent  ab1b27a1712d581c32b007f0f72e482f4a83905d
threadpool.cpp: window specific header 'winnt.h'

diff -r ab1b27a1712d -r 598afbba907e source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Fri May 20 15:35:37 2016 +0530
+++ b/source/common/threadpool.cpp  Fri May 20 15:35:37 2016 +0530
@@ -27,7 +27,10 @@
 #include "threading.h"
 
 #include 
+
+#if _WIN32
 #include <winnt.h>
+#endif // _WIN32
 
 #if X86_64
 
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH 5 of 7] threadpool.cpp: fix linux error: #elif with no expression

2016-05-20 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1463738737 -19800
#  Fri May 20 15:35:37 2016 +0530
# Node ID e96bb0aaec630d44ea318222ae221fe116f4f11a
# Parent  598afbba907e06563ac08c0058abdbd7ba50d5d8
threadpool.cpp: fix linux error: #elif with no expression

diff -r 598afbba907e -r e96bb0aaec63 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Fri May 20 15:35:37 2016 +0530
+++ b/source/common/threadpool.cpp  Fri May 20 15:35:37 2016 +0530
@@ -279,7 +279,7 @@
 if (bNumaSupport >= 0)
 cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++;
 }
-#elif
+#else // NUMA not supported
 cpusPerNode[0] = getCpuCount();
 #endif
 
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] [PATCH] ThreadPool.cpp: fix getCpuCount function for windows systems

2016-05-20 Thread Praveen Tiwari

Please ignore this patch; I am sending an updated one. Thanks.

Regards,
Praveen

On Tue, May 17, 2016 at 7:17 PM, <prav...@multicorewareinc.com> wrote:

> # HG changeset patch
> # User Praveen Tiwari <prav...@multicorewareinc.com>
> # Date 1463492830 -19800
> #  Tue May 17 19:17:10 2016 +0530
> # Node ID cf3c2e0dce0997a499ae1d50fda6891cae83e685
> # Parent  372fc5b12ed6003f8784702956ccf7203ea68a2e
> ThreadPool.cpp: fix getCpuCount function for windows systems
>
> diff -r 372fc5b12ed6 -r cf3c2e0dce09 source/common/threadpool.cpp
> --- a/source/common/threadpool.cpp  Tue May 17 19:06:36 2016 +0530
> +++ b/source/common/threadpool.cpp  Tue May 17 19:17:10 2016 +0530
> @@ -545,9 +545,17 @@
>  int ThreadPool::getCpuCount()
>  {
>  #if _WIN32
> -SYSTEM_INFO sysinfo;
> -GetSystemInfo(&sysinfo);
> -return sysinfo.dwNumberOfProcessors;
> +enum { MAX_NODE_NUM = 127 };
> +int cpus = 0;
> +int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
> +PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
> +for (int i = 0; i < numNumaNodes; i++)
> +{
> +GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
> +cpus += (int)bitCount(groupAffinityPointer->Mask);
> +}
> +delete groupAffinityPointer;
> +return cpus;
>  #elif __unix__ && X265_ARCH_ARM
>  /* Return the number of processors configured by OS. Because, most
> embedded linux distributions
>   * uses only one processor as the scheduler doesn't have enough work
> to utilize all processors */
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] [PATCH] ThreadPool.cpp: fix core count for windows machines

2016-05-20 Thread Praveen Tiwari

Please ignore this patch; I am sending an updated one. Thanks.

Regards,
Praveen

On Tue, May 17, 2016 at 8:01 PM, Pradeep Ramachandran <
prad...@multicorewareinc.com> wrote:

>
> On Tue, May 17, 2016 at 7:07 PM, <prav...@multicorewareinc.com> wrote:
>
>> # HG changeset patch
>> # User Praveen Tiwari <prav...@multicorewareinc.com>
>> # Date 1463492196 -19800
>> #  Tue May 17 19:06:36 2016 +0530
>> # Node ID 372fc5b12ed6003f8784702956ccf7203ea68a2e
>> # Parent  e5b5bdc3c154f908706fb75e006f9abf9b3de96f
>> ThreadPool.cpp: fix core count for windows machines
>>
>> diff -r e5b5bdc3c154 -r 372fc5b12ed6 source/common/threadpool.cpp
>> --- a/source/common/threadpool.cpp  Sat May 14 07:29:46 2016 +0530
>> +++ b/source/common/threadpool.cpp  Tue May 17 19:06:36 2016 +0530
>> @@ -27,6 +27,7 @@
>>  #include "threading.h"
>>
>>  #include 
>> +#include 
>>
>>  #if X86_64
>>
>> @@ -64,6 +65,18 @@
>>  # define strcasecmp _stricmp
>>  #endif
>>
>> +uint64_t bitCount(uint64_t value)
>> +{
>> +uint64_t count = 0;
>> +while (value > 0) // until all bits are zero
>> +{
>> +if ((value & 1) == 1) // check lower bit
>> +count++;
>> +value >>= 1;  // shift bits, removing lower bit
>> +}
>> +return count;
>> +}
>> +
>>  namespace X265_NS {
>>  // x265 private namespace
>>
>> @@ -238,7 +251,6 @@
>>  memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
>>
>>  int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
>> -int cpuCount = getCpuCount();
>>  bool bNumaSupport = false;
>>
>>  #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
>> @@ -248,20 +260,28 @@
>>  #endif
>>
>>
>> +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
>> +PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
>> +for (int i = 0; i < numNumaNodes; i++)
>> +{
>> +GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
>> +cpusPerNode[i] = (int)bitCount(groupAffinityPointer->Mask);
>> +}
>> +delete groupAffinityPointer;
>> +#elif HAVE_LIBNUMA
>> +int cpuCount = getCpuCount();
>>
>
> Can we move to the cleaner implementation of not relying on CPU counts for
> non-windows platforms also?
>
>
>>  for (int i = 0; i < cpuCount; i++)
>>  {
>> -#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
>> -UCHAR node;
>> -if (GetNumaProcessorNode((UCHAR)i, &node))
>> -cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++;
>> -else
>> -#elif HAVE_LIBNUMA
>>  if (bNumaSupport >= 0)
>>  cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++;
>> -else
>> +}
>> +#elif
>> +int cpuCount = getCpuCount();
>> +for (int i = 0; i < cpuCount; i++)
>> +{
>> +cpusPerNode[0]++;
>> +}
>>
>
> How about cpusPerNode[0] = getCpuCount() here? The for loop is unnecessary.
>
>
>>  #endif
>> -cpusPerNode[0]++;
>> -}
>>
>>  if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG)
>>  for (int i = 0; i < numNumaNodes; i++)
>> ___
>> x265-devel mailing list
>> x265-devel@videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>
>
> ___
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] ThreadPool.cpp: fix getCpuCount function for windows systems

2016-05-17 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1463492830 -19800
#  Tue May 17 19:17:10 2016 +0530
# Node ID cf3c2e0dce0997a499ae1d50fda6891cae83e685
# Parent  372fc5b12ed6003f8784702956ccf7203ea68a2e
ThreadPool.cpp: fix getCpuCount function for windows systems

diff -r 372fc5b12ed6 -r cf3c2e0dce09 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Tue May 17 19:06:36 2016 +0530
+++ b/source/common/threadpool.cpp  Tue May 17 19:17:10 2016 +0530
@@ -545,9 +545,17 @@
 int ThreadPool::getCpuCount()
 {
 #if _WIN32
-SYSTEM_INFO sysinfo;
-GetSystemInfo(&sysinfo);
-return sysinfo.dwNumberOfProcessors;
+enum { MAX_NODE_NUM = 127 };
+int cpus = 0;
+int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
+PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
+for (int i = 0; i < numNumaNodes; i++)
+{
+GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
+cpus += (int)bitCount(groupAffinityPointer->Mask);
+}
+delete groupAffinityPointer;
+return cpus;
 #elif __unix__ && X265_ARCH_ARM
 /* Return the number of processors configured by OS. Because, most 
embedded linux distributions
  * uses only one processor as the scheduler doesn't have enough work to 
utilize all processors */
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] ThreadPool.cpp: fix core count for windows machines

2016-05-17 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1463492196 -19800
#  Tue May 17 19:06:36 2016 +0530
# Node ID 372fc5b12ed6003f8784702956ccf7203ea68a2e
# Parent  e5b5bdc3c154f908706fb75e006f9abf9b3de96f
ThreadPool.cpp: fix core count for windows machines

diff -r e5b5bdc3c154 -r 372fc5b12ed6 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp  Sat May 14 07:29:46 2016 +0530
+++ b/source/common/threadpool.cpp  Tue May 17 19:06:36 2016 +0530
@@ -27,6 +27,7 @@
 #include "threading.h"
 
 #include 
+#include 
 
 #if X86_64
 
@@ -64,6 +65,18 @@
 # define strcasecmp _stricmp
 #endif
 
+uint64_t bitCount(uint64_t value)
+{
+uint64_t count = 0;
+while (value > 0) // until all bits are zero
+{
+if ((value & 1) == 1) // check lower bit
+count++;
+value >>= 1;  // shift bits, removing lower bit
+}
+return count;
+}
+
 namespace X265_NS {
 // x265 private namespace
 
@@ -238,7 +251,6 @@
 memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
 
 int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
-int cpuCount = getCpuCount();
 bool bNumaSupport = false;
 
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
@@ -248,20 +260,28 @@
 #endif
 
 
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
+for (int i = 0; i < numNumaNodes; i++)
+{
+GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
+cpusPerNode[i] = (int)bitCount(groupAffinityPointer->Mask);
+}
+delete groupAffinityPointer;
+#elif HAVE_LIBNUMA
+int cpuCount = getCpuCount();
 for (int i = 0; i < cpuCount; i++)
 {
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-UCHAR node;
-if (GetNumaProcessorNode((UCHAR)i, &node))
-cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++;
-else
-#elif HAVE_LIBNUMA
 if (bNumaSupport >= 0)
 cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++;
-else
+}
+#elif
+int cpuCount = getCpuCount();
+for (int i = 0; i < cpuCount; i++)
+{
+cpusPerNode[0]++;
+}
 #endif
-cpusPerNode[0]++;
-}
 
 if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG)
 for (int i = 0; i < numNumaNodes; i++)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] [PATCH] motion.cpp: optimize 'X265_DIA_SEARCH' by eliminating costly branch instructions

2016-03-08 Thread praveen tiwari

Yes, this is for eliminating the if...else, so it performs a conditional assignment
to keep the code correct. I will try to update the macro definition. Thanks.

-Original Message-
From: "chen" <chenm...@163.com>
Sent: ‎09-‎03-‎2016 05:52
To: "Development for x265" <x265-devel@videolan.org>
Subject: Re: [x265] [PATCH] motion.cpp: optimize 'X265_DIA_SEARCH'
by eliminating costly branch instructions

I suggest you modify the macro instead.
Also, this patch depends on a side effect of the conditional statement, which is bad code
style.

At 2016-03-08 22:48:49,prav...@multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari <prav...@multicorewareinc.com>
># Date 1457448163 -19800
>#  Tue Mar 08 20:12:43 2016 +0530
># Node ID 519441d72cf723dc3b279a91a6080f329729cb49
># Parent  0e1b6472c05e3a53538d8e064e502d8a7508eb6e
>motion.cpp: optimize 'X265_DIA_SEARCH' by eliminating costly branch 
>instructions
>
>diff -r 0e1b6472c05e -r 519441d72cf7 source/encoder/motion.cpp
>--- a/source/encoder/motion.cppTue Mar 08 19:08:57 2016 +0530
>+++ b/source/encoder/motion.cppTue Mar 08 20:12:43 2016 +0530
>@@ -659,10 +659,10 @@
> do
> {
> COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
>-COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
>-COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
>-COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
>-COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
>+(((costs[0] << 4) + 1) < bcost) && (bcost = ((costs[0] << 4) + 
>1));  // if ((y) < (x)) (x) = (y);
>+(((costs[1] << 4) + 3) < bcost) && (bcost = ((costs[1] << 4) + 
>3));
>+(((costs[2] << 4) + 4) < bcost) && (bcost = ((costs[2] << 4) + 
>4));
>+(((costs[3] << 4) + 12) < bcost) && (bcost = ((costs[3] << 4) + 
>12));
> if (!(bcost & 15))
> break;
> bmv.x -= (bcost << 28) >> 30;
>___
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] motion.cpp: optimization, eliminate branching

2016-03-08 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1457444337 -19800
#  Tue Mar 08 19:08:57 2016 +0530
# Node ID 0e1b6472c05e3a53538d8e064e502d8a7508eb6e
# Parent  d50663b272c4f308de6f669afe1323990971e08d
motion.cpp: optimization, eliminate branching

diff -r d50663b272c4 -r 0e1b6472c05e source/encoder/motion.cpp
--- a/source/encoder/motion.cpp Tue Mar 08 15:50:05 2016 +0530
+++ b/source/encoder/motion.cpp Tue Mar 08 19:08:57 2016 +0530
@@ -30,6 +30,7 @@
 
 #if _MSC_VER
 #pragma warning(disable: 4127) // conditional  expression is constant (macros 
use this construct)
+#pragma warning (disable: 4706) // assignment within conditional expression
 #endif
 
 using namespace X265_NS;
@@ -762,8 +763,7 @@
 ucost2 = bcost;
 if (bmv.notZero() && bmv != pmv)
 DIA1_ITER(bmv.x, bmv.y);
-if (bcost == ucost2)
-cross_start = 3;
+(bcost == ucost2) && (cross_start = 3);
 
 /* Early Termination */
 omv = bmv;
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] param: cleanup, print reconfigured param option along with its old and new value

2016-03-08 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1457432405 -19800
#  Tue Mar 08 15:50:05 2016 +0530
# Node ID d50663b272c4f308de6f669afe1323990971e08d
# Parent  88aebc166fa8e16f91d5f0acce77690003be9d91
param: cleanup, print reconfigured param option along with its old and new value

diff -r 88aebc166fa8 -r d50663b272c4 source/common/param.cpp
--- a/source/common/param.cpp   Fri Mar 04 16:59:45 2016 +0530
+++ b/source/common/param.cpp   Tue Mar 08 15:50:05 2016 +0530
@@ -1373,36 +1373,32 @@
 if (!param || !reconfiguredParam)
 return;
 
-x265_log(param,X265_LOG_INFO, "Reconfigured param options :\n");
-
-char buf[80] = { 0 };
 char tmp[40];
-#define TOOLCMP(COND1, COND2, STR, VAL)  if (COND1 != COND2) { sprintf(tmp, 
STR, VAL); appendtool(param, buf, sizeof(buf), tmp); }
-TOOLCMP(param->maxNumReferences, reconfiguredParam->maxNumReferences, 
"ref=%d", reconfiguredParam->maxNumReferences);
-TOOLCMP(param->maxTUSize, reconfiguredParam->maxTUSize, "max-tu-size=%d", 
reconfiguredParam->maxTUSize);
-TOOLCMP(param->searchRange, reconfiguredParam->searchRange, "merange=%d", 
reconfiguredParam->searchRange);
-TOOLCMP(param->subpelRefine, reconfiguredParam->subpelRefine, "subme= %d", 
reconfiguredParam->subpelRefine);
-TOOLCMP(param->rdLevel, reconfiguredParam->rdLevel, "rd=%d", 
reconfiguredParam->rdLevel);
-TOOLCMP(param->psyRd, reconfiguredParam->psyRd, "psy-rd=%.2lf", 
reconfiguredParam->psyRd);
-TOOLCMP(param->rdoqLevel, reconfiguredParam->rdoqLevel, "rdoq=%d", 
reconfiguredParam->rdoqLevel);
-TOOLCMP(param->psyRdoq, reconfiguredParam->psyRdoq, "psy-rdoq=%.2lf", 
reconfiguredParam->psyRdoq);
-TOOLCMP(param->noiseReductionIntra, 
reconfiguredParam->noiseReductionIntra, "nr-intra=%d", 
reconfiguredParam->noiseReductionIntra);
-TOOLCMP(param->noiseReductionInter, 
reconfiguredParam->noiseReductionInter, "nr-inter=%d", 
reconfiguredParam->noiseReductionInter);
-TOOLCMP(param->bEnableTSkipFast, reconfiguredParam->bEnableTSkipFast, 
"tskip-fast=%d", reconfiguredParam->bEnableTSkipFast);
-TOOLCMP(param->bEnableSignHiding, reconfiguredParam->bEnableSignHiding, 
"signhide=%d", reconfiguredParam->bEnableSignHiding);
-TOOLCMP(param->bEnableFastIntra, reconfiguredParam->bEnableFastIntra, 
"fast-intra=%d", reconfiguredParam->bEnableFastIntra);
-if (param->bEnableLoopFilter && (param->deblockingFilterBetaOffset != 
reconfiguredParam->deblockingFilterBetaOffset 
+#define TOOLCMP(COND1, COND2, STR, OLD_VAL, NEW_VAL)  if (COND1 != COND2) { 
sprintf(tmp, STR, OLD_VAL, NEW_VAL); printf("\n%s\n", tmp);}
+TOOLCMP(param->maxNumReferences, reconfiguredParam->maxNumReferences, 
"[x265] Reconfigure: ref=%d to %d", param->maxNumReferences, 
reconfiguredParam->maxNumReferences);
+TOOLCMP(param->maxTUSize, reconfiguredParam->maxTUSize, "[x265] 
Reconfigure: max-tu-size=%d to %d", param->maxTUSize, 
reconfiguredParam->maxTUSize);
+TOOLCMP(param->searchRange, reconfiguredParam->searchRange, "[x265] 
Reconfigure: merange=%d to %d", param->searchRange, 
reconfiguredParam->searchRange);
+TOOLCMP(param->subpelRefine, reconfiguredParam->subpelRefine, "[x265] 
Reconfigure: subme=%d to %d", param->subpelRefine, 
reconfiguredParam->subpelRefine);
+TOOLCMP(param->rdLevel, reconfiguredParam->rdLevel, "[x265] Reconfigure: 
rd=%d to %d", param->rdLevel, reconfiguredParam->rdLevel);
+TOOLCMP(param->psyRd, reconfiguredParam->psyRd, "[x265] Reconfigure: 
psy-rd=%.2lf to %.2lf", param->psyRd, reconfiguredParam->psyRd);
+TOOLCMP(param->rdoqLevel, reconfiguredParam->rdoqLevel, "[x265] 
Reconfigure: rdoq=%d to %d", param->rdoqLevel, reconfiguredParam->rdoqLevel);
+TOOLCMP(param->psyRdoq, reconfiguredParam->psyRdoq, "[x265] Reconfigure: 
psy-rdoq=%.2lf to %.2lf", param->psyRdoq, reconfiguredParam->psyRdoq);
+TOOLCMP(param->noiseReductionIntra, 
reconfiguredParam->noiseReductionIntra, "[x265] Reconfigure: nr-intra=%d to 
%d", param->noiseReductionIntra, reconfiguredParam->noiseReductionIntra);
+TOOLCMP(param->noiseReductionInter, 
reconfiguredParam->noiseReductionInter, "[x265] Reconfigure: nr-inter=%d to 
%d", param->noiseReductionInter, reconfiguredParam->noiseReductionInter);
+TOOLCMP(param->bEnableTSkipFast, reconfiguredParam->bEnableTSkipFast, 
"[x265] Reconfigure: tskip-fast=%d to %d", param->bEnableTSkipFast, 
reconfiguredParam->bEnableTSkipFast);
+TOOLCMP(param->

Re: [x265] [PATCH] param: cleanup, print reconfigured param option along with its old and configured value

2016-03-07 Thread Praveen Tiwari

Please ignore this patch; it needs to be updated. Thanks.

Regards,
Praveen

On Tue, Mar 8, 2016 at 10:57 AM, <prav...@multicorewareinc.com> wrote:

> # HG changeset patch
> # User Praveen Tiwari <prav...@multicorewareinc.com>
> # Date 1457356750 -19800
> #  Mon Mar 07 18:49:10 2016 +0530
> # Node ID 6f7dbb1c901cb5b5b88cc20c3213906465021338
> # Parent  88aebc166fa8e16f91d5f0acce77690003be9d91
> param: cleanup, print reconfigured param option along with its old and
> configured value
>
> diff -r 88aebc166fa8 -r 6f7dbb1c901c source/common/param.cpp
> --- a/source/common/param.cpp   Fri Mar 04 16:59:45 2016 +0530
> +++ b/source/common/param.cpp   Mon Mar 07 18:49:10 2016 +0530
> @@ -1373,36 +1373,31 @@
>  if (!param || !reconfiguredParam)
>  return;
>
> -x265_log(param,X265_LOG_INFO, "Reconfigured param options :\n");
> -
> -char buf[80] = { 0 };
>  char tmp[40];
> -#define TOOLCMP(COND1, COND2, STR, VAL)  if (COND1 != COND2) {
> sprintf(tmp, STR, VAL); appendtool(param, buf, sizeof(buf), tmp); }
> -TOOLCMP(param->maxNumReferences, reconfiguredParam->maxNumReferences,
> "ref=%d", reconfiguredParam->maxNumReferences);
> -TOOLCMP(param->maxTUSize, reconfiguredParam->maxTUSize,
> "max-tu-size=%d", reconfiguredParam->maxTUSize);
> -TOOLCMP(param->searchRange, reconfiguredParam->searchRange,
> "merange=%d", reconfiguredParam->searchRange);
> -TOOLCMP(param->subpelRefine, reconfiguredParam->subpelRefine, "subme=
> %d", reconfiguredParam->subpelRefine);
> -TOOLCMP(param->rdLevel, reconfiguredParam->rdLevel, "rd=%d",
> reconfiguredParam->rdLevel);
> -TOOLCMP(param->psyRd, reconfiguredParam->psyRd, "psy-rd=%.2lf",
> reconfiguredParam->psyRd);
> -TOOLCMP(param->rdoqLevel, reconfiguredParam->rdoqLevel, "rdoq=%d",
> reconfiguredParam->rdoqLevel);
> -TOOLCMP(param->psyRdoq, reconfiguredParam->psyRdoq, "psy-rdoq=%.2lf",
> reconfiguredParam->psyRdoq);
> -TOOLCMP(param->noiseReductionIntra,
> reconfiguredParam->noiseReductionIntra, "nr-intra=%d",
> reconfiguredParam->noiseReductionIntra);
> -TOOLCMP(param->noiseReductionInter,
> reconfiguredParam->noiseReductionInter, "nr-inter=%d",
> reconfiguredParam->noiseReductionInter);
> -TOOLCMP(param->bEnableTSkipFast, reconfiguredParam->bEnableTSkipFast,
> "tskip-fast=%d", reconfiguredParam->bEnableTSkipFast);
> -TOOLCMP(param->bEnableSignHiding,
> reconfiguredParam->bEnableSignHiding, "signhide=%d",
> reconfiguredParam->bEnableSignHiding);
> -TOOLCMP(param->bEnableFastIntra, reconfiguredParam->bEnableFastIntra,
> "fast-intra=%d", reconfiguredParam->bEnableFastIntra);
> -if (param->bEnableLoopFilter && (param->deblockingFilterBetaOffset !=
> reconfiguredParam->deblockingFilterBetaOffset
> +#define TOOLCMP(COND1, COND2, STR, OLD_VAL, NEW_VAL)  if (COND1 != COND2)
> { sprintf(tmp, STR, OLD_VAL, NEW_VAL);}
> +TOOLCMP(param->maxNumReferences, reconfiguredParam->maxNumReferences,
> "[x265] Reconfigure: ref=%d to %d", param->maxNumReferences,
> reconfiguredParam->maxNumReferences);
> +TOOLCMP(param->maxTUSize, reconfiguredParam->maxTUSize, "[x265]
> Reconfigure: max-tu-size=%d to %d", param->maxTUSize,
> reconfiguredParam->maxTUSize);
> +TOOLCMP(param->searchRange, reconfiguredParam->searchRange, "[x265]
> Reconfigure: merange=%d to %d", param->searchRange,
> reconfiguredParam->searchRange);
> +TOOLCMP(param->subpelRefine, reconfiguredParam->subpelRefine, "[x265]
> Reconfigure: subme=%d to %d", param->subpelRefine,
> reconfiguredParam->subpelRefine);
> +TOOLCMP(param->rdLevel, reconfiguredParam->rdLevel, "[x265]
> Reconfigure: rd=%d to %d", param->rdLevel, reconfiguredParam->rdLevel);
> +TOOLCMP(param->psyRd, reconfiguredParam->psyRd, "[x265] Reconfigure:
> psy-rd=%.2lf to %.2lf", param->psyRd, reconfiguredParam->psyRd);
> +TOOLCMP(param->rdoqLevel, reconfiguredParam->rdoqLevel, "[x265]
> Reconfigure: rdoq=%d to %d", param->rdoqLevel,
> reconfiguredParam->rdoqLevel);
> +TOOLCMP(param->psyRdoq, reconfiguredParam->psyRdoq, "[x265]
> Reconfigure: psy-rdoq=%.2lf to %.2lf", param->psyRdoq,
> reconfiguredParam->psyRdoq);
> +TOOLCMP(param->noiseReductionIntra,
> reconfiguredParam->noiseReductionIntra, "[x265] Reconfigure: nr-intra=%d to
> %d", param->noiseReductionIntra, reconf

[x265] [PATCH] param: cleanup, print reconfigured param option along with its old and configured value

2016-03-07 Thread praveen

# HG changeset patch
# User Praveen Tiwari <prav...@multicorewareinc.com>
# Date 1457356750 -19800
#  Mon Mar 07 18:49:10 2016 +0530
# Node ID 6f7dbb1c901cb5b5b88cc20c3213906465021338
# Parent  88aebc166fa8e16f91d5f0acce77690003be9d91
param: cleanup, print reconfigured param option along with its old and 
configured value

diff -r 88aebc166fa8 -r 6f7dbb1c901c source/common/param.cpp
--- a/source/common/param.cpp   Fri Mar 04 16:59:45 2016 +0530
+++ b/source/common/param.cpp   Mon Mar 07 18:49:10 2016 +0530
@@ -1373,36 +1373,31 @@
 if (!param || !reconfiguredParam)
 return;
 
-x265_log(param,X265_LOG_INFO, "Reconfigured param options :\n");
-
-char buf[80] = { 0 };
 char tmp[40];
-#define TOOLCMP(COND1, COND2, STR, VAL)  if (COND1 != COND2) { sprintf(tmp, 
STR, VAL); appendtool(param, buf, sizeof(buf), tmp); }
-TOOLCMP(param->maxNumReferences, reconfiguredParam->maxNumReferences, 
"ref=%d", reconfiguredParam->maxNumReferences);
-TOOLCMP(param->maxTUSize, reconfiguredParam->maxTUSize, "max-tu-size=%d", 
reconfiguredParam->maxTUSize);
-TOOLCMP(param->searchRange, reconfiguredParam->searchRange, "merange=%d", 
reconfiguredParam->searchRange);
-TOOLCMP(param->subpelRefine, reconfiguredParam->subpelRefine, "subme= %d", 
reconfiguredParam->subpelRefine);
-TOOLCMP(param->rdLevel, reconfiguredParam->rdLevel, "rd=%d", 
reconfiguredParam->rdLevel);
-TOOLCMP(param->psyRd, reconfiguredParam->psyRd, "psy-rd=%.2lf", 
reconfiguredParam->psyRd);
-TOOLCMP(param->rdoqLevel, reconfiguredParam->rdoqLevel, "rdoq=%d", 
reconfiguredParam->rdoqLevel);
-TOOLCMP(param->psyRdoq, reconfiguredParam->psyRdoq, "psy-rdoq=%.2lf", 
reconfiguredParam->psyRdoq);
-TOOLCMP(param->noiseReductionIntra, 
reconfiguredParam->noiseReductionIntra, "nr-intra=%d", 
reconfiguredParam->noiseReductionIntra);
-TOOLCMP(param->noiseReductionInter, 
reconfiguredParam->noiseReductionInter, "nr-inter=%d", 
reconfiguredParam->noiseReductionInter);
-TOOLCMP(param->bEnableTSkipFast, reconfiguredParam->bEnableTSkipFast, 
"tskip-fast=%d", reconfiguredParam->bEnableTSkipFast);
-TOOLCMP(param->bEnableSignHiding, reconfiguredParam->bEnableSignHiding, 
"signhide=%d", reconfiguredParam->bEnableSignHiding);
-TOOLCMP(param->bEnableFastIntra, reconfiguredParam->bEnableFastIntra, 
"fast-intra=%d", reconfiguredParam->bEnableFastIntra);
-if (param->bEnableLoopFilter && (param->deblockingFilterBetaOffset != 
reconfiguredParam->deblockingFilterBetaOffset 
+#define TOOLCMP(COND1, COND2, STR, OLD_VAL, NEW_VAL)  if (COND1 != COND2) { 
sprintf(tmp, STR, OLD_VAL, NEW_VAL);}
+TOOLCMP(param->maxNumReferences, reconfiguredParam->maxNumReferences, 
"[x265] Reconfigure: ref=%d to %d", param->maxNumReferences, 
reconfiguredParam->maxNumReferences);
+TOOLCMP(param->maxTUSize, reconfiguredParam->maxTUSize, "[x265] 
Reconfigure: max-tu-size=%d to %d", param->maxTUSize, 
reconfiguredParam->maxTUSize);
+TOOLCMP(param->searchRange, reconfiguredParam->searchRange, "[x265] 
Reconfigure: merange=%d to %d", param->searchRange, 
reconfiguredParam->searchRange);
+TOOLCMP(param->subpelRefine, reconfiguredParam->subpelRefine, "[x265] 
Reconfigure: subme=%d to %d", param->subpelRefine, 
reconfiguredParam->subpelRefine);
+TOOLCMP(param->rdLevel, reconfiguredParam->rdLevel, "[x265] Reconfigure: 
rd=%d to %d", param->rdLevel, reconfiguredParam->rdLevel);
+TOOLCMP(param->psyRd, reconfiguredParam->psyRd, "[x265] Reconfigure: 
psy-rd=%.2lf to %.2lf", param->psyRd, reconfiguredParam->psyRd);
+TOOLCMP(param->rdoqLevel, reconfiguredParam->rdoqLevel, "[x265] 
Reconfigure: rdoq=%d to %d", param->rdoqLevel, reconfiguredParam->rdoqLevel);
+TOOLCMP(param->psyRdoq, reconfiguredParam->psyRdoq, "[x265] Reconfigure: 
psy-rdoq=%.2lf to %.2lf", param->psyRdoq, reconfiguredParam->psyRdoq);
+TOOLCMP(param->noiseReductionIntra, 
reconfiguredParam->noiseReductionIntra, "[x265] Reconfigure: nr-intra=%d to 
%d", param->noiseReductionIntra, reconfiguredParam->noiseReductionIntra);
+TOOLCMP(param->noiseReductionInter, 
reconfiguredParam->noiseReductionInter, "[x265] Reconfigure: nr-inter=%d to 
%d", param->noiseReductionInter, reconfiguredParam->noiseReductionInter);
+TOOLCMP(param->bEnableTSkipFast, reconfiguredParam->bEnableTSkipFast, 
"[x265] Reconfigure: tskip-fast=%d to %d", param->bEnableTSkipFast, 
reconfiguredParam->bEnableTSkipFast);
+TOOLCMP(param->bEnableSig

[x265] Fwd: [PATCH] asm: avx2 code for weight_sp() 16bpp

2015-06-30 Thread Praveen Tiwari

-- Forwarded message --
From: aasaipr...@multicorewareinc.com
Date: Mon, Jun 29, 2015 at 4:51 PM
Subject: [x265] [PATCH] asm: avx2 code for weight_sp() 16bpp
To: x265-devel@videolan.org


# HG changeset patch
# User Aasaipriya Chandran aasaipr...@multicorewareinc.com
# Date 1435562395 -19800
#  Mon Jun 29 12:49:55 2015 +0530
# Node ID bebe4e496a432608cf0a9c495debd1970caa387e
# Parent  9feee64efa440c25f016d15ae982789e5393a77e
asm: avx2 code for weight_sp() 16bpp

 avx2: weight_sp  11.37x   4496.63 51139.20
 sse4: weight_sp  6.48x8163.87 52870.36

diff -r 9feee64efa44 -r bebe4e496a43 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Jun 26 15:29:51 2015
+0530
+++ b/source/common/x86/asm-primitives.cpp  Mon Jun 29 12:49:55 2015
+0530
@@ -1517,6 +1517,7 @@
 p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
 p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
 p.weight_pp = PFX(weight_pp_avx2);
+p.weight_sp = PFX(weight_sp_avx2);
 p.sign = PFX(calSign_avx2);

 p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
diff -r 9feee64efa44 -r bebe4e496a43 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Fri Jun 26 15:29:51 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Mon Jun 29 12:49:55 2015 +0530
@@ -1674,8 +1674,128 @@
 dec r5d
 jnz .loopH
 RET
-
-%if ARCH_X86_64
+%endif
+
+
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+cglobal weight_sp, 6,7,9
+mova  m1, [pw_1023]
+mova  m2, [pw_1]
+mov   r6d, r7m


r7 is the 8th register (r0-r7), so it should be "cglobal weight_sp, 6, 8, 9",
and this should be ARCH_X86_64-only code.



+shl   r6d, 16
+orr6d, r6m
+vpbroadcastd  m3, r6d  ; m3 = [round w0]
+movd  xm4, r8m ; m4 = [shift]
+vpbroadcastd  m5, r9m  ; m5 = [offset]
+
+; correct row stride
+add   r3d, r3d
+add   r2d, r2d
+mov   r6d, r4d
+and   r6d, ~(mmsize / SIZEOF_PIXEL - 1)
+sub   r3d, r6d
+sub   r3d, r6d
+sub   r2d, r6d
+sub   r2d, r6d
+
+; generate partial width mask (MUST BE IN YMM0)
+mov   r6d, r4d
+and   r6d, (mmsize / SIZEOF_PIXEL - 1)
+movd  xm0, r6d
+pshuflw   m0, m0, 0
+punpcklqdqm0, m0
+vinserti128   m0, m0, xm0, 1
+pcmpgtw   m0, [pw_0_15]
+
+.loopH:
+mov   r6d, r4d
+
+.loopW:
+movu  m6, [r0]
+paddw m6, [pw_2000]
+
+punpcklwd m7, m6, m2
+pmaddwd   m7, m3   ;(round w0)
+psrad m7, xm4  ;(shift)
+paddd m7, m5   ;(offset)
+
+punpckhwd m6, m2
+pmaddwd   m6, m3
+psrad m6, xm4
+paddd m6, m5
+
+packusdw  m7, m6
+pminuwm7, m1
+
+sub   r6d, (mmsize / SIZEOF_PIXEL)
+jl.width14
+movu  [r1], m7
+lea   r0, [r0 + mmsize]
+lea   r1, [r1 + mmsize]
+je.nextH
+jmp   .loopW
+
+.width14:
+add   r6d, 16
+cmp   r6d, 14
+jl.width12
+movu  [r1], xm7
+vextracti128  xm8, m7, 1
+movq  [r1 + 16], xm8
+pextrd[r1 + 24], xm8, 2
+je.nextH
+
+.width12:
+cmp   r6d, 12
+jl.width10
+movu  [r1], xm7
+vextracti128  xm8, m7, 1
+movq  [r1 + 16], xm8
+je.nextH
+
+.width10:
+cmp   r6d, 10
+jl.width8
+movu  [r1], xm7
+vextracti128  xm8, m7, 1
+movd  [r1 + 16], xm8
+je.nextH
+
+.width8:
+cmp   r6d, 8
+jl.width6
+movu  [r1], xm7
+je.nextH
+
+.width6
+cmp   r6d, 6
+jl.width4
+movq  [r1], xm7
+pextrd[r1 + 8], xm7, 2
+je.nextH
+
+.width4:
+cmp   r6d, 4
+jl

Re: [x265] Fwd: [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp

2015-06-29 Thread Praveen Tiwari

You may want to revisit the 8bpp code as well.

Regards,
Praveen

On Mon, Jun 29, 2015 at 11:24 AM, Rajesh Paulraj 
raj...@multicorewareinc.com wrote:

 We don't need to push this patch. I will improve the SSE version for the same
 size. We may not need AVX2 code for this (I will confirm after rewriting the
 SSE2 code).

 On Mon, Jun 29, 2015 at 10:21 AM, Deepthi Nandakumar 
 deep...@multicorewareinc.com wrote:

 This does not build when HBD is disabled.

 On Fri, Jun 26, 2015 at 5:40 PM, Rajesh Paulraj 
 raj...@multicorewareinc.com wrote:

 yes. It looks like we need to optimize sse2 code. I will work on this.

 On Fri, Jun 26, 2015 at 5:31 PM, Praveen Tiwari 
 prav...@multicorewareinc.com wrote:




 -- Forwarded message --
 From: raj...@multicorewareinc.com
 Date: Fri, Jun 26, 2015 at 3:14 PM
 Subject: [x265] [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp
 To: x265-devel@videolan.org


 # HG changeset patch
 # User Rajesh Paulrajraj...@multicorewareinc.com
 # Date 1435311076 -19800
 #  Fri Jun 26 15:01:16 2015 +0530
 # Node ID 956401f1a679f1e71181b704d64e4acdb6f1a93f
 # Parent  d64227e54233d1646c55bcb4b0b831e5340009ed
 asm: pixelavg_pp[8xN] avx2 code for 10bpp

 avx2:
 avg_pp[  8x4]  4.39x145.09  636.75
 avg_pp[  8x8]  5.33x215.27  1146.55
 avg_pp[ 8x16]  6.50x336.88  2190.68
 avg_pp[ 8x32]  7.71x579.86  4470.84

 sse2:
 avg_pp[  8x4]  2.31x287.63  663.94
 avg_pp[  8x8]  3.26x370.21  1205.26
 avg_pp[ 8x16]  3.99x581.63  2323.25
 avg_pp[ 8x32]  4.78x995.79  4755.58


 Basically, our macro pixel_avg_8xN is just SSE code (a simple syntax
 conversion for AVX2 that does not use the 256-bit capability), so fundamentally
 there should be no major improvement in speed. But the improvements (287.63c
 -> 145.09c, 370.21c -> 215.27c, etc.) are quite good. Does this mean the SSE2
 code is not well optimized? Can you revisit the SSE code using this algorithm?



 diff -r d64227e54233 -r 956401f1a679
 source/common/x86/asm-primitives.cpp
 --- a/source/common/x86/asm-primitives.cpp  Thu Jun 25 16:25:51
 2015 +0530
 +++ b/source/common/x86/asm-primitives.cpp  Fri Jun 26 15:01:16
 2015 +0530
 @@ -1362,6 +1362,10 @@
  p.cu[BLOCK_32x32].intra_pred[33]=
 PFX(intra_pred_ang32_33_avx2);
  p.cu[BLOCK_32x32].intra_pred[34]=
 PFX(intra_pred_ang32_2_avx2);

 +p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx2);
 +p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx2);
 +p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx2);
 +p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx2);
  p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2);
  p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
  p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
 diff -r d64227e54233 -r 956401f1a679 source/common/x86/mc-a.asm
 --- a/source/common/x86/mc-a.asmThu Jun 25 16:25:51 2015 +0530
 +++ b/source/common/x86/mc-a.asmFri Jun 26 15:01:16 2015 +0530
 @@ -4490,6 +4490,88 @@
  RET
  %endif

 +%macro  pixel_avg_W8 0
 +movuxm0, [r2]
 +movuxm1, [r4]
 +pavgw   xm0, xm1
 +movu[r0], xm0
 +movuxm2, [r2 + r3]
 +movuxm3, [r4 + r5]
 +pavgw   xm2, xm3
 +movu[r0 + r1], xm2
 +
 +movuxm0, [r2 + r3 * 2]
 +movuxm1, [r4 + r5 * 2]
 +pavgw   xm0, xm1
 +movu[r0 + r1 * 2], xm0
 +movuxm2, [r2 + r6]
 +movuxm3, [r4 + r7]
 +pavgw   xm2, xm3
 +movu[r0 + r8], xm2
 +
 +lea r0, [r0 + 4 * r1]
 +lea r2, [r2 + 4 * r3]
 +lea r4, [r4 + 4 * r5]
 +%endmacro
 +

 +;---
 +;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0,
 intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)

 +;---
 +%if ARCH_X86_64
 +INIT_YMM avx2
 +cglobal pixel_avg_8x4, 6,10,4
 +add r1d, r1d
 +add r3d, r3d
 +add r5d, r5d
 +lea r6, [r3 * 3]
 +lea r7, [r5 * 3]
 +lea r8, [r1 * 3]
 +pixel_avg_W8
 +RET
 +
 +cglobal pixel_avg_8x8, 6,10,4
 +add r1d, r1d
 +add r3d, r3d
 +add r5d, r5d
 +lea r6, [r3 * 3]
 +lea r7, [r5 * 3]
 +lea r8, [r1 * 3]
 +mov r9d, 2
 +.loop
 +pixel_avg_W8
 +dec r9d
 +jnz .loop
 +RET
 +
 +cglobal pixel_avg_8x16, 6,10,4
 +add r1d, r1d
 +add r3d, r3d
 +add r5d, r5d
 +lea r6, [r3 * 3]
 +lea r7, [r5 * 3]
 +lea r8, [r1 * 3]
 +mov r9d, 4
 +.loop
 +pixel_avg_W8
 +dec r9d
 +jnz .loop
 +RET
 +
 +cglobal pixel_avg_8x32, 6,10,4
 +add r1d, r1d
 +add r3d, r3d
 +add r5d, r5d
 +lea r6, [r3 * 3

[x265] Fwd: [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp

2015-06-26 Thread Praveen Tiwari

-- Forwarded message --
From: raj...@multicorewareinc.com
Date: Fri, Jun 26, 2015 at 3:14 PM
Subject: [x265] [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp
To: x265-devel@videolan.org


# HG changeset patch
# User Rajesh Paulrajraj...@multicorewareinc.com
# Date 1435311076 -19800
#  Fri Jun 26 15:01:16 2015 +0530
# Node ID 956401f1a679f1e71181b704d64e4acdb6f1a93f
# Parent  d64227e54233d1646c55bcb4b0b831e5340009ed
asm: pixelavg_pp[8xN] avx2 code for 10bpp

avx2:
avg_pp[  8x4]  4.39x145.09  636.75
avg_pp[  8x8]  5.33x215.27  1146.55
avg_pp[ 8x16]  6.50x336.88  2190.68
avg_pp[ 8x32]  7.71x579.86  4470.84

sse2:
avg_pp[  8x4]  2.31x287.63  663.94
avg_pp[  8x8]  3.26x370.21  1205.26
avg_pp[ 8x16]  3.99x581.63  2323.25
avg_pp[ 8x32]  4.78x995.79  4755.58

diff -r d64227e54233 -r 956401f1a679 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Jun 25 16:25:51 2015
+0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Jun 26 15:01:16 2015
+0530
@@ -1362,6 +1362,10 @@
 p.cu[BLOCK_32x32].intra_pred[33]=
PFX(intra_pred_ang32_33_avx2);
 p.cu[BLOCK_32x32].intra_pred[34]= PFX(intra_pred_ang32_2_avx2);

+p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx2);
+p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx2);
+p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx2);
+p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx2);
 p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2);
 p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
 p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
diff -r d64227e54233 -r 956401f1a679 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asmThu Jun 25 16:25:51 2015 +0530
+++ b/source/common/x86/mc-a.asmFri Jun 26 15:01:16 2015 +0530
@@ -4490,6 +4490,88 @@
 RET
 %endif

+%macro  pixel_avg_W8 0
+movuxm0, [r2]
+movuxm1, [r4]
+pavgw   xm0, xm1
+movu[r0], xm0
+movuxm2, [r2 + r3]
+movuxm3, [r4 + r5]
+pavgw   xm2, xm3
+movu[r0 + r1], xm2
+
 Your macro is not using AVX2 capabilities. Did you check the performance
of processing two rows combined? It would reduce the number of pavgw and movu
instructions by half. You can use vinserti128 to combine two rows at a time.

+movuxm0, [r2 + r3 * 2]
+movuxm1, [r4 + r5 * 2]
+pavgw   xm0, xm1
+movu[r0 + r1 * 2], xm0
+movuxm2, [r2 + r6]
+movuxm3, [r4 + r7]
+pavgw   xm2, xm3
+movu[r0 + r8], xm2
+
+lea r0, [r0 + 4 * r1]
+lea r2, [r2 + 4 * r3]
+lea r4, [r4 + 4 * r5]
+%endmacro
+
+;---
+;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t
sstride0, const pixel* src1, intptr_t sstride1, int)
+;---
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_avg_8x4, 6,10,4
+add r1d, r1d
+add r3d, r3d
+add r5d, r5d
+lea r6, [r3 * 3]
+lea r7, [r5 * 3]
+lea r8, [r1 * 3]
+pixel_avg_W8
+RET
+
+cglobal pixel_avg_8x8, 6,10,4
+add r1d, r1d
+add r3d, r3d
+add r5d, r5d
+lea r6, [r3 * 3]
+lea r7, [r5 * 3]
+lea r8, [r1 * 3]
+mov r9d, 2
+.loop
+pixel_avg_W8
+dec r9d
+jnz .loop
+RET
+
+cglobal pixel_avg_8x16, 6,10,4
+add r1d, r1d
+add r3d, r3d
+add r5d, r5d
+lea r6, [r3 * 3]
+lea r7, [r5 * 3]
+lea r8, [r1 * 3]
+mov r9d, 4
+.loop
+pixel_avg_W8
+dec r9d
+jnz .loop
+RET
+
+cglobal pixel_avg_8x32, 6,10,4
+add r1d, r1d
+add r3d, r3d
+add r5d, r5d
+lea r6, [r3 * 3]
+lea r7, [r5 * 3]
+lea r8, [r1 * 3]
+mov r9d, 8
+.loop
+pixel_avg_W8
+dec r9d
+jnz .loop
+RET
+%endif
+
 %macro  pixel_avg_H4 0
 movum0, [r2]
 movum1, [r4]
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] Fwd: [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp

2015-06-26 Thread Praveen Tiwari

Ah, the row width is just 8 * 16 = 128 bits, so processing two rows at a time
would also need vextracti128 when storing, which goes to port 5, a bottleneck
port. pavgw is much cheaper than that. You may try combining rows for the 16xN
sizes instead.

Regards,
Praveen

On Fri, Jun 26, 2015 at 3:40 PM, Rajesh Paulraj raj...@multicorewareinc.com
 wrote:

 I tried using vinserti128, but that performs worse than this version,
 so I kept this version.

 On Fri, Jun 26, 2015 at 3:37 PM, Praveen Tiwari 
 prav...@multicorewareinc.com wrote:




 -- Forwarded message --
 From: raj...@multicorewareinc.com
 Date: Fri, Jun 26, 2015 at 3:14 PM
 Subject: [x265] [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp
 To: x265-devel@videolan.org


 # HG changeset patch
 # User Rajesh Paulrajraj...@multicorewareinc.com
 # Date 1435311076 -19800
 #  Fri Jun 26 15:01:16 2015 +0530
 # Node ID 956401f1a679f1e71181b704d64e4acdb6f1a93f
 # Parent  d64227e54233d1646c55bcb4b0b831e5340009ed
 asm: pixelavg_pp[8xN] avx2 code for 10bpp

 avx2:
 avg_pp[  8x4]  4.39x145.09  636.75
 avg_pp[  8x8]  5.33x215.27  1146.55
 avg_pp[ 8x16]  6.50x336.88  2190.68
 avg_pp[ 8x32]  7.71x579.86  4470.84

 sse2:
 avg_pp[  8x4]  2.31x287.63  663.94
 avg_pp[  8x8]  3.26x370.21  1205.26
 avg_pp[ 8x16]  3.99x581.63  2323.25
 avg_pp[ 8x32]  4.78x995.79  4755.58

 diff -r d64227e54233 -r 956401f1a679 source/common/x86/asm-primitives.cpp
 --- a/source/common/x86/asm-primitives.cpp  Thu Jun 25 16:25:51 2015
 +0530
 +++ b/source/common/x86/asm-primitives.cpp  Fri Jun 26 15:01:16 2015
 +0530
 @@ -1362,6 +1362,10 @@
  p.cu[BLOCK_32x32].intra_pred[33]=
 PFX(intra_pred_ang32_33_avx2);
  p.cu[BLOCK_32x32].intra_pred[34]=
 PFX(intra_pred_ang32_2_avx2);

 +p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx2);
 +p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx2);
 +p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx2);
 +p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx2);
  p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2);
  p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
  p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
 diff -r d64227e54233 -r 956401f1a679 source/common/x86/mc-a.asm
 --- a/source/common/x86/mc-a.asmThu Jun 25 16:25:51 2015 +0530
 +++ b/source/common/x86/mc-a.asmFri Jun 26 15:01:16 2015 +0530
 @@ -4490,6 +4490,88 @@
  RET
  %endif

 +%macro  pixel_avg_W8 0
 +movuxm0, [r2]
 +movuxm1, [r4]
 +pavgw   xm0, xm1
 +movu[r0], xm0
 +movuxm2, [r2 + r3]
 +movuxm3, [r4 + r5]
 +pavgw   xm2, xm3
 +movu[r0 + r1], xm2
 +
  Your macro is not using AVX2 capabilities. Did you check the
 performance of processing two rows combined? It would reduce the number of
 pavgw and movu instructions by half. You can use vinserti128 to combine two
 rows at a time.

 +movuxm0, [r2 + r3 * 2]
 +movuxm1, [r4 + r5 * 2]
 +pavgw   xm0, xm1
 +movu[r0 + r1 * 2], xm0
 +movuxm2, [r2 + r6]
 +movuxm3, [r4 + r7]
 +pavgw   xm2, xm3
 +movu[r0 + r8], xm2
 +
 +lea r0, [r0 + 4 * r1]
 +lea r2, [r2 + 4 * r3]
 +lea r4, [r4 + 4 * r5]
 +%endmacro
 +

 +;---
 +;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0,
 intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)

 +;---
 +%if ARCH_X86_64
 +INIT_YMM avx2
 +cglobal pixel_avg_8x4, 6,10,4
 +add r1d, r1d
 +add r3d, r3d
 +add r5d, r5d
 +lea r6, [r3 * 3]
 +lea r7, [r5 * 3]
 +lea r8, [r1 * 3]
 +pixel_avg_W8
 +RET
 +
 +cglobal pixel_avg_8x8, 6,10,4
 +add r1d, r1d
 +add r3d, r3d
 +add r5d, r5d
 +lea r6, [r3 * 3]
 +lea r7, [r5 * 3]
 +lea r8, [r1 * 3]
 +mov r9d, 2
 +.loop
 +pixel_avg_W8
 +dec r9d
 +jnz .loop
 +RET
 +
 +cglobal pixel_avg_8x16, 6,10,4
 +add r1d, r1d
 +add r3d, r3d
 +add r5d, r5d
 +lea r6, [r3 * 3]
 +lea r7, [r5 * 3]
 +lea r8, [r1 * 3]
 +mov r9d, 4
 +.loop
 +pixel_avg_W8
 +dec r9d
 +jnz .loop
 +RET
 +
 +cglobal pixel_avg_8x32, 6,10,4
 +add r1d, r1d
 +add r3d, r3d
 +add r5d, r5d
 +lea r6, [r3 * 3]
 +lea r7, [r5 * 3]
 +lea r8, [r1 * 3]
 +mov r9d, 8
 +.loop
 +pixel_avg_W8
 +dec r9d
 +jnz .loop
 +RET
 +%endif
 +
  %macro  pixel_avg_H4 0
  movum0, [r2]
  movum1, [r4]
 ___
 x265-devel mailing list
 x265-devel@videolan.org

[x265] Fwd: [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp

2015-06-26 Thread Praveen Tiwari

-- Forwarded message --
From: raj...@multicorewareinc.com
Date: Fri, Jun 26, 2015 at 3:14 PM
Subject: [x265] [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp
To: x265-devel@videolan.org


# HG changeset patch
# User Rajesh Paulrajraj...@multicorewareinc.com
# Date 1435311076 -19800
#  Fri Jun 26 15:01:16 2015 +0530
# Node ID 956401f1a679f1e71181b704d64e4acdb6f1a93f
# Parent  d64227e54233d1646c55bcb4b0b831e5340009ed
asm: pixelavg_pp[8xN] avx2 code for 10bpp

avx2:
avg_pp[  8x4]  4.39x145.09  636.75
avg_pp[  8x8]  5.33x215.27  1146.55
avg_pp[ 8x16]  6.50x336.88  2190.68
avg_pp[ 8x32]  7.71x579.86  4470.84

sse2:
avg_pp[  8x4]  2.31x287.63  663.94
avg_pp[  8x8]  3.26x370.21  1205.26
avg_pp[ 8x16]  3.99x581.63  2323.25
avg_pp[ 8x32]  4.78x995.79  4755.58


Basically, our macro pixel_avg_8xN is just SSE code (a simple syntax
conversion for AVX2 that does not use the 256-bit capability), so fundamentally
there should be no major improvement in speed. But the improvements (287.63c
-> 145.09c, 370.21c -> 215.27c, etc.) are quite good. Does this mean the SSE2
code is not well optimized? Can you revisit the SSE code using this algorithm?


diff -r d64227e54233 -r 956401f1a679 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Jun 25 16:25:51 2015
+0530
+++ b/source/common/x86/asm-primitives.cpp  Fri Jun 26 15:01:16 2015
+0530
@@ -1362,6 +1362,10 @@
 p.cu[BLOCK_32x32].intra_pred[33]=
PFX(intra_pred_ang32_33_avx2);
 p.cu[BLOCK_32x32].intra_pred[34]= PFX(intra_pred_ang32_2_avx2);

+p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx2);
+p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx2);
+p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx2);
+p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx2);
 p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2);
 p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
 p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
diff -r d64227e54233 -r 956401f1a679 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asmThu Jun 25 16:25:51 2015 +0530
+++ b/source/common/x86/mc-a.asmFri Jun 26 15:01:16 2015 +0530
@@ -4490,6 +4490,88 @@
 RET
 %endif

+%macro  pixel_avg_W8 0
+movuxm0, [r2]
+movuxm1, [r4]
+pavgw   xm0, xm1
+movu[r0], xm0
+movuxm2, [r2 + r3]
+movuxm3, [r4 + r5]
+pavgw   xm2, xm3
+movu[r0 + r1], xm2
+
+movuxm0, [r2 + r3 * 2]
+movuxm1, [r4 + r5 * 2]
+pavgw   xm0, xm1
+movu[r0 + r1 * 2], xm0
+movuxm2, [r2 + r6]
+movuxm3, [r4 + r7]
+pavgw   xm2, xm3
+movu[r0 + r8], xm2
+
+lea r0, [r0 + 4 * r1]
+lea r2, [r2 + 4 * r3]
+lea r4, [r4 + 4 * r5]
+%endmacro
+
+;---
+;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t
sstride0, const pixel* src1, intptr_t sstride1, int)
+;---
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_avg_8x4, 6,10,4
+add r1d, r1d
+add r3d, r3d
+add r5d, r5d
+lea r6, [r3 * 3]
+lea r7, [r5 * 3]
+lea r8, [r1 * 3]
+pixel_avg_W8
+RET
+
+cglobal pixel_avg_8x8, 6,10,4
+add r1d, r1d
+add r3d, r3d
+add r5d, r5d
+lea r6, [r3 * 3]
+lea r7, [r5 * 3]
+lea r8, [r1 * 3]
+mov r9d, 2
+.loop
+pixel_avg_W8
+dec r9d
+jnz .loop
+RET
+
+cglobal pixel_avg_8x16, 6,10,4
+add r1d, r1d
+add r3d, r3d
+add r5d, r5d
+lea r6, [r3 * 3]
+lea r7, [r5 * 3]
+lea r8, [r1 * 3]
+mov r9d, 4
+.loop
+pixel_avg_W8
+dec r9d
+jnz .loop
+RET
+
+cglobal pixel_avg_8x32, 6,10,4
+add r1d, r1d
+add r3d, r3d
+add r5d, r5d
+lea r6, [r3 * 3]
+lea r7, [r5 * 3]
+lea r8, [r1 * 3]
+mov r9d, 8
+.loop
+pixel_avg_W8
+dec r9d
+jnz .loop
+RET
+%endif
+
 %macro  pixel_avg_H4 0
 movum0, [r2]
 movum1, [r4]
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] param: added x265_atof as internal encoder function, moved to namespace X265_NS

2015-06-19 Thread praveen

# HG changeset patch
# User Praveen Tiwari prav...@multicorewareinc.com
# Date 1434720481 -19800
#  Fri Jun 19 18:58:01 2015 +0530
# Node ID f53934a895e1ffc04edeae11183ad3556c09467b
# Parent  44b6b2df7016f0129e66d91e9aab03261d02758a
param: added x265_atof as internal encoder function, moved to namespace 
X265_NS

diff -r 44b6b2df7016 -r f53934a895e1 source/common/param.cpp
--- a/source/common/param.cpp   Fri Jun 19 16:43:29 2015 +0530
+++ b/source/common/param.cpp   Fri Jun 19 18:58:01 2015 +0530
@@ -471,16 +471,6 @@
 return 0;
 }
 
-static double x265_atof(const char* str, bool bError)
-{
-char *end;
-double v = strtod(str, end);
-
-if (end == str || *end != '\0')
-bError = true;
-return v;
-}
-
 static int parseName(const char* arg, const char* const* names, bool bError)
 {
 for (int i = 0; names[i]; i++)
@@ -890,6 +880,16 @@
 return v;
 }
 
+double x265_atof(const char* str, bool bError)
+{
+char *end;
+double v = strtod(str, end);
+
+if (end == str || *end != '\0')
+bError = true;
+return v;
+}
+
 /* cpu name can be:
  *   auto || true - x265::cpu_detect()
  *   false || no  - disabled
diff -r 44b6b2df7016 -r f53934a895e1 source/common/param.h
--- a/source/common/param.h Fri Jun 19 16:43:29 2015 +0530
+++ b/source/common/param.h Fri Jun 19 18:58:01 2015 +0530
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Deepthi Nandakumar deep...@multicorewareinc.com
+ *  Praveen Kumar Tiwari prav...@multicorewareinc.com
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -33,6 +34,7 @@
 void  x265_param_apply_fastfirstpass(x265_param *p);
 char* x265_param2string(x265_param *param);
 int   x265_atoi(const char *str, bool bError);
+double x265_atof(const char *str, bool bError);
 int   parseCpuName(const char *value, bool bError);
 void  setParamAspectRatio(x265_param *p, int width, int height);
 void  getParamAspectRatio(x265_param *p, int width, int height);
# HG changeset patch
# User Praveen Tiwari prav...@multicorewareinc.com
# Date 1434720481 -19800
#  Fri Jun 19 18:58:01 2015 +0530
# Node ID f53934a895e1ffc04edeae11183ad3556c09467b
# Parent  44b6b2df7016f0129e66d91e9aab03261d02758a
param: added x265_atof as internal encoder function, moved to namespace X265_NS

diff -r 44b6b2df7016 -r f53934a895e1 source/common/param.cpp
--- a/source/common/param.cpp	Fri Jun 19 16:43:29 2015 +0530
+++ b/source/common/param.cpp	Fri Jun 19 18:58:01 2015 +0530
@@ -471,16 +471,6 @@
 return 0;
 }
 
-static double x265_atof(const char* str, bool bError)
-{
-char *end;
-double v = strtod(str, end);
-
-if (end == str || *end != '\0')
-bError = true;
-return v;
-}
-
 static int parseName(const char* arg, const char* const* names, bool bError)
 {
 for (int i = 0; names[i]; i++)
@@ -890,6 +880,16 @@
 return v;
 }
 
+double x265_atof(const char* str, bool bError)
+{
+char *end;
+double v = strtod(str, end);
+
+if (end == str || *end != '\0')
+bError = true;
+return v;
+}
+
 /* cpu name can be:
  *   auto || true - x265::cpu_detect()
  *   false || no  - disabled
diff -r 44b6b2df7016 -r f53934a895e1 source/common/param.h
--- a/source/common/param.h	Fri Jun 19 16:43:29 2015 +0530
+++ b/source/common/param.h	Fri Jun 19 18:58:01 2015 +0530
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Deepthi Nandakumar deep...@multicorewareinc.com
+ *  Praveen Kumar Tiwari prav...@multicorewareinc.com
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -33,6 +34,7 @@
 void  x265_param_apply_fastfirstpass(x265_param *p);
 char* x265_param2string(x265_param *param);
 int   x265_atoi(const char *str, bool bError);
+double x265_atof(const char *str, bool bError);
 int   parseCpuName(const char *value, bool bError);
 void  setParamAspectRatio(x265_param *p, int width, int height);
 void  getParamAspectRatio(x265_param *p, int width, int height);
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH] asm: intra_pred_ang32_18 improved by ~45% over SSE4

2015-04-14 Thread praveen

# HG changeset patch
# User Praveen Tiwari prav...@multicorewareinc.com
# Date 1428992352 -19800
#  Tue Apr 14 11:49:12 2015 +0530
# Node ID 8c31f8daf9a2bbb3408178685eee97d84ca045ff
# Parent  9a0818c97dc72b7974889fd34de073cdb4fde771
asm: intra_pred_ang32_18 improved by ~45% over SSE4

AVX2:
intra_ang_32x32[18] 33.10x   354.58  11737.10

SSE4:
intra_ang_32x32[18] 17.51x   650.80  11396.64

diff -r 9a0818c97dc7 -r 8c31f8daf9a2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Apr 14 13:41:40 2015 +0800
+++ b/source/common/x86/asm-primitives.cpp  Tue Apr 14 11:49:12 2015 +0530
@@ -1821,6 +1821,7 @@
 p.cu[BLOCK_32x32].intra_pred[23] = x265_intra_pred_ang32_23_avx2;
 p.cu[BLOCK_32x32].intra_pred[22] = x265_intra_pred_ang32_22_avx2;
 p.cu[BLOCK_32x32].intra_pred[21] = x265_intra_pred_ang32_21_avx2;
+p.cu[BLOCK_32x32].intra_pred[18] = x265_intra_pred_ang32_18_avx2;
 
 // copy_sp primitives
 p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
diff -r 9a0818c97dc7 -r 8c31f8daf9a2 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Tue Apr 14 13:41:40 2015 +0800
+++ b/source/common/x86/intrapred.h Tue Apr 14 11:49:12 2015 +0530
@@ -277,6 +277,7 @@
 void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const 
pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const 
pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const 
pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang32_18_avx2(pixel* dst, intptr_t dstStride, const 
pixel* srcPix, int dirMode, int bFilter);
 void x265_all_angs_pred_4x4_sse2(pixel *dest, pixel *refPix, pixel *filtPix, 
int bLuma);
 void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, 
int bLuma);
 void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, 
int bLuma);
diff -r 9a0818c97dc7 -r 8c31f8daf9a2 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Tue Apr 14 13:41:40 2015 +0800
+++ b/source/common/x86/intrapred8.asm  Tue Apr 14 11:49:12 2015 +0530
@@ -28,6 +28,7 @@
 SECTION_RODATA 32
 
 intra_pred_shuff_0_8:times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 
7, 8
+intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 
3, 2, 1, 0
 
 pb_0_8times 8 db  0,  8
 pb_unpackbw1  times 2 db  1,  8,  2,  8,  3,  8,  4,  8
@@ -10366,6 +10367,99 @@
 
 RET
 
+INIT_YMM avx2
+cglobal intra_pred_ang32_18, 4, 4, 3
+movu   m0, [r2]
+movu   xm1, [r2 + 1 + 64]
+pshufb xm1, [intra_pred_shuff_15_0]
+mova   xm2, xm0
+vinserti128m1, m1, xm2, 1
+
+lear3, [r1 * 3]
+
+movu   [r0], m0
+palignrm2, m0, m1, 15
+movu   [r0 + r1], m2
+palignrm2, m0, m1, 14
+movu   [r0 + r1 * 2], m2
+palignrm2, m0, m1, 13
+movu   [r0 + r3], m2
+
+lear0, [r0 + r1 * 4]
+palignrm2, m0, m1, 12
+movu   [r0], m2
+palignrm2, m0, m1, 11
+movu   [r0 + r1], m2
+palignrm2, m0, m1, 10
+movu   [r0 + r1 * 2], m2
+palignrm2, m0, m1, 9
+movu   [r0 + r3], m2
+
+lear0, [r0 + r1 * 4]
+palignrm2, m0, m1, 8
+movu   [r0], m2
+palignrm2, m0, m1, 7
+movu   [r0 + r1], m2
+palignrm2, m0, m1, 6
+movu   [r0 + r1 * 2], m2
+palignrm2, m0, m1, 5
+movu   [r0 + r3], m2
+
+lear0, [r0 + r1 * 4]
+palignrm2, m0, m1, 4
+movu   [r0], m2
+palignrm2, m0, m1, 3
+movu   [r0 + r1], m2
+palignrm2, m0, m1, 2
+movu   [r0 + r1 * 2], m2
+palignrm2, m0, m1, 1
+movu   [r0 + r3], m2
+
+lear0, [r0 + r1 * 4]
+movu   [r0], m1
+
+movu   xm0, [r2 + 64 + 17]
+pshufb xm0, [intra_pred_shuff_15_0]
+vinserti128m0, m0, xm1, 1
+
+palignrm2, m1, m0, 15
+movu   [r0 + r1], m2
+palignrm2, m1, m0, 14
+movu   [r0 + r1 * 2], m2
+palignrm2, m1, m0, 13
+movu   [r0 + r3], m2
+
+lear0, [r0 + r1 * 4]
+palignrm2, m1, m0, 12
+movu   [r0], m2
+palignrm2, m1, m0, 11
+movu   [r0 + r1], m2
+palignrm2, m1, m0, 10
+movu   [r0 + r1 * 2], m2
+palignrm2, m1, m0, 9
+movu   [r0 + r3], m2
+
+lear0, [r0 + r1 * 4]
+palignrm2, m1, m0, 8
+movu   [r0], m2
+palignrm2, m1, m0, 7
+movu   [r0 + r1], m2
+palignrm2, m1, m0,6
+movu   [r0

[x265] [PATCH] asm: intra_pred_ang32_18 improved by ~44% over SSE4

2015-04-13 Thread praveen

# HG changeset patch
# User Praveen Tiwari prav...@multicorewareinc.com
# Date 1428917176 -19800
#  Mon Apr 13 14:56:16 2015 +0530
# Node ID f4310212b0745d51d0cc5ed8b2a3098e1bcea016
# Parent  4cccf22b00ee188a72c8dc3896d7dc1613d855ad
asm: intra_pred_ang32_18 improved by ~44% over SSE4

AVX2:
intra_ang_32x32[18] 31.25x   363.88  11371.31

SSE4:
intra_ang_32x32[18] 18.11x   648.61  11743.52

diff -r 4cccf22b00ee -r f4310212b074 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Fri Apr 10 18:15:38 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp  Mon Apr 13 14:56:16 2015 +0530
@@ -1819,6 +1819,7 @@
 p.cu[BLOCK_32x32].intra_pred[23] = x265_intra_pred_ang32_23_avx2;
 p.cu[BLOCK_32x32].intra_pred[22] = x265_intra_pred_ang32_22_avx2;
 p.cu[BLOCK_32x32].intra_pred[21] = x265_intra_pred_ang32_21_avx2;
+p.cu[BLOCK_32x32].intra_pred[18] = x265_intra_pred_ang32_18_avx2;
 
 // copy_sp primitives
 p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
diff -r 4cccf22b00ee -r f4310212b074 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Fri Apr 10 18:15:38 2015 -0500
+++ b/source/common/x86/intrapred.h Mon Apr 13 14:56:16 2015 +0530
@@ -277,6 +277,7 @@
 void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const 
pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const 
pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const 
pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang32_18_avx2(pixel* dst, intptr_t dstStride, const 
pixel* srcPix, int dirMode, int bFilter);
 void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, 
int bLuma);
 void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, 
int bLuma);
 void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, 
int bLuma);
diff -r 4cccf22b00ee -r f4310212b074 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Fri Apr 10 18:15:38 2015 -0500
+++ b/source/common/x86/intrapred8.asm  Mon Apr 13 14:56:16 2015 +0530
@@ -28,6 +28,7 @@
 SECTION_RODATA 32
 
 intra_pred_shuff_0_8:times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 
7, 8
+intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 
3, 2, 1, 0
 
 pb_0_8times 8 db  0,  8
 pb_unpackbw1  times 2 db  1,  8,  2,  8,  3,  8,  4,  8
@@ -10366,6 +10367,101 @@
 
 RET
 
+INIT_YMM avx2
+cglobal intra_pred_ang32_18, 4, 6, 3
+movu   m0, [r2]
+movu   xm1, [r2 + 1 + 64]
+pshufb xm1, [intra_pred_shuff_15_0]
+movu   xm2, xm0
+vinserti128m1, m1, xm2, 1
+
+lear4, [r1 * 2]
+lear3, [r1 * 3]
+lear5, [r1 * 4]
+
+movu   [r0], m0
+palignrm2, m0, m1, 15
+movu   [r0 + r1], m2
+palignrm2, m0, m1, 14
+movu   [r0 + r4], m2
+palignrm2, m0, m1, 13
+movu   [r0 + r3], m2
+
+lear0, [r0 + r5]
+palignrm2, m0, m1, 12
+movu   [r0], m2
+palignrm2, m0, m1, 11
+movu   [r0 + r1], m2
+palignrm2, m0, m1, 10
+movu   [r0 + r4], m2
+palignrm2, m0, m1, 9
+movu   [r0 + r3], m2
+
+lear0, [r0 + r5]
+palignrm2, m0, m1, 8
+movu   [r0], m2
+palignrm2, m0, m1, 7
+movu   [r0 + r1], m2
+palignrm2, m0, m1, 6
+movu   [r0 + r4], m2
+palignrm2, m0, m1, 5
+movu   [r0 + r3], m2
+
+lear0, [r0 + r5]
+palignrm2, m0, m1, 4
+movu   [r0], m2
+palignrm2, m0, m1, 3
+movu   [r0 + r1], m2
+palignrm2, m0, m1, 2
+movu   [r0 + r4], m2
+palignrm2, m0, m1, 1
+movu   [r0 + r3], m2
+
+lear0, [r0 + r5]
+movu   [r0], m1
+
+movu   xm0, [r2 + 64 + 17]
+pshufb xm0, [intra_pred_shuff_15_0]
+vinserti128m0, m0, xm1, 1
+
+palignrm2, m1, m0, 15
+movu   [r0 + r1], m2
+palignrm2, m1, m0, 14
+movu   [r0 + r4], m2
+palignrm2, m1, m0, 13
+movu   [r0 + r3], m2
+
+lear0, [r0 + r5]
+palignrm2, m1, m0, 12
+movu   [r0], m2
+palignrm2, m1, m0, 11
+movu   [r0 + r1], m2
+palignrm2, m1, m0, 10
+movu   [r0 + r4], m2
+palignrm2, m1, m0, 9
+movu   [r0 + r3], m2
+
+lear0, [r0 + r5]
+palignrm2, m1, m0, 8
+movu   [r0], m2
+palignrm2, m1, m0, 7
+movu   [r0 + r1], m2
+palignrm2, m1, m0,6

[x265] [PATCH 1 of 9] asm: intra_pred_ang16_12 improved by ~20% over SSE4

2015-04-09 Thread praveen

# HG changeset patch
# User Praveen Tiwari prav...@multicorewareinc.com
# Date 1428557307 -19800
#  Thu Apr 09 10:58:27 2015 +0530
# Node ID 561f063f3ef9c65397c3f43ca84bcd51185f6ad4
# Parent  7f2d92923de47e7e40f04ff27ed70074b0dca9d3
asm: intra_pred_ang16_12 improved by ~20% over SSE4

AVX2:
intra_ang_16x16[12] 15.16x   777.51  11785.44

SSE4:
intra_ang_16x16[12] 11.51x   976.41  11238.16

diff -r 7f2d92923de4 -r 561f063f3ef9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Apr 08 14:51:00 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp  Thu Apr 09 10:58:27 2015 +0530
@@ -1771,6 +1771,7 @@
 p.cu[BLOCK_16x16].intra_pred[7] = x265_intra_pred_ang16_7_avx2;
 p.cu[BLOCK_16x16].intra_pred[8] = x265_intra_pred_ang16_8_avx2;
 p.cu[BLOCK_16x16].intra_pred[9] = x265_intra_pred_ang16_9_avx2;
+p.cu[BLOCK_16x16].intra_pred[12] = x265_intra_pred_ang16_12_avx2;
 p.cu[BLOCK_16x16].intra_pred[11] = x265_intra_pred_ang16_11_avx2;
 p.cu[BLOCK_16x16].intra_pred[25] = x265_intra_pred_ang16_25_avx2;
 p.cu[BLOCK_16x16].intra_pred[28] = x265_intra_pred_ang16_28_avx2;
diff -r 7f2d92923de4 -r 561f063f3ef9 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Wed Apr 08 14:51:00 2015 -0500
+++ b/source/common/x86/intrapred.h Thu Apr 09 10:58:27 2015 +0530
@@ -240,6 +240,7 @@
 void x265_intra_pred_ang16_7_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_8_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_9_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang16_12_avx2(pixel* dst, intptr_t dstStride, const 
pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_11_avx2(pixel* dst, intptr_t dstStride, const 
pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_25_avx2(pixel* dst, intptr_t dstStride, const 
pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_28_avx2(pixel* dst, intptr_t dstStride, const 
pixel* srcPix, int dirMode, int bFilter);
diff -r 7f2d92923de4 -r 561f063f3ef9 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Wed Apr 08 14:51:00 2015 -0500
+++ b/source/common/x86/intrapred8.asm  Thu Apr 09 10:58:27 2015 +0530
@@ -133,6 +133,17 @@
   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 
18, 14, 18, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
 
+
+ALIGN 32
+c_ang16_mode_12:  db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 
27, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
+  db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 
22, 10, 22, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
+  db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 
17, 15, 17, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
+  db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 
12, 20, 12, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
+  db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 
7, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
+  db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 
2, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
+  db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 
29, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
+  db  8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 
24, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+
 ALIGN 32
 c_ang16_mode_28:  db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 
5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
   db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 
15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
@@ -12066,6 +12077,65 @@
 packuswb  %1, %2
 %endmacro
 
+
+INIT_YMM avx2
+cglobal intra_pred_ang16_12, 3, 5, 13
+mova  m11, [pw_1024]
+lea   r5, [intra_pred_shuff_0_8]
+
+movu  xm9, [r2 + 32]
+pinsrbxm9, [r2], 0
+pslldqxm7, xm9, 1
+pinsrbxm7, [r2 + 6], 0
+vinserti128   m9, m9, xm7, 1
+pshufbm9, [r5]
+
+movu  xm12, [r2 + 6 + 32]
+
+psrldqxm10, xm12, 2
+psrldqxm8, xm12, 1
+vinserti128   m10, m10, xm8, 1
+pshufbm10, [r5]
+
+lea   r3, [3 * r1]
+lea   r4, [c_ang16_mode_12]
+
+INTRA_PRED_ANG16_CAL_ROW m0, m1

[x265] [PATCH 6 of 9] asm: intra_pred_ang8_15 improved by ~5% over SSE4

2015-04-09 Thread praveen

# HG changeset patch
# User Praveen Tiwari prav...@multicorewareinc.com
# Date 1428579494 -19800
#  Thu Apr 09 17:08:14 2015 +0530
# Node ID 31ce12d63d6560df4ce29bdb948525cf73f057f4
# Parent  48278b974eec1dfc8da1643355a701ea073fec36
asm: intra_pred_ang8_15 improved by ~5% over SSE4

AVX2:
intra_ang_8x8[15]   9.57x    342.52  3279.56

SSE4:
intra_ang_8x8[15]   8.95x    360.01  3223.45

diff -r 48278b974eec -r 31ce12d63d65 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Apr 09 16:30:54 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Apr 09 17:08:14 2015 +0530
@@ -1766,6 +1766,7 @@
 p.cu[BLOCK_8x8].intra_pred[11] = x265_intra_pred_ang8_11_avx2;
 p.cu[BLOCK_8x8].intra_pred[13] = x265_intra_pred_ang8_13_avx2;
 p.cu[BLOCK_8x8].intra_pred[14] = x265_intra_pred_ang8_14_avx2;
+p.cu[BLOCK_8x8].intra_pred[15] = x265_intra_pred_ang8_15_avx2;
 p.cu[BLOCK_16x16].intra_pred[3] = x265_intra_pred_ang16_3_avx2;
 p.cu[BLOCK_16x16].intra_pred[4] = x265_intra_pred_ang16_4_avx2;
 p.cu[BLOCK_16x16].intra_pred[5] = x265_intra_pred_ang16_5_avx2;
diff -r 48278b974eec -r 31ce12d63d65 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Thu Apr 09 16:30:54 2015 +0530
+++ b/source/common/x86/intrapred.h Thu Apr 09 17:08:14 2015 +0530
@@ -235,6 +235,7 @@
 void x265_intra_pred_ang8_11_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang8_13_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang8_14_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_15_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_3_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_4_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_5_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
diff -r 48278b974eec -r 31ce12d63d65 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Thu Apr 09 16:30:54 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Thu Apr 09 17:08:14 2015 +0530
@@ -684,6 +684,12 @@
   db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 
31, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
   db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 
5, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
 
+ALIGN 32
+c_ang8_mode_15:   db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 
15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
+  db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 
13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
+  db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 
11, 21, 11, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
+  db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 
9, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
+
 const ang_table
 %assign x 0
 %rep 32
@@ -11876,6 +11882,68 @@
 movhps[r0 + r3], xm2
 RET
 
+INIT_YMM avx2
+cglobal intra_pred_ang8_15, 3, 6, 6
+mova  m3, [pw_1024]
+movu  xm5, [r2 + 16]
+pinsrbxm5, [r2], 0
+lea   r5, [intra_pred_shuff_0_8]
+mova  xm0, xm5
+pslldqxm5, 1
+pinsrbxm5, [r2 + 2], 0
+vinserti128   m0, m0, xm5, 1
+pshufbm0, [r5]
+
+lea   r4, [c_ang8_mode_15]
+pmaddubsw m1, m0, [r4]
+pmulhrsw  m1, m3
+mova  xm0, xm5
+pslldqxm5, 1
+pinsrbxm5, [r2 + 4], 0
+vinserti128   m0, m0, xm5, 1
+pshufbm0, [r5]
+pmaddubsw m2, m0, [r4 + mmsize]
+pmulhrsw  m2, m3
+mova  xm0, xm5
+pslldqxm5, 1
+pinsrbxm5, [r2 + 6], 0
+vinserti128   m0, m0, xm5, 1
+pshufbm0, [r5]
+pmaddubsw m4, m0, [r4 + 2 * mmsize]
+pmulhrsw  m4, m3
+mova  xm0, xm5
+pslldqxm5, 1
+pinsrbxm5, [r2 + 8], 0
+vinserti128   m0, m0, xm5, 1
+pshufbm0, [r5]
+pmaddubsw m0, [r4 + 3 * mmsize]
+pmulhrsw  m0, m3
+packuswb  m1, m2
+packuswb  m4, m0
+
+vperm2i128m2, m1, m4, 0010b
+vperm2i128m1, m1, m4, 00110001b
+punpcklbw m4, m2, m1
+punpckhbw m2, m1
+punpcklwd m1, m4, m2
+punpckhwd m4, m2
+mova  m0

[x265] [PATCH 9 of 9] asm: intra_pred_ang8_21 improved by ~5% over SSE4

2015-04-09 Thread praveen

# HG changeset patch
# User Praveen Tiwari prav...@multicorewareinc.com
# Date 1428583713 -19800
#  Thu Apr 09 18:18:33 2015 +0530
# Node ID 759643fade74e82075a9a6491c41d9f3563df7e2
# Parent  72c75090a5dcbe002bd28d2190703b6d74ac7c81
asm: intra_pred_ang8_21 improved by ~5% over SSE4

AVX2:
intra_ang_8x8[21]   8.55x    239.75  2050.08

SSE4:
intra_ang_8x8[21]   8.03x    252.60  2027.91

diff -r 72c75090a5dc -r 759643fade74 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Apr 09 18:07:22 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Apr 09 18:18:33 2015 +0530
@@ -1765,6 +1765,7 @@
 p.cu[BLOCK_8x8].intra_pred[24] = x265_intra_pred_ang8_24_avx2;
 p.cu[BLOCK_8x8].intra_pred[11] = x265_intra_pred_ang8_11_avx2;
 p.cu[BLOCK_8x8].intra_pred[13] = x265_intra_pred_ang8_13_avx2;
+p.cu[BLOCK_8x8].intra_pred[21] = x265_intra_pred_ang8_21_avx2;
 p.cu[BLOCK_8x8].intra_pred[22] = x265_intra_pred_ang8_22_avx2;
 p.cu[BLOCK_8x8].intra_pred[23] = x265_intra_pred_ang8_23_avx2;
 p.cu[BLOCK_8x8].intra_pred[14] = x265_intra_pred_ang8_14_avx2;
diff -r 72c75090a5dc -r 759643fade74 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Thu Apr 09 18:07:22 2015 +0530
+++ b/source/common/x86/intrapred.h Thu Apr 09 18:18:33 2015 +0530
@@ -236,6 +236,7 @@
 void x265_intra_pred_ang8_13_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang8_14_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang8_15_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_21_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang8_22_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang8_23_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_3_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
diff -r 72c75090a5dc -r 759643fade74 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Thu Apr 09 18:07:22 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Thu Apr 09 18:18:33 2015 +0530
@@ -11945,6 +11945,57 @@
 movhps[r0 + r3], xm2
 RET
 
+INIT_YMM avx2
+cglobal intra_pred_ang8_21, 3, 6, 6
+mova  m3, [pw_1024]
+movu  xm5, [r2]
+lea   r5, [intra_pred_shuff_0_8]
+mova  xm0, xm5
+pslldqxm5, 1
+pinsrbxm5, [r2 + 2 + 16], 0
+vinserti128   m0, m0, xm5, 1
+pshufbm0, [r5]
+
+lea   r4, [c_ang8_mode_15]
+pmaddubsw m1, m0, [r4]
+pmulhrsw  m1, m3
+mova  xm0, xm5
+pslldqxm5, 1
+pinsrbxm5, [r2 + 4 + 16], 0
+vinserti128   m0, m0, xm5, 1
+pshufbm0, [r5]
+pmaddubsw m2, m0, [r4 + mmsize]
+pmulhrsw  m2, m3
+mova  xm0, xm5
+pslldqxm5, 1
+pinsrbxm5, [r2 + 6 + 16], 0
+vinserti128   m0, m0, xm5, 1
+pshufbm0, [r5]
+pmaddubsw m4, m0, [r4 + 2 * mmsize]
+pmulhrsw  m4, m3
+mova  xm0, xm5
+pslldqxm5, 1
+pinsrbxm5, [r2 + 8 + 16], 0
+vinserti128   m0, m0, xm5, 1
+pshufbm0, [r5]
+pmaddubsw m0, [r4 + 3 * mmsize]
+pmulhrsw  m0, m3
+packuswb  m1, m2
+packuswb  m4, m0
+
+lea   r3, [3 * r1]
+movq  [r0], xm1
+vextracti128  xm2, m1, 1
+movq  [r0 + r1], xm2
+movhps[r0 + 2 * r1], xm1
+movhps[r0 + r3], xm2
+lea   r0, [r0 + 4 * r1]
+movq  [r0], xm4
+vextracti128  xm2, m4, 1
+movq  [r0 + r1], xm2
+movhps[r0 + 2 * r1], xm4
+movhps[r0 + r3], xm2
+RET
 
 INIT_YMM avx2
 cglobal intra_pred_ang8_22, 3, 6, 6
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

[x265] [PATCH 4 of 9] asm: intra_pred_ang8_13 improved by ~16% over SSE4

2015-04-09 Thread praveen

# HG changeset patch
# User Praveen Tiwari prav...@multicorewareinc.com
# Date 1428575027 -19800
#  Thu Apr 09 15:53:47 2015 +0530
# Node ID 79c8c583603a8dda7fe22973b55b18d9ff08cc64
# Parent  6f9c3e9aec5218f89389c6f1f363b86181fc20cf
asm: intra_pred_ang8_13 improved by ~16% over SSE4

AVX2:
intra_ang_8x8[13]   10.68x   297.95  3183.33

SSE4:
intra_ang_8x8[13]   9.16x    352.32  3225.62

diff -r 6f9c3e9aec52 -r 79c8c583603a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Apr 09 13:28:56 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Apr 09 15:53:47 2015 +0530
@@ -1764,6 +1764,7 @@
 p.cu[BLOCK_8x8].intra_pred[12] = x265_intra_pred_ang8_12_avx2;
 p.cu[BLOCK_8x8].intra_pred[24] = x265_intra_pred_ang8_24_avx2;
 p.cu[BLOCK_8x8].intra_pred[11] = x265_intra_pred_ang8_11_avx2;
+p.cu[BLOCK_8x8].intra_pred[13] = x265_intra_pred_ang8_13_avx2;
 p.cu[BLOCK_16x16].intra_pred[3] = x265_intra_pred_ang16_3_avx2;
 p.cu[BLOCK_16x16].intra_pred[4] = x265_intra_pred_ang16_4_avx2;
 p.cu[BLOCK_16x16].intra_pred[5] = x265_intra_pred_ang16_5_avx2;
diff -r 6f9c3e9aec52 -r 79c8c583603a source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Thu Apr 09 13:28:56 2015 +0530
+++ b/source/common/x86/intrapred.h Thu Apr 09 15:53:47 2015 +0530
@@ -233,6 +233,7 @@
 void x265_intra_pred_ang8_12_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang8_24_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang8_11_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_13_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_3_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_4_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_5_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
diff -r 6f9c3e9aec52 -r 79c8c583603a source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Thu Apr 09 13:28:56 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Thu Apr 09 15:53:47 2015 +0530
@@ -672,6 +672,11 @@
 pw_planar32_L:dw 31, 30, 29, 28, 27, 26, 25, 24
 pw_planar32_H:dw 23, 22, 21, 20, 19, 18, 17, 16
 
+ALIGN 32
+c_ang8_mode_13:   db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 
23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
+  db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 
5, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
+  db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 
19, 13, 19, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
+  db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 
1, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
 
 const ang_table
 %assign x 0
@@ -11866,6 +11871,61 @@
 RET
 
 
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_13, 3, 6, 6
+mova  m3, [pw_1024]
+movu  xm5, [r2 + 16]
+pinsrbxm5, [r2], 0
+lea   r5, [intra_pred_shuff_0_8]
+vinserti128   m0, m5, xm5, 1
+pshufbm0, [r5]
+
+lea   r4, [c_ang8_mode_13]
+pmaddubsw m1, m0, [r4]
+pmulhrsw  m1, m3
+pslldqxm5, 1
+pinsrbxm5, [r2 + 4], 0
+pshufbxm4, xm5, [r5]
+vinserti128   m0, m0, xm4, 1
+pmaddubsw m2, m0, [r4 + mmsize]
+pmulhrsw  m2, m3
+vinserti128   m0, m0, xm4, 0
+pmaddubsw m4, m0, [r4 + 2 * mmsize]
+pmulhrsw  m4, m3
+pslldqxm5, 1
+pinsrbxm5, [r2 + 7], 0
+pshufbxm5, [r5]
+vinserti128   m0, m0, xm5, 1
+pmaddubsw m0, [r4 + 3 * mmsize]
+pmulhrsw  m0, m3
+packuswb  m1, m2
+packuswb  m4, m0
+
+vperm2i128m2, m1, m4, 0010b
+vperm2i128m1, m1, m4, 00110001b
+punpcklbw m4, m2, m1
+punpckhbw m2, m1
+punpcklwd m1, m4, m2
+punpckhwd m4, m2
+mova  m0, [trans8_shuf]
+vpermdm1, m0, m1
+vpermdm4, m0, m4
+
+lea   r3, [3 * r1]
+movq  [r0], xm1
+movhps[r0 + r1], xm1
+vextracti128  xm2, m1, 1
+movq  [r0 + 2 * r1], xm2
+movhps[r0 + r3], xm2
+lea   r0, [r0 + 4 * r1]
+movq  [r0], xm4
+movhps[r0 + r1], xm4
+vextracti128  xm2, m4, 1
+movq  [r0 + 2 * r1], xm2
+movhps

[x265] [PATCH 7 of 9] asm: intra_pred_ang8_23 improved by ~18% over SSE4

2015-04-09 Thread praveen

# HG changeset patch
# User Praveen Tiwari prav...@multicorewareinc.com
# Date 1428581726 -19800
#  Thu Apr 09 17:45:26 2015 +0530
# Node ID 5db8882a1ef6c5f27bc59869692791290945af2a
# Parent  31ce12d63d6560df4ce29bdb948525cf73f057f4
asm: intra_pred_ang8_23 improved by ~18% over SSE4

AVX2:
intra_ang_8x8[23]   9.75x    205.43  2002.05

SSE4:
intra_ang_8x8[23]   8.12x    251.42  2041.61

diff -r 31ce12d63d65 -r 5db8882a1ef6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Thu Apr 09 17:08:14 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp  Thu Apr 09 17:45:26 2015 +0530
@@ -1765,6 +1765,7 @@
 p.cu[BLOCK_8x8].intra_pred[24] = x265_intra_pred_ang8_24_avx2;
 p.cu[BLOCK_8x8].intra_pred[11] = x265_intra_pred_ang8_11_avx2;
 p.cu[BLOCK_8x8].intra_pred[13] = x265_intra_pred_ang8_13_avx2;
+p.cu[BLOCK_8x8].intra_pred[23] = x265_intra_pred_ang8_23_avx2;
 p.cu[BLOCK_8x8].intra_pred[14] = x265_intra_pred_ang8_14_avx2;
 p.cu[BLOCK_8x8].intra_pred[15] = x265_intra_pred_ang8_15_avx2;
 p.cu[BLOCK_16x16].intra_pred[3] = x265_intra_pred_ang16_3_avx2;
diff -r 31ce12d63d65 -r 5db8882a1ef6 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Thu Apr 09 17:08:14 2015 +0530
+++ b/source/common/x86/intrapred.h Thu Apr 09 17:45:26 2015 +0530
@@ -236,6 +236,7 @@
 void x265_intra_pred_ang8_13_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang8_14_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang8_15_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_23_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_3_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_4_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang16_5_avx2(pixel* dst, intptr_t dstStride, const pixel* 
srcPix, int dirMode, int bFilter);
diff -r 31ce12d63d65 -r 5db8882a1ef6 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm  Thu Apr 09 17:08:14 2015 +0530
+++ b/source/common/x86/intrapred8.asm  Thu Apr 09 17:45:26 2015 +0530
@@ -12056,6 +12056,51 @@
 movhps[r0 + r3], xm2
 RET
 
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_23, 3, 6, 6
+mova  m3, [pw_1024]
+movu  xm5, [r2]
+lea   r5, [intra_pred_shuff_0_8]
+vinserti128   m0, m5, xm5, 1
+pshufbm0, [r5]
+
+lea   r4, [c_ang8_mode_13]
+pmaddubsw m1, m0, [r4]
+pmulhrsw  m1, m3
+pslldqxm5, 1
+pinsrbxm5, [r2 + 4 + 16], 0
+pshufbxm4, xm5, [r5]
+vinserti128   m0, m0, xm4, 1
+pmaddubsw m2, m0, [r4 + mmsize]
+pmulhrsw  m2, m3
+vinserti128   m0, m0, xm4, 0
+pmaddubsw m4, m0, [r4 + 2 * mmsize]
+pmulhrsw  m4, m3
+pslldqxm5, 1
+pinsrbxm5, [r2 + 7 + 16], 0
+pshufbxm5, [r5]
+vinserti128   m0, m0, xm5, 1
+pmaddubsw m0, [r4 + 3 * mmsize]
+pmulhrsw  m0, m3
+
+packuswb  m1, m2
+packuswb  m4, m0
+
+lea   r3, [3 * r1]
+movq  [r0], xm1
+vextracti128  xm2, m1, 1
+movq  [r0 + r1], xm2
+movhps[r0 + 2 * r1], xm1
+movhps[r0 + r3], xm2
+lea   r0, [r0 + 4 * r1]
+movq  [r0], xm4
+vextracti128  xm2, m4, 1
+movq  [r0 + r1], xm2
+movhps[r0 + 2 * r1], xm4
+movhps[r0 + r3], xm2
+RET
+
 INIT_YMM avx2
 cglobal intra_pred_ang8_12, 3, 5, 5
 mova  m3, [pw_1024]
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

1 2 3 4 5 6 >

1 - 100 of 560 matches

Mail list logo