Re: [x265] [X265][PATCH] Fix: Hang when vbv is used with slices
Hi Aruna, I have reviewed the patch internally. Could you please review it once and push this patch to the default of x265? Thanks, Praveen On Fri, Jan 22, 2021 at 4:52 PM Niranjan Bala wrote: > From 64a985847ecca1a6937fe1dae00d3db79cf0bcb2 Mon Sep 17 00:00:00 2001 > From: Niranjan > Date: Fri, 22 Jan 2021 08:26:58 +0530 > Subject: [PATCH] Fix: Hang when vbv is used with slices > > --- > source/encoder/frameencoder.cpp | 30 -- > source/encoder/frameencoder.h | 4 ++-- > 2 files changed, 18 insertions(+), 16 deletions(-) > > diff --git a/source/encoder/frameencoder.cpp > b/source/encoder/frameencoder.cpp > index 2086a15a5..efe85282f 100644 > --- a/source/encoder/frameencoder.cpp > +++ b/source/encoder/frameencoder.cpp > @@ -47,8 +47,6 @@ FrameEncoder::FrameEncoder() > m_slicetypeWaitTime = 0; > m_activeWorkerCount = 0; > m_completionCount = 0; > -m_bAllRowsStop = false; > -m_vbvResetTriggerRow = -1; > m_outStreams = NULL; > m_backupStreams = NULL; > m_substreamSizes = NULL; > @@ -88,6 +86,8 @@ void FrameEncoder::destroy() > delete[] m_outStreams; > delete[] m_backupStreams; > X265_FREE(m_sliceBaseRow); > +X265_FREE((void*)m_bAllRowsStop); > +X265_FREE((void*)m_vbvResetTriggerRow); > X265_FREE(m_sliceMaxBlockRow); > X265_FREE(m_cuGeoms); > X265_FREE(m_ctuGeomMap); > @@ -118,6 +118,8 @@ bool FrameEncoder::init(Encoder *top, int numRows, int > numCols) > bool ok = !!m_numRows; > > m_sliceBaseRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1); > +m_bAllRowsStop = X265_MALLOC(bool, m_param->maxSlices); > +m_vbvResetTriggerRow = X265_MALLOC(int, m_param->maxSlices); > ok &= !!m_sliceBaseRow; > m_sliceGroupSize = (uint16_t)(m_numRows + m_param->maxSlices - 1) / > m_param->maxSlices; > uint32_t sliceGroupSizeAccu = (m_numRows << 8) / m_param->maxSlices; > > @@ -438,8 +440,8 @@ void FrameEncoder::compressFrame() > m_stallStartTime = 0; > > m_completionCount = 0; > -m_bAllRowsStop = false; > -m_vbvResetTriggerRow = -1; > +memset((void*)m_bAllRowsStop, 0, sizeof(bool) * 
m_param->maxSlices); > +memset((void*)m_vbvResetTriggerRow, -1, sizeof(int) * > m_param->maxSlices); > m_rowSliceTotalBits[0] = 0; > m_rowSliceTotalBits[1] = 0; > > @@ -1469,16 +1471,16 @@ void FrameEncoder::processRowEncoder(int intRow, > ThreadLocalData& tld) > curRow.bufferedEntropy.copyState(rowCoder); > curRow.bufferedEntropy.loadContexts(rowCoder); > } > -if (bFirstRowInSlice && m_vbvResetTriggerRow != intRow) > > +if (bFirstRowInSlice && m_vbvResetTriggerRow[curRow.sliceId] > != intRow) > { > curEncData.m_rowStat[row].rowQp = curEncData.m_avgQpRc; > curEncData.m_rowStat[row].rowQpScale = > x265_qp2qScale(curEncData.m_avgQpRc); > } > > FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr]; > -if (m_param->bEnableWavefront && rowInSlice >= col && > !bFirstRowInSlice && m_vbvResetTriggerRow != intRow) > +if (m_param->bEnableWavefront && rowInSlice >= col && > !bFirstRowInSlice && m_vbvResetTriggerRow[curRow.sliceId] != intRow) > cuStat.baseQp = curEncData.m_cuStat[cuAddr - numCols + > 1].baseQp; > -else if (!m_param->bEnableWavefront && !bFirstRowInSlice && > m_vbvResetTriggerRow != intRow) > +else if (!m_param->bEnableWavefront && !bFirstRowInSlice && > m_vbvResetTriggerRow[curRow.sliceId] != intRow) > cuStat.baseQp = curEncData.m_rowStat[row - 1].rowQp; > else > cuStat.baseQp = curEncData.m_rowStat[row].rowQp; > @@ -1655,7 +1657,7 @@ void FrameEncoder::processRowEncoder(int intRow, > ThreadLocalData& tld) > x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - > encode restart required for VBV, to %.2f from %.2f\n", > m_frame->m_poc, row, qpBase, > curEncData.m_cuStat[cuAddr].baseQp); > > -m_vbvResetTriggerRow = row; > +m_vbvResetTriggerRow[curRow.sliceId] = row; > m_outStreams[0].copyBits(_backupStreams[0]); > > rowCoder.copyState(curRow.bufferedEntropy); > @@ -1707,8 +1709,8 @@ void FrameEncoder::processRowEncoder(int intRow, > ThreadLocalData& tld) >
Re: [x265] [PATCH] fix: help for rskip cli option to avoid make errors
Looks good to me. Regards, Praveen On Wed, Sep 16, 2020 at 7:06 PM Srikanth Kurapati < srikanth.kurap...@multicorewareinc.com> wrote: > From a92bc566e03f473af25db8f78d1eb3f40106a959 Mon Sep 17 00:00:00 2001 > From: Srikanth Kurapati > Date: Fri, 4 Sep 2020 11:06:39 +0530 > Subject: [PATCH] fix: help for rskip cli option to avoid make errors > > --- > source/x265cli.cpp | 3 +-- > 1 file changed, 1 insertion(+), 2 deletions(-) > > diff --git a/source/x265cli.cpp b/source/x265cli.cpp > index c28dd7f8c..2575e02cd 100755 > --- a/source/x265cli.cpp > +++ b/source/x265cli.cpp > @@ -127,8 +127,7 @@ namespace X265_NS { > H0(" --[no-]ssim-rdEnable ssim rate distortion > optimization, 0 to disable. Default %s\n", OPT(param->bSsimRd)); > H0(" --[no-]rd-refine Enable QP based RD > refinement for rd levels 5 and 6. Default %s\n", > OPT(param->bEnableRdRefine)); > H0(" --[no-]early-skip Enable early SKIP detection. > Default %s\n", OPT(param->bEnableEarlySkip)); > -H0(" --rskip Set mode for early exit from > recursion. Mode 1: exit using rdcost & CU homogenity. Mode 2: exit using CU > edge density.\n" > -" Mode 0: disabled. Default > %d\n", param->recursionSkipMode); > +H0(" --rskip Enable recurison skip for > early exit. 1: exit using rdcost & CU homogenity. 2: exit using CU edge > density. 0: disabled. Default %d\n", param->recursionSkipMode); > H1(" --rskip-edge-thresholdThreshold in terms of > percentage (integer of range [0,100]) for minimum edge density in CUs used > to prun the recursion depth. Applicable only for rskip mode 2. Value is > preset dependent. Default: %.f\n", param->edgeVarThreshold*100.0f); > H1(" --[no-]tskip-fast Enable fast intra transform > skipping. Default %s\n", OPT(param->bEnableTSkipFast)); > H1(" --[no-]splitrd-skip Enable skipping split RD > analysis when sum of split CU rdCost larger than one split CU rdCost for > Intra CU. 
Default %s\n", OPT(param->bEnableSplitRdSkip)); > -- > 2.20.1.windows.1 > > -- > *With Regards,* > *Srikanth Kurapati.* > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] Corrected wrong cli in docs wrt --colormatrix & --videoformat
>From 866e7d77aa113dcfd9596c27e7dda70d8da8220b Mon Sep 17 00:00:00 2001 From: Praveen Kumar Karadugattu Date: Tue, 1 Sep 2020 22:48:30 +0530 Subject: [PATCH] Corrected wrong cli in docs wrt --colormatrix & --videoformat --- doc/reST/cli.rst| 104 source/CMakeLists.txt | 2 +- source/common/param.cpp | 38 ++ source/x265cli.cpp | 12 +++--- 4 files changed, 80 insertions(+), 76 deletions(-) mode change 100644 => 100755 doc/reST/cli.rst mode change 100644 => 100755 source/CMakeLists.txt mode change 100644 => 100755 source/common/param.cpp mode change 100644 => 100755 source/x265cli.cpp diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst old mode 100644 new mode 100755 index 02828e3..e6c628c --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -141,7 +141,7 @@ Logging/Statistic Options **Residual Energy** Average residual energy. SSE is calculated on fenc and pred(before quantization). - **Luma/Chroma Values** minumum, maximum and average(averaged by area) + **Luma/Chroma Values** minimum, maximum and average(averaged by area) luma and chroma values of source for each frame. **PU Statistics** percentage of PU modes at each depth. @@ -246,7 +246,7 @@ Performance Options .. option:: --pools , --numa-pools - Comma seperated list of threads per NUMA node. If "none", then no worker + Comma separated list of threads per NUMA node. If "none", then no worker pools are created and only frame parallelism is possible. If NULL or "" (default) x265 will use all available threads on each NUMA node:: @@ -284,7 +284,7 @@ Performance Options the last thread pool is spawned only if it has more than 32 threads for 64-bit machines, or 16 for 32-bit machines. If the total number of threads in the system doesn't obey this constraint, we may spawn fewer threads - than cores which has been emperically shown to be better for performance. + than cores which has been empirically shown to be better for performance. 
If the four pool features: :option:`--wpp`, :option:`--pmode`, :option:`--pme` and :option:`--lookahead-slices` are all disabled, @@ -409,7 +409,7 @@ Performance Options Allow encoder to copy input x265 pictures to internal frame buffers. When disabled, x265 will not make an internal copy of the input picture and will work with the - application's buffers. While this allows for deeper integration, it is the responsbility + application's buffers. While this allows for deeper integration, it is the responsibility of the application to (a) ensure that the allocated picture has extra space for padding that will be done by the library, and (b) the buffers aren't recycled until the library has completed encoding this frame (which can be figured out by tracking NALs output by x265) @@ -554,7 +554,7 @@ frame counts) are only applicable to the CLI application. .. option:: --chunk-start - First frame of the chunk. Frames preceeding this in display order will + First frame of the chunk. Frames preceding this in display order will be encoded, however, they will be discarded in the bitstream. This feature can be enabled only in closed GOP structures. Default 0 (disabled). @@ -562,7 +562,7 @@ frame counts) are only applicable to the CLI application. .. option:: --chunk-end Last frame of the chunk. Frames following this in display order will be - used in taking lookahead decisions, but, they will not be encoded. + used in taking lookahead decisions, but they will not be encoded. This feature can be enabled only in closed GOP structures. Default 0 (disabled). @@ -638,7 +638,7 @@ Profile, Level, Tier If :option:`--level-idc` has been specified, --high-tier allows the support of high tier at that level. The encoder will first attempt to encode at the specified level, main tier first, turning on high tier only if - necessary and available at that level.If your requested level does not + necessary and available at that level. 
If your requested level does not support a High tier, high tier will not be supported. If --no-high-tier has been specified, then the encoder will attempt to encode only at the main tier. @@ -647,8 +647,8 @@ Profile, Level, Tier .. option:: --ref <1..16> Max number of L0 references to be allowed. This number has a linear - multiplier effect on the amount of work performed in motion search, - but will generally have a beneficial affect on compression and + multiplier effect on the amount of work performed in motion search + but will generally have a beneficial effect on compression and distortion. Note that x265 allows up to 16 L0 references but the HEVC @@ -668,7 +668,7 @@ Profile, Level, Tier .. option:: --allow-non-conformance, --no-allow-non-conformance Allow libx265 to generate a bitstream with profile and level NONE. - By default it will abort any encode which does not meet strict level + By defau
[x265] [Test-harness] Added the normalization fix of hist-scenecut to output-changing-commits
>From 09d42e9e3850d4f6424fb7b4e8620e4eb3ec7389 Mon Sep 17 00:00:00 2001 From: Praveen Kumar Karadugattu Date: Mon, 6 Jul 2020 13:21:59 +0530 Subject: [PATCH] Added the normalization fix of hist-scenectu to output-changing-commits --- output-changing-commits-git.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/output-changing-commits-git.txt b/output-changing-commits-git.txt index ead6d85..d60c5bc 100644 --- a/output-changing-commits-git.txt +++ b/output-changing-commits-git.txt @@ -18,5 +18,6 @@ # 2) not required [sao], [weightp], [cutree] etc.., because these are # already set in presets so golden outputs will store for that preset. +38774073d45138b01a6abd0e2cfcecae01038a72 [hist-scenecut] Fixed the normalization formula to ouput 0 to 1 and considered max chroma histogram SAD in histogram based scene cut detection 3f476a384a190bab44a2bdcf94a081ccc58b13e8 Merge with default 7bd63522910add904aaf878e85c2e7a2fece80cd analysis-save/load: fix crash during analysis sharing between non-dyadic resolutions -- 1.8.3.1 0001-Added-the-normalization-fix-of-hist-scenectu-to-outp.patch Description: Binary data ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] Fixed the normalization formula to output 0 to 1 and considered max chroma histogram SAD in histogram based scene cut detection
>From 44704e10a60ae314ecd13dfb84c0c4f82d6c1a55 Mon Sep 17 00:00:00 2001 From: Praveen Kumar Karadugattu Date: Wed, 17 Jun 2020 19:28:06 +0530 Subject: [PATCH] Fixed the normalization formula to ouput 0 to 1 and considered max chroma histogram SAD in histogram based scene cut detection --- doc/reST/cli.rst | 8 ++--- source/common/param.cpp| 4 +-- source/encoder/encoder.cpp | 75 ++ source/encoder/encoder.h | 6 ++-- source/x265cli.cpp | 2 +- 5 files changed, 45 insertions(+), 50 deletions(-) diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst index eceec40..c9e288e 100644 --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -1462,13 +1462,13 @@ Slice decision options .. option:: --hist-scenecut, --no-hist-scenecut Indicates that scenecuts need to be detected using luma edge and chroma histograms. - :option: `--hist-scenecut` enables scenecut detection using the histograms and disables the default scene cut algorithm. - :option: `--no-hist-scenecut` disables histogram based scenecut algorithm. + :option:`--hist-scenecut` enables scenecut detection using the histograms and disables the default scene cut algorithm. + :option:`--no-hist-scenecut` disables histogram based scenecut algorithm. -.. option:: --hist-threshold <0.0..2.0> +.. option:: --hist-threshold <0.0..1.0> This value represents the threshold for normalized SAD of edge histograms used in scenecut detection. - This requires :option: `--hist-scenecut` to be enabled. For example, a value of 0.2 indicates that a frame with normalized SAD value + This requires :option:`--hist-scenecut` to be enabled. For example, a value of 0.2 indicates that a frame with normalized SAD value greater than 0.2 against the previous frame as scenecut. Default 0.01. 
diff --git a/source/common/param.cpp b/source/common/param.cpp index fb7244e..925f0c4 100644 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -1688,8 +1688,8 @@ int x265_check_params(x265_param* param) "scenecutThreshold must be greater than 0"); CHECK(param->scenecutBias < 0 || 100 < param->scenecutBias, "scenecut-bias must be between 0 and 100"); -CHECK(param->edgeTransitionThreshold < 0.0 || 2.0 < param->edgeTransitionThreshold, -"hist-threshold must be between 0.0 and 2.0"); +CHECK(param->edgeTransitionThreshold < 0.0 || 1.0 < param->edgeTransitionThreshold, +"hist-threshold must be between 0.0 and 1.0"); CHECK(param->radl < 0 || param->radl > param->bframes, "radl must be between 0 and bframes"); CHECK(param->rdPenalty < 0 || param->rdPenalty > 2, diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index 752e5b2..f6bc540 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -222,12 +222,9 @@ void Encoder::create() uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 
2 : 1; m_edgePic = X265_MALLOC(pixel, m_planeSizes[0] * pixelbytes); m_edgeHistThreshold = m_param->edgeTransitionThreshold; -m_chromaHistThreshold = m_edgeHistThreshold * 10.0; -m_chromaHistThreshold = x265_min(m_chromaHistThreshold, MAX_SCENECUT_THRESHOLD); -m_scaledEdgeThreshold = m_edgeHistThreshold * SCENECUT_STRENGTH_FACTOR; -m_scaledEdgeThreshold = x265_min(m_scaledEdgeThreshold, MAX_SCENECUT_THRESHOLD); -m_scaledChromaThreshold = m_chromaHistThreshold * SCENECUT_STRENGTH_FACTOR; -m_scaledChromaThreshold = x265_min(m_scaledChromaThreshold, MAX_SCENECUT_THRESHOLD); +m_chromaHistThreshold = x265_min(m_edgeHistThreshold * 10.0, MAX_SCENECUT_THRESHOLD); +m_scaledEdgeThreshold = x265_min(m_edgeHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD); +m_scaledChromaThreshold = x265_min(m_chromaHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD); if (m_param->sourceBitDepth != m_param->internalBitDepth) { int size = m_param->sourceWidth * m_param->sourceHeight; @@ -1450,13 +1447,14 @@ bool Encoder::computeHistograms(x265_picture *pic) memset(edgeHist, 0, EDGE_BINS * sizeof(int32_t)); for (uint32_t i = 0; i < m_planeSizes[0]; i++) { -if (!m_edgePic[i]) - edgeHist[0]++; +if (m_edgePic[i]) +edgeHist[1]++; else - edgeHist[1]++; +edgeHist[0]++; } + /* Y Histogram Calculation */ -int32_t* yHist = m_curYUVHist[0]; +int32_t *yHist = m_curYUVHist[0]; memset(yHist, 0, HISTOGRAM_BINS * sizeof(int32_t)); for (uint32_t i = 0; i < m_planeSizes[0]; i++) { @@ -1468,7 +1466,7 @@ bool Encoder::computeHistograms(x265_picture *pic) { /* U Histogram Calculation */ int32_t *uHist = m_curYUVHist[1]; -
[x265] [PATCH] Fixed the normalization formula to output 0 to 1 and considered max chroma histogram SAD in histogram based scene cut detection
>From 44704e10a60ae314ecd13dfb84c0c4f82d6c1a55 Mon Sep 17 00:00:00 2001 From: Praveen Kumar Karadugattu Date: Wed, 17 Jun 2020 19:28:06 +0530 Subject: [PATCH] Fixed the normalization formula to ouput 0 to 1 and considered max chroma histogram SAD in histogram based scene cut detection --- doc/reST/cli.rst | 8 ++--- source/common/param.cpp| 4 +-- source/encoder/encoder.cpp | 75 ++ source/encoder/encoder.h | 6 ++-- source/x265cli.cpp | 2 +- 5 files changed, 45 insertions(+), 50 deletions(-) diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst index eceec40..c9e288e 100644 --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -1462,13 +1462,13 @@ Slice decision options .. option:: --hist-scenecut, --no-hist-scenecut Indicates that scenecuts need to be detected using luma edge and chroma histograms. - :option: `--hist-scenecut` enables scenecut detection using the histograms and disables the default scene cut algorithm. - :option: `--no-hist-scenecut` disables histogram based scenecut algorithm. + :option:`--hist-scenecut` enables scenecut detection using the histograms and disables the default scene cut algorithm. + :option:`--no-hist-scenecut` disables histogram based scenecut algorithm. -.. option:: --hist-threshold <0.0..2.0> +.. option:: --hist-threshold <0.0..1.0> This value represents the threshold for normalized SAD of edge histograms used in scenecut detection. - This requires :option: `--hist-scenecut` to be enabled. For example, a value of 0.2 indicates that a frame with normalized SAD value + This requires :option:`--hist-scenecut` to be enabled. For example, a value of 0.2 indicates that a frame with normalized SAD value greater than 0.2 against the previous frame as scenecut. Default 0.01. 
diff --git a/source/common/param.cpp b/source/common/param.cpp index fb7244e..925f0c4 100644 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -1688,8 +1688,8 @@ int x265_check_params(x265_param* param) "scenecutThreshold must be greater than 0"); CHECK(param->scenecutBias < 0 || 100 < param->scenecutBias, "scenecut-bias must be between 0 and 100"); -CHECK(param->edgeTransitionThreshold < 0.0 || 2.0 < param->edgeTransitionThreshold, -"hist-threshold must be between 0.0 and 2.0"); +CHECK(param->edgeTransitionThreshold < 0.0 || 1.0 < param->edgeTransitionThreshold, +"hist-threshold must be between 0.0 and 1.0"); CHECK(param->radl < 0 || param->radl > param->bframes, "radl must be between 0 and bframes"); CHECK(param->rdPenalty < 0 || param->rdPenalty > 2, diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index 752e5b2..f6bc540 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -222,12 +222,9 @@ void Encoder::create() uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 
2 : 1; m_edgePic = X265_MALLOC(pixel, m_planeSizes[0] * pixelbytes); m_edgeHistThreshold = m_param->edgeTransitionThreshold; -m_chromaHistThreshold = m_edgeHistThreshold * 10.0; -m_chromaHistThreshold = x265_min(m_chromaHistThreshold, MAX_SCENECUT_THRESHOLD); -m_scaledEdgeThreshold = m_edgeHistThreshold * SCENECUT_STRENGTH_FACTOR; -m_scaledEdgeThreshold = x265_min(m_scaledEdgeThreshold, MAX_SCENECUT_THRESHOLD); -m_scaledChromaThreshold = m_chromaHistThreshold * SCENECUT_STRENGTH_FACTOR; -m_scaledChromaThreshold = x265_min(m_scaledChromaThreshold, MAX_SCENECUT_THRESHOLD); +m_chromaHistThreshold = x265_min(m_edgeHistThreshold * 10.0, MAX_SCENECUT_THRESHOLD); +m_scaledEdgeThreshold = x265_min(m_edgeHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD); +m_scaledChromaThreshold = x265_min(m_chromaHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD); if (m_param->sourceBitDepth != m_param->internalBitDepth) { int size = m_param->sourceWidth * m_param->sourceHeight; @@ -1450,13 +1447,14 @@ bool Encoder::computeHistograms(x265_picture *pic) memset(edgeHist, 0, EDGE_BINS * sizeof(int32_t)); for (uint32_t i = 0; i < m_planeSizes[0]; i++) { -if (!m_edgePic[i]) - edgeHist[0]++; +if (m_edgePic[i]) +edgeHist[1]++; else - edgeHist[1]++; +edgeHist[0]++; } + /* Y Histogram Calculation */ -int32_t* yHist = m_curYUVHist[0]; +int32_t *yHist = m_curYUVHist[0]; memset(yHist, 0, HISTOGRAM_BINS * sizeof(int32_t)); for (uint32_t i = 0; i < m_planeSizes[0]; i++) { @@ -1468,7 +1466,7 @@ bool Encoder::computeHistograms(x265_picture *pic) { /* U Histogram Calculation */ int32_t *uHist = m_curYUVHist[1]; -
[x265] [PATCH]Fixed the --hist-scenecut feature to consider the max variation in chroma histograms along with luma edge histograms. Also fixed the formula for normalizing both the SAD values from 0.0
>From 0bc864dbc48624902e5a8314d9ec49ce19a84146 Mon Sep 17 00:00:00 2001 From: Praveen Kumar Karadugattu Date: Tue, 9 Jun 2020 20:27:48 +0530 Subject: [PATCH] Fixed the --hist-scenecut feature to consider the max variation in chroma histograms along with luma edge histograms. Also fixed the formula for normalizing both the SAD values from 0.0 to 1.0. This would alleviate the false positive scene-cuts observed with this feature. --- doc/reST/cli.rst | 8 ++--- source/common/param.cpp| 4 +-- source/encoder/encoder.cpp | 75 ++ source/encoder/encoder.h | 6 ++-- source/x265cli.cpp | 4 +-- 5 files changed, 46 insertions(+), 51 deletions(-) mode change 100644 => 100755 doc/reST/cli.rst mode change 100644 => 100755 source/common/param.cpp mode change 100644 => 100755 source/encoder/encoder.cpp mode change 100644 => 100755 source/encoder/encoder.h mode change 100644 => 100755 source/x265cli.cpp diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst old mode 100644 new mode 100755 index eceec40..c9e288e --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -1462,13 +1462,13 @@ Slice decision options .. option:: --hist-scenecut, --no-hist-scenecut Indicates that scenecuts need to be detected using luma edge and chroma histograms. - :option: `--hist-scenecut` enables scenecut detection using the histograms and disables the default scene cut algorithm. - :option: `--no-hist-scenecut` disables histogram based scenecut algorithm. + :option:`--hist-scenecut` enables scenecut detection using the histograms and disables the default scene cut algorithm. + :option:`--no-hist-scenecut` disables histogram based scenecut algorithm. -.. option:: --hist-threshold <0.0..2.0> +.. option:: --hist-threshold <0.0..1.0> This value represents the threshold for normalized SAD of edge histograms used in scenecut detection. - This requires :option: `--hist-scenecut` to be enabled. For example, a value of 0.2 indicates that a frame with normalized SAD value + This requires :option:`--hist-scenecut` to be enabled. 
For example, a value of 0.2 indicates that a frame with normalized SAD value greater than 0.2 against the previous frame as scenecut. Default 0.01. diff --git a/source/common/param.cpp b/source/common/param.cpp old mode 100644 new mode 100755 index fb7244e..925f0c4 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -1688,8 +1688,8 @@ int x265_check_params(x265_param* param) "scenecutThreshold must be greater than 0"); CHECK(param->scenecutBias < 0 || 100 < param->scenecutBias, "scenecut-bias must be between 0 and 100"); -CHECK(param->edgeTransitionThreshold < 0.0 || 2.0 < param->edgeTransitionThreshold, -"hist-threshold must be between 0.0 and 2.0"); +CHECK(param->edgeTransitionThreshold < 0.0 || 1.0 < param->edgeTransitionThreshold, +"hist-threshold must be between 0.0 and 1.0"); CHECK(param->radl < 0 || param->radl > param->bframes, "radl must be between 0 and bframes"); CHECK(param->rdPenalty < 0 || param->rdPenalty > 2, diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp old mode 100644 new mode 100755 index 752e5b2..f6bc540 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -222,12 +222,9 @@ void Encoder::create() uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 
2 : 1; m_edgePic = X265_MALLOC(pixel, m_planeSizes[0] * pixelbytes); m_edgeHistThreshold = m_param->edgeTransitionThreshold; -m_chromaHistThreshold = m_edgeHistThreshold * 10.0; -m_chromaHistThreshold = x265_min(m_chromaHistThreshold, MAX_SCENECUT_THRESHOLD); -m_scaledEdgeThreshold = m_edgeHistThreshold * SCENECUT_STRENGTH_FACTOR; -m_scaledEdgeThreshold = x265_min(m_scaledEdgeThreshold, MAX_SCENECUT_THRESHOLD); -m_scaledChromaThreshold = m_chromaHistThreshold * SCENECUT_STRENGTH_FACTOR; -m_scaledChromaThreshold = x265_min(m_scaledChromaThreshold, MAX_SCENECUT_THRESHOLD); +m_chromaHistThreshold = x265_min(m_edgeHistThreshold * 10.0, MAX_SCENECUT_THRESHOLD); +m_scaledEdgeThreshold = x265_min(m_edgeHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD); +m_scaledChromaThreshold = x265_min(m_chromaHistThreshold * SCENECUT_STRENGTH_FACTOR, MAX_SCENECUT_THRESHOLD); if (m_param->sourceBitDepth != m_param->internalBitDepth) { int size = m_param->sourceWidth * m_param->sourceHeight; @@ -1450,13 +1447,14 @@ bool Encoder::computeHistograms(x265_picture *pic) memset(edgeHist, 0, EDGE_BINS * sizeof(int32_t)); for (uint32_t i = 0; i < m_planeSizes[0]; i++) { -if (!m_edgePic[i]) - edgeHist[0]++; +if (m_edgePic[i]) +
[x265] [PATCH]Fixed some of the wrongly represented cli parameters related to --colormatrix and --videoformat
>From f8664c406ee597b862ca3ee43d6e008bba5d7004 Mon Sep 17 00:00:00 2001 From: Praveen Kumar Karadugattu Date: Tue, 26 May 2020 18:12:51 +0530 Subject: [PATCH] Fixed some of the wrongly represented cli parameters related to --colormatrix and --videoformat --- doc/reST/cli.rst| 110 source/CMakeLists.txt | 2 +- source/common/param.cpp | 12 +++--- source/x265cli.cpp | 12 +++--- 4 files changed, 68 insertions(+), 68 deletions(-) diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst index eceec40..6a04100 100644 --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -141,7 +141,7 @@ Logging/Statistic Options **Residual Energy** Average residual energy. SSE is calculated on fenc and pred(before quantization). - **Luma/Chroma Values** minumum, maximum and average(averaged by area) + **Luma/Chroma Values** minimum, maximum and average(averaged by area) luma and chroma values of source for each frame. **PU Statistics** percentage of PU modes at each depth. @@ -246,7 +246,7 @@ Performance Options .. option:: --pools , --numa-pools - Comma seperated list of threads per NUMA node. If "none", then no worker + Comma separated list of threads per NUMA node. If "none", then no worker pools are created and only frame parallelism is possible. If NULL or "" (default) x265 will use all available threads on each NUMA node:: @@ -284,7 +284,7 @@ Performance Options the last thread pool is spawned only if it has more than 32 threads for 64-bit machines, or 16 for 32-bit machines. If the total number of threads in the system doesn't obey this constraint, we may spawn fewer threads - than cores which has been emperically shown to be better for performance. + than cores which has been empirically shown to be better for performance. If the four pool features: :option:`--wpp`, :option:`--pmode`, :option:`--pme` and :option:`--lookahead-slices` are all disabled, @@ -409,7 +409,7 @@ Performance Options Allow encoder to copy input x265 pictures to internal frame buffers. 
When disabled, x265 will not make an internal copy of the input picture and will work with the - application's buffers. While this allows for deeper integration, it is the responsbility + application's buffers. While this allows for deeper integration, it is the responsibility of the application to (a) ensure that the allocated picture has extra space for padding that will be done by the library, and (b) the buffers aren't recycled until the library has completed encoding this frame (which can be figured out by tracking NALs output by x265) @@ -554,7 +554,7 @@ frame counts) are only applicable to the CLI application. .. option:: --chunk-start - First frame of the chunk. Frames preceeding this in display order will + First frame of the chunk. Frames preceding this in display order will be encoded, however, they will be discarded in the bitstream. This feature can be enabled only in closed GOP structures. Default 0 (disabled). @@ -562,7 +562,7 @@ frame counts) are only applicable to the CLI application. .. option:: --chunk-end Last frame of the chunk. Frames following this in display order will be - used in taking lookahead decisions, but, they will not be encoded. + used in taking lookahead decisions, but they will not be encoded. This feature can be enabled only in closed GOP structures. Default 0 (disabled). @@ -638,7 +638,7 @@ Profile, Level, Tier If :option:`--level-idc` has been specified, --high-tier allows the support of high tier at that level. The encoder will first attempt to encode at the specified level, main tier first, turning on high tier only if - necessary and available at that level.If your requested level does not + necessary and available at that level. If your requested level does not support a High tier, high tier will not be supported. If --no-high-tier has been specified, then the encoder will attempt to encode only at the main tier. @@ -647,8 +647,8 @@ Profile, Level, Tier .. option:: --ref <1..16> Max number of L0 references to be allowed. 
This number has a linear - multiplier effect on the amount of work performed in motion search, - but will generally have a beneficial affect on compression and + multiplier effect on the amount of work performed in motion search + but will generally have a beneficial effect on compression and distortion. Note that x265 allows up to 16 L0 references but the HEVC @@ -668,7 +668,7 @@ Profile, Level, Tier .. option:: --allow-non-conformance, --no-allow-non-conformance Allow libx265 to generate a bitstream with profile and level NONE. - By default it will abort any encode which does not meet strict level + By default, it will abort any encode which does not meet strict level compliance. The two most likely causes for non-conformance are :option:`--ctu` being too small, :option:`--ref` being too high, or the bitrate o
Re: [x265] [PATCH] Fixed some of the wrongly represented cli parameters in the docs related to --colormatrix and --videoformat.
Hi Aruna, Please find attached the updated patch with the review comments incorporated. Thanks & Regards, Praveen On Mon, May 4, 2020 at 4:56 PM Praveen Kumar Karadugattu < praveenku...@multicorewareinc.com> wrote: > Hi Aruna, > > I have incorporated the changes and the updated patch is sent in the > previous email in this thread. Please check and push the same. > > Thanks & Regards, > Praveen > > On Mon, May 4, 2020 at 4:55 PM Praveen Kumar Karadugattu < > praveenku...@multicorewareinc.com> wrote: > >> From 951411943ed54043c2111f4a09419cbc77e5f0fd Mon Sep 17 00:00:00 2001 >> From: Praveen Karadugattu >> Date: Mon, 4 May 2020 16:50:14 +0530 >> Subject: [PATCH] Fixed some of the wrongly represented cli parameters in >> the >> docs related to --colormatrix and --videoformat. >> --- >> doc/reST/cli.rst| 112 >> >> source/CMakeLists.txt | 2 +- >> source/common/param.cpp | 12 +++--- >> source/x265cli.cpp | 12 +++--- >> 4 files changed, 69 insertions(+), 69 deletions(-) >> diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst >> index 1e2765d..524714a 100644 >> --- a/doc/reST/cli.rst >> +++ b/doc/reST/cli.rst >> @@ -141,7 +141,7 @@ Logging/Statistic Options >> **Residual Energy** Average residual energy. SSE is calculated on fenc >> and pred(before quantization). >> >> - **Luma/Chroma Values** minumum, maximum and average(averaged by area) >> + **Luma/Chroma Values** minimum, maximum and average(averaged by area) >> luma and chroma values of source for each frame. >> >> **PU Statistics** percentage of PU modes at each depth. >> @@ -246,7 +246,7 @@ Performance Options >> >> .. option:: --pools , --numa-pools >> >> - Comma seperated list of threads per NUMA node. If "none", then no worker >> + Comma separated list of threads per NUMA node. If "none", then no worker >> pools are created and only frame parallelism is possible. 
If NULL or "" >> (default) x265 will use all available threads on each NUMA node:: >> >> @@ -284,7 +284,7 @@ Performance Options >> the last thread pool is spawned only if it has more than 32 threads for >> 64-bit machines, or 16 for 32-bit machines. If the total number of >> threads >> in the system doesn't obey this constraint, we may spawn fewer threads >> - than cores which has been emperically shown to be better for >> performance. >> + than cores which has been empirically shown to be better for >> performance. >> >> If the four pool features: :option:`--wpp`, :option:`--pmode`, >> :option:`--pme` and :option:`--lookahead-slices` are all disabled, >> @@ -409,7 +409,7 @@ Performance Options >> >> Allow encoder to copy input x265 pictures to internal frame buffers. >> When disabled, >> x265 will not make an internal copy of the input picture and will work >> with the >> - application's buffers. While this allows for deeper integration, it is >> the responsbility >> + application's buffers. While this allows for deeper integration, it is >> the responsibility >> of the application to (a) ensure that the allocated picture has extra >> space for padding >> that will be done by the library, and (b) the buffers aren't recycled >> until the library >> has completed encoding this frame (which can be figured out by tracking >> NALs output by x265) >> @@ -554,7 +554,7 @@ frame counts) are only applicable to the CLI >> application. >> >> .. option:: --chunk-start >> >> - First frame of the chunk. Frames preceeding this in display order will >> + First frame of the chunk. Frames preceding this in display order will >> be encoded, however, they will be discarded in the bitstream. This >> feature can be enabled only in closed GOP structures. >> Default 0 (disabled). >> @@ -562,7 +562,7 @@ frame counts) are only applicable to the CLI >> application. >> .. option:: --chunk-end >> >> Last frame of the chunk. 
Frames following this in display order will be >> - used in taking lookahead decisions, but, they will not be encoded. >> + used in taking lookahead decisions, but they will not be encoded. >> This feature can be enabled only in closed GOP structures. >> Default 0 (disabled). >> >> @@ -638,7 +638,7 @@ Profile, Level, Tier >> If :option:`--level-idc` has been specified, --high-tier allows the >> support of high tier at that level. The encoder will first attempt to >> encode >>
Re: [x265] [PATCH] Fixed some of the wrongly represented cli parameters in the docs related to --colormatrix and --videoformat.
>From 951411943ed54043c2111f4a09419cbc77e5f0fd Mon Sep 17 00:00:00 2001 From: Praveen Karadugattu Date: Mon, 4 May 2020 16:50:14 +0530 Subject: [PATCH] Fixed some of the wrongly represented cli parameters in the docs related to --colormatrix and --videoformat. --- doc/reST/cli.rst| 112 source/CMakeLists.txt | 2 +- source/common/param.cpp | 12 +++--- source/x265cli.cpp | 12 +++--- 4 files changed, 69 insertions(+), 69 deletions(-) diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst index 1e2765d..524714a 100644 --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -141,7 +141,7 @@ Logging/Statistic Options **Residual Energy** Average residual energy. SSE is calculated on fenc and pred(before quantization). - **Luma/Chroma Values** minumum, maximum and average(averaged by area) + **Luma/Chroma Values** minimum, maximum and average(averaged by area) luma and chroma values of source for each frame. **PU Statistics** percentage of PU modes at each depth. @@ -246,7 +246,7 @@ Performance Options .. option:: --pools , --numa-pools - Comma seperated list of threads per NUMA node. If "none", then no worker + Comma separated list of threads per NUMA node. If "none", then no worker pools are created and only frame parallelism is possible. If NULL or "" (default) x265 will use all available threads on each NUMA node:: @@ -284,7 +284,7 @@ Performance Options the last thread pool is spawned only if it has more than 32 threads for 64-bit machines, or 16 for 32-bit machines. If the total number of threads in the system doesn't obey this constraint, we may spawn fewer threads - than cores which has been emperically shown to be better for performance. + than cores which has been empirically shown to be better for performance. If the four pool features: :option:`--wpp`, :option:`--pmode`, :option:`--pme` and :option:`--lookahead-slices` are all disabled, @@ -409,7 +409,7 @@ Performance Options Allow encoder to copy input x265 pictures to internal frame buffers. 
When disabled, x265 will not make an internal copy of the input picture and will work with the - application's buffers. While this allows for deeper integration, it is the responsbility + application's buffers. While this allows for deeper integration, it is the responsibility of the application to (a) ensure that the allocated picture has extra space for padding that will be done by the library, and (b) the buffers aren't recycled until the library has completed encoding this frame (which can be figured out by tracking NALs output by x265) @@ -554,7 +554,7 @@ frame counts) are only applicable to the CLI application. .. option:: --chunk-start - First frame of the chunk. Frames preceeding this in display order will + First frame of the chunk. Frames preceding this in display order will be encoded, however, they will be discarded in the bitstream. This feature can be enabled only in closed GOP structures. Default 0 (disabled). @@ -562,7 +562,7 @@ frame counts) are only applicable to the CLI application. .. option:: --chunk-end Last frame of the chunk. Frames following this in display order will be - used in taking lookahead decisions, but, they will not be encoded. + used in taking lookahead decisions, but they will not be encoded. This feature can be enabled only in closed GOP structures. Default 0 (disabled). @@ -638,7 +638,7 @@ Profile, Level, Tier If :option:`--level-idc` has been specified, --high-tier allows the support of high tier at that level. The encoder will first attempt to encode at the specified level, main tier first, turning on high tier only if - necessary and available at that level.If your requested level does not + necessary and available at that level. If your requested level does not support a High tier, high tier will not be supported. If --no-high-tier has been specified, then the encoder will attempt to encode only at the main tier. @@ -647,8 +647,8 @@ Profile, Level, Tier .. option:: --ref <1..16> Max number of L0 references to be allowed. 
This number has a linear - multiplier effect on the amount of work performed in motion search, - but will generally have a beneficial affect on compression and + multiplier effect on the amount of work performed in motion search + but will generally have a beneficial effect on compression and distortion. Note that x265 allows up to 16 L0 references but the HEVC @@ -668,7 +668,7 @@ Profile, Level, Tier .. option:: --allow-non-conformance, --no-allow-non-conformance Allow libx265 to generate a bitstream with profile and level NONE. - By default it will abort any encode which does not meet strict level + By default, it will abort any encode which does not meet strict level compliance. The two most likely causes for non-conformance are :option:`--ctu` being too small, :option:`--ref` being too high, or the bitrate o
Re: [x265] [PATCH] Fixed some of the wrongly represented cli parameters in the docs related to --colormatrix and --videoformat.
Hi Aruna, Thanks for the review. Please find my response in-lined below. I will re-send the patch incorporating the changes suggested. Regards, Praveen On Thu, Apr 23, 2020 at 4:11 PM Aruna Matheswaran < ar...@multicorewareinc.com> wrote: > > > On Mon, Apr 20, 2020 at 10:38 AM Praveen Kumar Karadugattu < > praveenku...@multicorewareinc.com> wrote: > >> From 9207e6db602ea218aca7d03075339009429280ef Mon Sep 17 00:00:00 2001 >> From: Praveen Karadugattu >> Date: Fri, 17 Apr 2020 19:59:03 +0530 >> Subject: [PATCH] Fixed some of the wrongly represented cli parameters in >> the >> docs related to --colormatrix and --videoformat. >> --- >> doc/reST/cli.rst| 8 >> source/common/param.cpp | 12 ++-- >> source/x265cli.cpp | 6 +++--- >> 3 files changed, 13 insertions(+), 13 deletions(-) >> diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst >> index 1e2765d..0fad45b 100644 >> --- a/doc/reST/cli.rst >> +++ b/doc/reST/cli.rst >> @@ -2154,7 +2154,7 @@ VUI fields must be manually specified. >> 2. ntsc >> 3. secam >> 4. mac >> - 5. undefined >> + 5. unknown >> >> .. option:: --range >> >> @@ -2207,15 +2207,15 @@ VUI fields must be manually specified. >> Specify color matrix setting i.e set the matrix coefficients used in >> deriving the luma and chroma. Default undefined (not signaled) >> >> - 0. GBR >> + 0. gbr >> 1. bt709 >> - 2. undef >> + 2. unknown >> 3. **reserved** >> 4. fcc >> 5. bt470bg >> 6. smpte170m >> 7. smpte240m >> - 8. YCgCo >> + 8. ycgco >> 9. bt2020nc >> 10. bt2020c >> 11. smpte2085 >> diff --git a/source/common/param.cpp b/source/common/param.cpp >> index 908400f..b4965ce 100644 >> --- a/source/common/param.cpp >> +++ b/source/common/param.cpp >> @@ -1122,7 +1122,7 @@ int x265_param_parse(x265_param* p, const char* >> name, const char* value) >> p->vui.bEnableOverscanInfoPresentFlag = 1; >> p->vui.bEnableOverscanAppropriateFlag = 1; >> } >> -else if (!strcmp(value, "undef")) >> +else if (!strcmp(value, "unknown")) >> > [AM] You are changing the API here. 
Please update X265_BUILD. > btw, there is no discrepancy between CLI values and the document of the > "overscan" option. > Did you modify this to have uniform values across VUI options? If so, can > we introduce "unknown" as an alias for "undef" and deprecate "undef" in the > next version? > [PK] I will increment the X265_BUILD in CMakeLists.txt. "overscan" has an "undef" option. I have made it "unknown" to maintain uniformity. Yes, we need to deprecate "undef" and use "unknown" everywhere instead. > p->vui.bEnableOverscanInfoPresentFlag = 0; >> else >> bError = true; >> @@ -1643,23 +1643,23 @@ int x265_check_params(x265_param* param) >>"Sample Aspect Ratio height must be greater than 0"); >> CHECK(param->vui.videoFormat < 0 || param->vui.videoFormat > 5, >>"Video Format must be component," >> - " pal, ntsc, secam, mac or undef"); >> + " pal, ntsc, secam, mac or unknown"); >> CHECK(param->vui.colorPrimaries < 0 >>|| param->vui.colorPrimaries > 12 >>|| param->vui.colorPrimaries == 3, >> - "Color Primaries must be undef, bt709, bt470m," >> + "Color Primaries must be unknown, bt709, bt470m," >>" bt470bg, smpte170m, smpte240m, film, bt2020, smpte-st-428, >> smpte-rp-431 or smpte-eg-432"); >> CHECK(param->vui.transferCharacteristics < 0 >>|| param->vui.transferCharacteristics > 18 >>|| param->vui.transferCharacteristics == 3, >> - "Transfer Characteristics must be undef, bt709, bt470m, >> bt470bg," >> + "Transfer Characteristics must be unknown, bt709, bt470m, >> bt470bg," >>" smpte170m, smpte240m, linear, log100, log316, iec61966-2-4, >> bt1361e," >>" iec61966-2-1, bt2020-10, bt2020-12, smpte-st-2084, >> smpte-st-428 or arib-std-b67"); >> CHECK(param->vui.matrixCoeffs < 0 >>|| param->vui.matrixCoeffs > 14 >>|| param->vui.matrixCoeffs == 3, >> - "Matrix Coeffici
[x265] [PATCH] Fixed some of the wrongly represented cli parameters in the docs related to --colormatrix and --videoformat.
>From 9207e6db602ea218aca7d03075339009429280ef Mon Sep 17 00:00:00 2001 From: Praveen Karadugattu Date: Fri, 17 Apr 2020 19:59:03 +0530 Subject: [PATCH] Fixed some of the wrongly represented cli parameters in the docs related to --colormatrix and --videoformat. --- doc/reST/cli.rst| 8 source/common/param.cpp | 12 ++-- source/x265cli.cpp | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst index 1e2765d..0fad45b 100644 --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -2154,7 +2154,7 @@ VUI fields must be manually specified. 2. ntsc 3. secam 4. mac - 5. undefined + 5. unknown .. option:: --range @@ -2207,15 +2207,15 @@ VUI fields must be manually specified. Specify color matrix setting i.e set the matrix coefficients used in deriving the luma and chroma. Default undefined (not signaled) - 0. GBR + 0. gbr 1. bt709 - 2. undef + 2. unknown 3. **reserved** 4. fcc 5. bt470bg 6. smpte170m 7. smpte240m - 8. YCgCo + 8. ycgco 9. bt2020nc 10. bt2020c 11. 
smpte2085 diff --git a/source/common/param.cpp b/source/common/param.cpp index 908400f..b4965ce 100644 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -1122,7 +1122,7 @@ int x265_param_parse(x265_param* p, const char* name, const char* value) p->vui.bEnableOverscanInfoPresentFlag = 1; p->vui.bEnableOverscanAppropriateFlag = 1; } -else if (!strcmp(value, "undef")) +else if (!strcmp(value, "unknown")) p->vui.bEnableOverscanInfoPresentFlag = 0; else bError = true; @@ -1643,23 +1643,23 @@ int x265_check_params(x265_param* param) "Sample Aspect Ratio height must be greater than 0"); CHECK(param->vui.videoFormat < 0 || param->vui.videoFormat > 5, "Video Format must be component," - " pal, ntsc, secam, mac or undef"); + " pal, ntsc, secam, mac or unknown"); CHECK(param->vui.colorPrimaries < 0 || param->vui.colorPrimaries > 12 || param->vui.colorPrimaries == 3, - "Color Primaries must be undef, bt709, bt470m," + "Color Primaries must be unknown, bt709, bt470m," " bt470bg, smpte170m, smpte240m, film, bt2020, smpte-st-428, smpte-rp-431 or smpte-eg-432"); CHECK(param->vui.transferCharacteristics < 0 || param->vui.transferCharacteristics > 18 || param->vui.transferCharacteristics == 3, - "Transfer Characteristics must be undef, bt709, bt470m, bt470bg," + "Transfer Characteristics must be unknown, bt709, bt470m, bt470bg," " smpte170m, smpte240m, linear, log100, log316, iec61966-2-4, bt1361e," " iec61966-2-1, bt2020-10, bt2020-12, smpte-st-2084, smpte-st-428 or arib-std-b67"); CHECK(param->vui.matrixCoeffs < 0 || param->vui.matrixCoeffs > 14 || param->vui.matrixCoeffs == 3, - "Matrix Coefficients must be undef, bt709, fcc, bt470bg, smpte170m," - " smpte240m, GBR, YCgCo, bt2020nc, bt2020c, smpte-st-2085, chroma-nc, chroma-c or ictcp"); + "Matrix Coefficients must be unknown, bt709, fcc, bt470bg, smpte170m," + " smpte240m, gbr, ycgco, bt2020nc, bt2020c, smpte-st-2085, chroma-nc, chroma-c or ictcp"); CHECK(param->vui.chromaSampleLocTypeTopField < 0 || 
param->vui.chromaSampleLocTypeTopField > 5, "Chroma Sample Location Type Top Field must be 0-5"); diff --git a/source/x265cli.cpp b/source/x265cli.cpp index 05f16b7..4d91b99 100644 --- a/source/x265cli.cpp +++ b/source/x265cli.cpp @@ -291,8 +291,8 @@ namespace X265_NS { H0(" 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,\n"); H0(" 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of . Default %d\n", param->vui.aspectRatioIdc); H1(" --display-window Describe overscan cropping region as 'left,top,right,bottom' in pixels\n"); -H1(" --overscanSpecify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n"); -H0(" --videoformat Specify video format from undef, component, pal, ntsc, secam, mac. Default undef\n"); +H1(" --overscanSpecify whether it is appropriate for decoder to show cropped region: unknown, show or crop. Default undef\n"); +H0(" --videoformat Specify video format from unknown, component, pal, ntsc, secam, mac. Default undef\n"); H0(" --range
Re: [x265] [PATCH 1 of 1] Feature: Histogram Based Scene Cut Detection
This patch has been reviewed and looks good to me. Regards, Praveen On Mon, Nov 25, 2019 at 6:53 PM Srikanth Kurapati < srikanth.kurap...@multicorewareinc.com> wrote: > # HG changeset patch > # User Srikanth Kurapati > # Date 1573649311 -19800 > # Wed Nov 13 18:18:31 2019 +0530 > # Node ID 97a9eca413d83cd03ae0fa95957160bdf70c170b > # Parent 04db2bfee5d628d931d1407355b909ac8ff1c898 > Histogram Based Scene Cut Detection. > > This patch does the following. > 1.Finds scene cuts by thresholding normalized SAD of edge and chroma > histograms. > 2.Add option "--hist-scenecut" to enable histogram based scene cut > detection. > 3.Add option "--hist-threshold" to provide threshold for determining scene > cuts. > 4.Optimizes frame duplication by reusing normalized SAD to mark duplicate > frames. > > diff -r 04db2bfee5d6 -r 97a9eca413d8 doc/reST/cli.rst > --- a/doc/reST/cli.rst Thu Oct 31 16:23:27 2019 +0530 > +++ b/doc/reST/cli.rst Wed Nov 13 18:18:31 2019 +0530 > @@ -1426,7 +1426,20 @@ > This value represents the percentage difference between the inter cost > and > intra cost of a frame used in scenecut detection. For example, a value > of 5 indicates, > if the inter cost of a frame is greater than or equal to 95 percent of > the intra cost of the frame, > - then detect this frame as scenecut. Values between 5 and 15 are > recommended. Default 5. > + then detect this frame as scenecut. Values between 5 and 15 are > recommended. Default 5. > + > +.. option:: --hist-scenecut, --no-hist-scenecut > + > + Indicates that scenecuts need to be detected using luma edge and chroma > histograms. > + option: `--hist-scenecut` enables scenecut detection using the > histograms and disables the default scene cut algorithm. > + option: `--no-hist-scenecut` disables histogram based scenecut algorithm. > + > +.. option:: --hist-threshold <0.0..2.0> > + > + This value represents the threshold for normalized SAD of edge > histograms used in scenecut detection. 
> + This requires option: `--hist-scenecut` to be enabled. For example, a > value of 0.2 indicates that a frame with normalized SAD value > + greater than 0.2 against the previous frame as scenecut. > + Default 0.01. > > .. option:: --radl > > diff -r 04db2bfee5d6 -r 97a9eca413d8 source/CMakeLists.txt > --- a/source/CMakeLists.txt Thu Oct 31 16:23:27 2019 +0530 > +++ b/source/CMakeLists.txt Wed Nov 13 18:18:31 2019 +0530 > @@ -29,7 +29,7 @@ > option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) > mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) > # X265_BUILD must be incremented each time the public API is changed > -set(X265_BUILD 182) > +set(X265_BUILD 183) > configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" > "${PROJECT_BINARY_DIR}/x265.def") > configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" > diff -r 04db2bfee5d6 -r 97a9eca413d8 source/common/common.h > --- a/source/common/common.h Thu Oct 31 16:23:27 2019 +0530 > +++ b/source/common/common.h Wed Nov 13 18:18:31 2019 +0530 > @@ -129,12 +129,16 @@ > typedef uint64_t sum2_t; > typedef uint64_t pixel4; > typedef int64_t ssum2_t; > +#define HISTOGRAM_BINS 1024 > +#define SHIFT 1 > #else > typedef uint8_t pixel; > typedef uint16_t sum_t; > typedef uint32_t sum2_t; > typedef uint32_t pixel4; > typedef int32_t ssum2_t; // Signed sum > +#define HISTOGRAM_BINS 256 > +#define SHIFT 0 > #endif // if HIGH_BIT_DEPTH > > #if X265_DEPTH < 10 > diff -r 04db2bfee5d6 -r 97a9eca413d8 source/common/param.cpp > --- a/source/common/param.cpp Thu Oct 31 16:23:27 2019 +0530 > +++ b/source/common/param.cpp Wed Nov 13 18:18:31 2019 +0530 > @@ -167,6 +167,8 @@ > param->bFrameAdaptive = X265_B_ADAPT_TRELLIS; > param->bBPyramid = 1; > param->scenecutThreshold = 40; /* Magic number pulled in from x264 */ > +param->edgeTransitionThreshold = 0.01; > +param->bHistBasedSceneCut = 0; > param->lookaheadSlices = 8; > param->lookaheadThreads = 0; > param->scenecutBias = 5.0; > @@ -572,6 +574,7 @@ > 
param->bframes = 0; > param->lookaheadDepth = 0; > param->scenecutThreshold = 0; > +param->bHistBasedSceneCut = 0; > param->rc.cuTree = 0; > param->frameNumThreads = 1; > } > @@ -920,12 +923,13 @@ > OPT("lookahead-slices") p->lookaheadSlices = atoi(value); > OPT("scenecut") > { > -p->scenecutThreshold = atobool(value); > -if (bError || p->scenecutThr
[x265] [x265 PATCH] Decoupled the non-"medium" presets from the effect of new default parameters committed under 4583000db964
# HG changeset patch # User praveen_karadugattu # Date 1560507248 -19800 # Fri Jun 14 15:44:08 2019 +0530 # Node ID 6766973bc652a7a2a550f539f03248ee54e3b312 # Parent a46ded2c14116af1cafacdc1fb98be43259dc7d6 Decoupled the non-"medium" presets from the effect of new default parameters committed under commit-id 4583000db964. diff -r a46ded2c1411 -r 6766973bc652 source/common/param.cpp --- a/source/common/param.cpp Tue May 28 14:01:54 2019 +0800 +++ b/source/common/param.cpp Fri Jun 14 15:44:08 2019 +0530 @@ -361,6 +361,8 @@ if (!strcmp(preset, "ultrafast")) { +param->maxNumMergeCand = 2; +param->bIntraInBFrames = 0; param->lookaheadDepth = 5; param->scenecutThreshold = 0; // disable lookahead param->maxCUSize = 32; @@ -369,7 +371,6 @@ param->bFrameAdaptive = 0; param->subpelRefine = 0; param->searchMethod = X265_DIA_SEARCH; -param->bEnableEarlySkip = 1; param->bEnableSAO = 0; param->bEnableSignHiding = 0; param->bEnableWeightedPred = 0; @@ -384,12 +385,13 @@ } else if (!strcmp(preset, "superfast")) { +param->maxNumMergeCand = 2; +param->bIntraInBFrames = 0; param->lookaheadDepth = 10; param->maxCUSize = 32; param->bframes = 3; param->bFrameAdaptive = 0; param->subpelRefine = 1; -param->bEnableEarlySkip = 1; param->bEnableWeightedPred = 0; param->rdLevel = 2; param->maxNumReferences = 1; @@ -403,10 +405,12 @@ } else if (!strcmp(preset, "veryfast")) { +param->maxNumMergeCand = 2; +param->limitReferences = 3; +param->bIntraInBFrames = 0; param->lookaheadDepth = 15; param->bFrameAdaptive = 0; param->subpelRefine = 1; -param->bEnableEarlySkip = 1; param->rdLevel = 2; param->maxNumReferences = 2; param->rc.qgSize = 32; @@ -414,15 +418,21 @@ } else if (!strcmp(preset, "faster")) { +param->maxNumMergeCand = 2; +param->limitReferences = 3; +param->bIntraInBFrames = 0; param->lookaheadDepth = 15; param->bFrameAdaptive = 0; -param->bEnableEarlySkip = 1; param->rdLevel = 2; param->maxNumReferences = 2; param->bEnableFastIntra = 1; } else if (!strcmp(preset, "fast")) { 
+param->maxNumMergeCand = 2; +param->limitReferences = 3; +param->bEnableEarlySkip = 0; +param->bIntraInBFrames = 0; param->lookaheadDepth = 15; param->bFrameAdaptive = 0; param->rdLevel = 2; @@ -435,13 +445,15 @@ } else if (!strcmp(preset, "slow")) { +param->limitReferences = 3; +param->bEnableEarlySkip = 0; +param->bIntraInBFrames = 0; param->bEnableRectInter = 1; param->lookaheadDepth = 25; param->rdLevel = 4; param->rdoqLevel = 2; param->psyRdoq = 1.0; param->subpelRefine = 3; -param->maxNumMergeCand = 3; param->searchMethod = X265_STAR_SEARCH; param->maxNumReferences = 4; param->limitModes = 1; @@ -449,6 +461,7 @@ } else if (!strcmp(preset, "slower")) { +param->bEnableEarlySkip = 0; param->bEnableWeightedBiPred = 1; param->bEnableAMP = 1; param->bEnableRectInter = 1; @@ -463,14 +476,13 @@ param->maxNumMergeCand = 4; param->searchMethod = X265_STAR_SEARCH; param->maxNumReferences = 5; -param->limitReferences = 1; param->limitModes = 1; -param->bIntraInBFrames = 1; param->lookaheadSlices = 0; // disabled for best quality param->limitTU = 4; } else if (!strcmp(preset, "veryslow")) { +param->bEnableEarlySkip = 0; param->bEnableWeightedBiPred = 1; param->bEnableAMP = 1; param->bEnableRectInter = 1; @@ -487,12 +499,12 @@ param->maxNumReferences = 5; param->limitReferences = 0; param->limitModes = 0; -param->bIntraInBFrames = 1; param->lookaheadSlices = 0; // disabled for best quality param->limitTU = 0; } else if (!strcmp(preset, "placebo")) { +param->bEnableEarlySkip = 0; param->bEnableWeightedBiPred = 1; param->bEnableAMP = 1; param->bEnableRectInter = 1; @@ -511,7 +523,6 @@ param->bEnableRecursionSkip = 0;
[x265] [x265 PATCH] Changed the params max-merge to 3, b-intra enabled, limit-ref and early-skip enabled for improved performance for high res
# HG changeset patch # User praveen_karadugattu # Date 1558937905 -19800 # Mon May 27 11:48:25 2019 +0530 # Node ID 4583000db964b8b942c55f532216a3696fcf69ea # Parent b9bef1a4c34a82ea685ed76ebdd642c266bffcc3 Changed the params max-merge to 3, b-intra enabled, limit-ref and early-skip enabled for improved performance for high res diff --git a/source/common/param.cpp b/source/common/param.cpp --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -185,12 +185,12 @@ param->searchMethod = X265_HEX_SEARCH; param->subpelRefine = 2; param->searchRange = 57; -param->maxNumMergeCand = 2; -param->limitReferences = 3; + param->maxNumMergeCand = 3; + param->limitReferences = 1; param->limitModes = 0; param->bEnableWeightedPred = 1; param->bEnableWeightedBiPred = 0; -param->bEnableEarlySkip = 0; + param->bEnableEarlySkip = 1; param->bEnableRecursionSkip = 1; param->bEnableAMP = 0; param->bEnableRectInter = 0; @@ -225,7 +225,7 @@ param->analysisReuseFileName = NULL; param->analysisSave = NULL; param->analysisLoad = NULL; -param->bIntraInBFrames = 0; + param->bIntraInBFrames = 1; param->bLossless = 0; param->bCULossless = 0; param->bEnableTemporalSubLayers = 0; ModifiedDefaultParams.diff Description: Binary data ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] ratecontrol.cpp: nits - fix for coding style
# HG changeset patch # User Praveen Tiwari # Date 1541569020 -19800 # Wed Nov 07 11:07:00 2018 +0530 # Branch stable # Node ID 5177401a9d4c8b577c4502538037e1cd0d2fae68 # Parent 26b4debfab1af7d5e080902b700d6124fafa8ebd ratecontrol.cpp: nits - fix for coding style diff -r 26b4debfab1a -r 5177401a9d4c source/encoder/ratecontrol.cpp --- a/source/encoder/ratecontrol.cppThu Nov 01 18:47:40 2018 +0530 +++ b/source/encoder/ratecontrol.cppWed Nov 07 11:07:00 2018 +0530 @@ -381,9 +381,9 @@ m_isGrainEnabled = false; if(m_param->rc.bEnableGrain) // tune for grainy content OR equal p-b frame sizes -m_isGrainEnabled = true; +m_isGrainEnabled = true; for (int i = 0; i < 3; i++) -m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN); +m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN); m_avgPFrameQp = 0 ; /* 720p videos seem to be a good cutoff for cplxrSum */ @@ -1253,9 +1253,7 @@ m_isSceneTransition = false; if (rce->encodeOrder < m_lastPredictorReset + m_param->frameNumThreads) -{ rce->rowPreds[0][0].count = 0; -} rce->bLastMiniGopBFrame = curFrame->m_lowres.bLastMiniGopBFrame; rce->bufferRate = m_bufferRate; @@ -1458,12 +1456,8 @@ if (!rce->keptAsRef) q *= fabs(m_param->rc.pbFactor); } -else if (rce->sliceType == P_SLICE - && m_lastNonBPictType == P_SLICE - && rce->coeffBits == 0) -{ +else if (rce->sliceType == P_SLICE && m_lastNonBPictType == P_SLICE && rce->coeffBits == 0) q = lastPqScale; -} /* last qscale / qdiff stuff */ if (m_lastNonBPictType == rce->sliceType && @@ -1664,9 +1658,7 @@ m_movingAvgSum += m_satdCostWindow[addPos]; } else if (m_sliderPos == s_slidingWindowFrames) -{ m_movingAvgSum += m_satdCostWindow[addPos]; -} else if (m_sliderPos > 0) { m_movingAvgSum += m_satdCostWindow[addPos]; @@ -1964,9 +1956,7 @@ } } else if (m_qCompress != 1 && m_param->rc.rateControlMode == X265_RC_CRF) -{ q = x265_qp2qScale(CRF_INIT_QP) / 
fabs(m_param->rc.ipFactor); -} else if (m_framesDone == 0 && !m_isVbv && m_param->rc.rateControlMode == X265_RC_ABR) { /* for ABR alone, clip the first I frame qp */ # HG changeset patch # User Praveen Tiwari # Date 1541569020 -19800 # Wed Nov 07 11:07:00 2018 +0530 # Branch stable # Node ID 5177401a9d4c8b577c4502538037e1cd0d2fae68 # Parent 26b4debfab1af7d5e080902b700d6124fafa8ebd ratecontrol.cpp: nits - fix for coding style diff -r 26b4debfab1a -r 5177401a9d4c source/encoder/ratecontrol.cpp --- a/source/encoder/ratecontrol.cpp Thu Nov 01 18:47:40 2018 +0530 +++ b/source/encoder/ratecontrol.cpp Wed Nov 07 11:07:00 2018 +0530 @@ -381,9 +381,9 @@ m_isGrainEnabled = false; if(m_param->rc.bEnableGrain) // tune for grainy content OR equal p-b frame sizes -m_isGrainEnabled = true; +m_isGrainEnabled = true; for (int i = 0; i < 3; i++) -m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN); +m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? 
CRF_INIT_QP : ABR_INIT_QP_MIN); m_avgPFrameQp = 0 ; /* 720p videos seem to be a good cutoff for cplxrSum */ @@ -1253,9 +1253,7 @@ m_isSceneTransition = false; if (rce->encodeOrder < m_lastPredictorReset + m_param->frameNumThreads) -{ rce->rowPreds[0][0].count = 0; -} rce->bLastMiniGopBFrame = curFrame->m_lowres.bLastMiniGopBFrame; rce->bufferRate = m_bufferRate; @@ -1458,12 +1456,8 @@ if (!rce->keptAsRef) q *= fabs(m_param->rc.pbFactor); } -else if (rce->sliceType == P_SLICE - && m_lastNonBPictType == P_SLICE - && rce->coeffBits == 0) -{ +else if (rce->sliceType == P_SLICE && m_lastNonBPictType == P_SLICE && rce->coeffBits == 0) q = lastPqScale; -} /* last qscale / qdiff stuff */ if (m_lastNonBPictType == rce->sliceType && @@ -1664,9 +1658,7 @@ m_movingAvgSum += m_satdCostWindow[addPos]; } else if (m_sliderPos == s_slidingWindowFrames) -{ m_movingAvgSum += m_satdCostWindow[addPos]; -} else if (m_sliderPos > 0) { m_movingAvgSum += m_satdCostWindow[addPos]; @@ -19
Re: [x265] [PATCH] encoder: Do not include CLL SEI message if empty
Hello Vittorio, Sorry for the late reply; all of us were on leave due to the Diwali festival in India. Thanks for the patch; I will run some basic tests and push the patch. Regards, Praveen On Wed, Nov 7, 2018 at 12:35 AM Vittorio Giovara wrote: > > > On Thu, Nov 1, 2018 at 5:34 PM Vittorio Giovara < > vittorio.giov...@gmail.com> wrote: > >> Some devices render out-of-luminance pixels incorrectly otherwise. >> >> --- >> source/encoder/encoder.cpp | 11 +++ >> 1 file changed, 7 insertions(+), 4 deletions(-) >> >> diff -r fd517ae68f93 source/encoder/encoder.cpp >> --- a/source/encoder/encoder.cppTue Sep 25 16:02:31 2018 +0530 >> +++ b/source/encoder/encoder.cppThu Nov 01 17:27:51 2018 -0400 >> @@ -2381,10 +2381,13 @@ >> >> if (m_param->bEmitHDRSEI) >> { >> -SEIContentLightLevel cllsei; >> -cllsei.max_content_light_level = m_param->maxCLL; >> -cllsei.max_pic_average_light_level = m_param->maxFALL; >> -cllsei.writeSEImessages(bs, m_sps, NAL_UNIT_PREFIX_SEI, list, >> m_param->bSingleSeiNal); >> +if (m_emitCLLSEI) >> +{ >> +SEIContentLightLevel cllsei; >> +cllsei.max_content_light_level = m_param->maxCLL; >> +cllsei.max_pic_average_light_level = m_param->maxFALL; >> +cllsei.writeSEImessages(bs, m_sps, NAL_UNIT_PREFIX_SEI, >> list, m_param->bSingleSeiNal); >> +} >> >> if (m_param->masteringDisplayColorVolume) >> { >> -- >> Vittorio >> > > ping > -- > Vittorio > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] REPORT_SPEEDUP: correct the description
# HG changeset patch # User Praveen Tiwari # Date 1541078260 -19800 # Thu Nov 01 18:47:40 2018 +0530 # Branch stable # Node ID 26b4debfab1af7d5e080902b700d6124fafa8ebd # Parent 471726d3a0462739ff8e3518eb1a1e8a01de4e8d REPORT_SPEEDUP: correct the description diff -r 471726d3a046 -r 26b4debfab1a source/test/testharness.h --- a/source/test/testharness.h Wed Oct 31 16:35:48 2018 +0530 +++ b/source/test/testharness.h Thu Nov 01 18:47:40 2018 +0530 @@ -93,9 +93,9 @@ #define BENCH_RUNS 2000 -// Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc -// and discards invalid times. Repeats 1000 times to get a good average. Then measures -// the C reference with fewer runs and reports X factor and average cycles. +/* Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc + * and discards invalid times. Repeats BENCH_RUNS times to get a good average. + * Then measures the C reference with BENCH_RUNS / 4 runs and reports X factor and average cycles.*/ #define REPORT_SPEEDUP(RUNOPT, RUNREF, ...) \ { \ uint32_t cycles = 0; int runs = 0; \ # HG changeset patch # User Praveen Tiwari # Date 1541078260 -19800 # Thu Nov 01 18:47:40 2018 +0530 # Branch stable # Node ID 26b4debfab1af7d5e080902b700d6124fafa8ebd # Parent 471726d3a0462739ff8e3518eb1a1e8a01de4e8d REPORT_SPEEDUP: correct the description diff -r 471726d3a046 -r 26b4debfab1a source/test/testharness.h --- a/source/test/testharness.h Wed Oct 31 16:35:48 2018 +0530 +++ b/source/test/testharness.h Thu Nov 01 18:47:40 2018 +0530 @@ -93,9 +93,9 @@ #define BENCH_RUNS 2000 -// Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc -// and discards invalid times. Repeats 1000 times to get a good average. Then measures -// the C reference with fewer runs and reports X factor and average cycles. +/* Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc + * and discards invalid times. 
Repeats BENCH_RUNS times to get a good average. + * Then measures the C reference with BENCH_RUNS / 4 runs and reports X factor and average cycles.*/ #define REPORT_SPEEDUP(RUNOPT, RUNREF, ...) \ { \ uint32_t cycles = 0; int runs = 0; \ ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] fix Issue #442: linking issue on non x86 platform
# HG changeset patch # User Praveen Tiwari # Date 1540983948 -19800 # Wed Oct 31 16:35:48 2018 +0530 # Node ID f0d02ca443adf8ff90ed61552d35347ff51c8e90 # Parent fd517ae68f93dbfdd1bff45a9dd8e626523542b6 fix Issue #442: linking issue on non x86 platform diff -r fd517ae68f93 -r f0d02ca443ad source/common/cpu.cpp --- a/source/common/cpu.cpp Tue Sep 25 16:02:31 2018 +0530 +++ b/source/common/cpu.cpp Wed Oct 31 16:35:48 2018 +0530 @@ -127,6 +127,7 @@ { return(enable512); } + uint32_t cpu_detect(bool benableavx512 ) { diff -r fd517ae68f93 -r f0d02ca443ad source/common/quant.cpp --- a/source/common/quant.cpp Tue Sep 25 16:02:31 2018 +0530 +++ b/source/common/quant.cpp Wed Oct 31 16:35:48 2018 +0530 @@ -723,6 +723,7 @@ X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); uint32_t blkPos = codeParams.scan[scanPosBase]; +#if X265_ARCH_X86 bool enable512 = detect512(); if (enable512) primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); @@ -731,6 +732,10 @@ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , ,blkPos); primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); } +#else +primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos); +primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); +#endif } } else @@ -805,8 +810,8 @@ uint32_t blkPos = codeParams.scan[scanPosBase]; if (usePsyMask) { +#if X265_ARCH_X86 bool enable512 = detect512(); - if (enable512) primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); else @@ -814,6 +819,10 @@ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos); primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); } +#else +primitives.cu[log2TrSize - 
2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos); +primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); +#endif blkPos = codeParams.scan[scanPosBase]; for (int y = 0; y < MLS_CG_SIZE; y++) { # HG changeset patch # User Praveen Tiwari # Date 1540983948 -19800 # Wed Oct 31 16:35:48 2018 +0530 # Node ID f0d02ca443adf8ff90ed61552d35347ff51c8e90 # Parent fd517ae68f93dbfdd1bff45a9dd8e626523542b6 fix Issue #442: linking issue on non x86 platform diff -r fd517ae68f93 -r f0d02ca443ad source/common/cpu.cpp --- a/source/common/cpu.cpp Tue Sep 25 16:02:31 2018 +0530 +++ b/source/common/cpu.cpp Wed Oct 31 16:35:48 2018 +0530 @@ -127,6 +127,7 @@ { return(enable512); } + uint32_t cpu_detect(bool benableavx512 ) { diff -r fd517ae68f93 -r f0d02ca443ad source/common/quant.cpp --- a/source/common/quant.cpp Tue Sep 25 16:02:31 2018 +0530 +++ b/source/common/quant.cpp Wed Oct 31 16:35:48 2018 +0530 @@ -723,6 +723,7 @@ X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); uint32_t blkPos = codeParams.scan[scanPosBase]; +#if X265_ARCH_X86 bool enable512 = detect512(); if (enable512) primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); @@ -731,6 +732,10 @@ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , ,blkPos); primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); } +#else +primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos); +primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); +#endif } } else @@ -805,8 +810,8 @@ uint32_t blkPos = codeParams.scan[scanPosBase]; if (usePsyMask) { +#if X265_ARCH_X86 bool enable512 = detect512(); - if (enable512) primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); else @@ 
-814,6 +819,10 @@ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos); primitives.cu[log2TrSize
Re: [x265] [PATCH] fix Issue #442: linking issue on non x86 platform
Thanks! I messed up the syntax. On Wed, Oct 31, 2018 at 5:45 PM Andrey Semashev wrote: > On 10/31/18 2:33 PM, prav...@multicorewareinc.com wrote: > > # HG changeset patch > > # User Praveen Tiwari > > # Date 1540983948 -19800 > > # Wed Oct 31 16:35:48 2018 +0530 > > # Node ID 1c878790edea64186edabcd40fb3df121f536311 > > # Parent fd517ae68f93dbfdd1bff45a9dd8e626523542b6 > > fix Issue #442: linking issue on non x86 platform > > > > diff -r fd517ae68f93 -r 1c878790edea source/common/cpu.cpp > > --- a/source/common/cpu.cpp Tue Sep 25 16:02:31 2018 +0530 > > +++ b/source/common/cpu.cpp Wed Oct 31 16:35:48 2018 +0530 > > @@ -127,6 +127,7 @@ > > { > > return(enable512); > > } > > + > > uint32_t cpu_detect(bool benableavx512 ) > > { > > > > diff -r fd517ae68f93 -r 1c878790edea source/common/quant.cpp > > --- a/source/common/quant.cpp Tue Sep 25 16:02:31 2018 +0530 > > +++ b/source/common/quant.cpp Wed Oct 31 16:35:48 2018 +0530 > > @@ -723,6 +723,7 @@ > > X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff > failure\n"); > > uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); > > uint32_t blkPos = codeParams.scan[scanPosBase]; > > +#if X265_ARCH_X86 > > bool enable512 = detect512(); > > if (enable512) > > primitives.cu[log2TrSize - > 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, > , , , blkPos); > > @@ -731,6 +732,10 @@ > > primitives.cu[log2TrSize - > 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , > ,blkPos); > > primitives.cu[log2TrSize - > 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, > , , , blkPos); > > } > > +#elif > > #else? Everywhere else, too. 
> > > +primitives.cu[log2TrSize - > 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , > , blkPos); > > +primitives.cu[log2TrSize - > 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, > , , , blkPos); > > +#endif > > } > > } > > else > > @@ -805,8 +810,8 @@ > > uint32_t blkPos = codeParams.scan[scanPosBase]; > > if (usePsyMask) > > { > > +#if X265_ARCH_X86 > > bool enable512 = detect512(); > > - > > if (enable512) > > primitives.cu[log2TrSize - > 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, > , , , blkPos); > > else > > @@ -814,6 +819,10 @@ > > primitives.cu[log2TrSize - > 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , > , blkPos); > > primitives.cu[log2TrSize - > 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, > , , , blkPos); > > } > > +#elif > > +primitives.cu[log2TrSize - > 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , > , blkPos); > > +primitives.cu[log2TrSize - > 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, > , , , blkPos); > > +#endif > > blkPos = codeParams.scan[scanPosBase]; > > for (int y = 0; y < MLS_CG_SIZE; y++) > > { > > > > > > ___ > > x265-devel mailing list > > x265-devel@videolan.org > > https://mailman.videolan.org/listinfo/x265-devel > > > > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] fix Issue #442: linking issue on non x86 platform
# HG changeset patch # User Praveen Tiwari # Date 1540983948 -19800 # Wed Oct 31 16:35:48 2018 +0530 # Node ID 1c878790edea64186edabcd40fb3df121f536311 # Parent fd517ae68f93dbfdd1bff45a9dd8e626523542b6 fix Issue #442: linking issue on non x86 platform diff -r fd517ae68f93 -r 1c878790edea source/common/cpu.cpp --- a/source/common/cpu.cpp Tue Sep 25 16:02:31 2018 +0530 +++ b/source/common/cpu.cpp Wed Oct 31 16:35:48 2018 +0530 @@ -127,6 +127,7 @@ { return(enable512); } + uint32_t cpu_detect(bool benableavx512 ) { diff -r fd517ae68f93 -r 1c878790edea source/common/quant.cpp --- a/source/common/quant.cpp Tue Sep 25 16:02:31 2018 +0530 +++ b/source/common/quant.cpp Wed Oct 31 16:35:48 2018 +0530 @@ -723,6 +723,7 @@ X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); uint32_t blkPos = codeParams.scan[scanPosBase]; +#if X265_ARCH_X86 bool enable512 = detect512(); if (enable512) primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); @@ -731,6 +732,10 @@ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , ,blkPos); primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); } +#elif +primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos); +primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); +#endif } } else @@ -805,8 +810,8 @@ uint32_t blkPos = codeParams.scan[scanPosBase]; if (usePsyMask) { +#if X265_ARCH_X86 bool enable512 = detect512(); - if (enable512) primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); else @@ -814,6 +819,10 @@ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos); primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); } +#elif +primitives.cu[log2TrSize - 
2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos); +primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); +#endif blkPos = codeParams.scan[scanPosBase]; for (int y = 0; y < MLS_CG_SIZE; y++) { # HG changeset patch # User Praveen Tiwari # Date 1540983948 -19800 # Wed Oct 31 16:35:48 2018 +0530 # Node ID 1c878790edea64186edabcd40fb3df121f536311 # Parent fd517ae68f93dbfdd1bff45a9dd8e626523542b6 fix Issue #442: linking issue on non x86 platform diff -r fd517ae68f93 -r 1c878790edea source/common/cpu.cpp --- a/source/common/cpu.cpp Tue Sep 25 16:02:31 2018 +0530 +++ b/source/common/cpu.cpp Wed Oct 31 16:35:48 2018 +0530 @@ -127,6 +127,7 @@ { return(enable512); } + uint32_t cpu_detect(bool benableavx512 ) { diff -r fd517ae68f93 -r 1c878790edea source/common/quant.cpp --- a/source/common/quant.cpp Tue Sep 25 16:02:31 2018 +0530 +++ b/source/common/quant.cpp Wed Oct 31 16:35:48 2018 +0530 @@ -723,6 +723,7 @@ X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); uint32_t blkPos = codeParams.scan[scanPosBase]; +#if X265_ARCH_X86 bool enable512 = detect512(); if (enable512) primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); @@ -731,6 +732,10 @@ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , ,blkPos); primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); } +#elif +primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos); +primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); +#endif } } else @@ -805,8 +810,8 @@ uint32_t blkPos = codeParams.scan[scanPosBase]; if (usePsyMask) { +#if X265_ARCH_X86 bool enable512 = detect512(); - if (enable512) primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , , blkPos); else @@ 
-814,6 +819,10 @@ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, , , blkPos); primitives.cu[log2TrSize
Re: [x265] Original C++ code used for sad functions' assembly code in COST_MV?
Hello Jeffrey, You can find all C primitives in the source/common folder. SAD C primitives are in source/common/pixel.cpp. Thanks, Praveen On Wed, Sep 5, 2018 at 12:23 PM, Mario *LigH* Rohkrämer wrote: > Jeffrey Chen schrieb am 04.09.2018 um 23:57: > >> Hi, I would like to configure the sad function in COST_MV for another >> platform. However, the assembly code would not be supported on the other >> platform. Where can I find the original programming language code that was >> made into the assembly language code? >> > > Hi Jeffrey. > > I'm not a developer, just guessing: > > source/encoder/motion.cpp line 234 #defines a loop. > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] nits
# HG changeset patch # User Praveen Tiwari # Date 1535537469 -19800 # Wed Aug 29 15:41:09 2018 +0530 # Node ID c4b7f40d4747c000fafc96c6331aaf312243b586 # Parent 683defcf536ad5d4e5994dc39efb48de5fec8648 nits diff -r 683defcf536a -r c4b7f40d4747 source/encoder/encoder.cpp --- a/source/encoder/encoder.cppTue Aug 28 18:57:01 2018 +0530 +++ b/source/encoder/encoder.cppWed Aug 29 15:41:09 2018 +0530 @@ -2516,7 +2516,7 @@ vui.defaultDisplayWindow.bottomOffset = m_param->vui.defDispWinBottomOffset; vui.defaultDisplayWindow.leftOffset = m_param->vui.defDispWinLeftOffset; - vui.frameFieldInfoPresentFlag = !!m_param->interlaceMode || (m_param->pictureStructure >= 0); +vui.frameFieldInfoPresentFlag = !!m_param->interlaceMode || (m_param->pictureStructure >= 0); vui.fieldSeqFlag = !!m_param->interlaceMode; vui.hrdParametersPresentFlag = m_param->bEmitHRDSEI; diff -r 683defcf536a -r c4b7f40d4747 source/encoder/sei.h --- a/source/encoder/sei.h Tue Aug 28 18:57:01 2018 +0530 +++ b/source/encoder/sei.h Wed Aug 29 15:41:09 2018 +0530 @@ -304,15 +304,15 @@ int m_preferredTransferCharacteristics; SEIAlternativeTC() { - m_payloadType = ALTERNATIVE_TRANSFER_CHARACTERISTICS; - m_payloadSize = 0; - m_preferredTransferCharacteristics = -1; - } - - void writeSEI(const SPS&) - { - WRITE_CODE(m_preferredTransferCharacteristics, 8, "Preferred transfer characteristics"); - } +m_payloadType = ALTERNATIVE_TRANSFER_CHARACTERISTICS; +m_payloadSize = 0; +m_preferredTransferCharacteristics = -1; +} + +void writeSEI(const SPS&) +{ +WRITE_CODE(m_preferredTransferCharacteristics, 8, "Preferred transfer characteristics"); +} }; } # HG changeset patch # User Praveen Tiwari # Date 1535537469 -19800 # Wed Aug 29 15:41:09 2018 +0530 # Node ID c4b7f40d4747c000fafc96c6331aaf312243b586 # Parent 683defcf536ad5d4e5994dc39efb48de5fec8648 nits diff -r 683defcf536a -r c4b7f40d4747 source/encoder/encoder.cpp --- a/source/encoder/encoder.cpp Tue Aug 28 18:57:01 2018 +0530 +++ b/source/encoder/encoder.cpp Wed Aug 29 
15:41:09 2018 +0530 @@ -2516,7 +2516,7 @@ vui.defaultDisplayWindow.bottomOffset = m_param->vui.defDispWinBottomOffset; vui.defaultDisplayWindow.leftOffset = m_param->vui.defDispWinLeftOffset; - vui.frameFieldInfoPresentFlag = !!m_param->interlaceMode || (m_param->pictureStructure >= 0); +vui.frameFieldInfoPresentFlag = !!m_param->interlaceMode || (m_param->pictureStructure >= 0); vui.fieldSeqFlag = !!m_param->interlaceMode; vui.hrdParametersPresentFlag = m_param->bEmitHRDSEI; diff -r 683defcf536a -r c4b7f40d4747 source/encoder/sei.h --- a/source/encoder/sei.h Tue Aug 28 18:57:01 2018 +0530 +++ b/source/encoder/sei.h Wed Aug 29 15:41:09 2018 +0530 @@ -304,15 +304,15 @@ int m_preferredTransferCharacteristics; SEIAlternativeTC() { - m_payloadType = ALTERNATIVE_TRANSFER_CHARACTERISTICS; - m_payloadSize = 0; - m_preferredTransferCharacteristics = -1; - } - - void writeSEI(const SPS&) - { - WRITE_CODE(m_preferredTransferCharacteristics, 8, "Preferred transfer characteristics"); - } +m_payloadType = ALTERNATIVE_TRANSFER_CHARACTERISTICS; +m_payloadSize = 0; +m_preferredTransferCharacteristics = -1; +} + +void writeSEI(const SPS&) +{ +WRITE_CODE(m_preferredTransferCharacteristics, 8, "Preferred transfer characteristics"); +} }; } ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] Patch for issue #422 - Credit to Dimitry Andric
# HG changeset patch # User Praveen Tiwari # Date 1534424221 -19800 # Thu Aug 16 18:27:01 2018 +0530 # Node ID 88ee12651e3031dc1fc2f3f6a8bbac5f67839579 # Parent cbc24109c1c849c027b5f087c6ff5f2087cb7301 Patch for issue #422 - Credit to Dimitry Andric. This is due to undefined behavior in cuTreeFix8Pack(), where a double value is cast directly to uint16_t. If the double value is negative, the resulting value from the cast is undefined. diff -r cbc24109c1c8 -r 88ee12651e30 source/common/pixel.cpp --- a/source/common/pixel.cpp Tue Aug 14 18:01:51 2018 +0530 +++ b/source/common/pixel.cpp Thu Aug 16 18:27:01 2018 +0530 @@ -922,7 +922,7 @@ static void cuTreeFix8Pack(uint16_t *dst, double *src, int count) { for (int i = 0; i < count; i++) -dst[i] = (uint16_t)(src[i] * 256.0); +dst[i] = (uint16_t)(int16_t)(src[i] * 256.0); } static void cuTreeFix8Unpack(double *dst, uint16_t *src, int count) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] Code performance issue
Hello Min, Thanks for the suggestion, we will run some tests and let you know if any change is required here. Thanks. Regards, Praveen Tiwari On Sat, Jun 2, 2018 at 9:18 AM, chen wrote: > There have series performance issues, such as, > > uint32_t sum = (uint32_t)pow((outOfBound >> 2), 2); > > Are you want to get square value from a small integer? > > > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH] threadpool.cpp: use WIN system call for popcount
It is just counting cpusPerNode, so the 64-bit number is not required, yes but I missed the fact of support on few CPUs. Lookup table based implementation could have been fastest due to better caching, but it is not used frequently so we can keep as it is. Thanks. On Thu, May 3, 2018 at 11:24 PM, Andrey Semashev <andrey.semas...@gmail.com> wrote: > On Thu, May 3, 2018 at 7:37 PM, Pradeep Ramachandran > <prad...@multicorewareinc.com> wrote: > > > > On Thu, May 3, 2018 at 2:23 PM, <prav...@multicorewareinc.com> wrote: > >> > >> # HG changeset patch > >> # User Praveen Tiwari <prav...@multicorewareinc.com> > >> # Date 1525328839 -19800 > >> # Thu May 03 11:57:19 2018 +0530 > >> # Branch stable > >> # Node ID 9cbb2aadcca3a2f7a308ea1dc792fb817bcc5b51 > >> # Parent 69aafa6d70ad4e151f4590766c6b125621c5d007 > >> threadpool.cpp: use WIN system call for popcount > > > > > > Unless this fixes a known bug, I don't want to push this directly into > > stable. Syscalls are notorious especially when working with older > versions > > of the OS. > > I would rather push this into default and allow users to test that this > > works with all kinds of systems and then merge with stable once the > answer > > is known. > > Does this fix a specific issue on some platform, or improve performance? > > The comment is not quite right, __popcnt is not a syscall but an > MSVC-specific intrinsic. > > https://msdn.microsoft.com/en-us/library/bb385231.aspx > > The equivalent gcc intrinsic is __builtin_popcount and friends. > > I think, the patch is buggy because the relevant field is a 64-bit > integer on 64-bit Windows and __popcnt is 32-bit. > > Note also that the popcount instruction only available in ABM ISA > extension. In Intel CPUs it is available since Nehalem. 
> > >> diff -r 69aafa6d70ad -r 9cbb2aadcca3 source/common/threadpool.cpp > >> --- a/source/common/threadpool.cpp Wed May 02 15:15:05 2018 +0530 > >> +++ b/source/common/threadpool.cpp Thu May 03 11:57:19 2018 +0530 > >> @@ -71,21 +71,6 @@ > >> # define strcasecmp _stricmp > >> #endif > >> > >> -#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 > >> -const uint64_t m1 = 0x; //binary: 0101... > >> -const uint64_t m2 = 0x; //binary: 00110011.. > >> -const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary: 4 zeros, 4 ones ... > >> -const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power > of > >> 0,1,2,3... > >> - > >> -static int popCount(uint64_t x) > >> -{ > >> -x -= (x >> 1) & m1; > >> -x = (x & m2) + ((x >> 2) & m2); > >> -x = (x + (x >> 4)) & m3; > >> -return (x * h01) >> 56; > >> -} > >> -#endif > >> - > >> namespace X265_NS { > >> // x265 private namespace > >> > >> @@ -274,7 +259,7 @@ > >> for (int i = 0; i < numNumaNodes; i++) > >> { > >> GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer); > >> -cpusPerNode[i] = popCount(groupAffinityPointer->Mask); > >> +cpusPerNode[i] = __popcnt(static_cast >> int>(groupAffinityPointer->Mask)); > >> } > >> delete groupAffinityPointer; > >> #elif HAVE_LIBNUMA > >> @@ -623,7 +608,7 @@ > >> for (int i = 0; i < numNumaNodes; i++) > >> { > >> GetNumaNodeProcessorMaskEx((UCHAR)i, ); > >> -cpus += popCount(groupAffinity.Mask); > >> +cpus += __popcnt(static_cast int>(groupAffinity.Mask)); > >> } > >> return cpus; > >> #elif _WIN32 > >> ___ > >> x265-devel mailing list > >> x265-devel@videolan.org > >> https://mailman.videolan.org/listinfo/x265-devel > > > > > > > > ___ > > x265-devel mailing list > > x265-devel@videolan.org > > https://mailman.videolan.org/listinfo/x265-devel > > > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] threadpool.cpp: use WIN system call for popcount
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1525328839 -19800 # Thu May 03 11:57:19 2018 +0530 # Branch stable # Node ID 9cbb2aadcca3a2f7a308ea1dc792fb817bcc5b51 # Parent 69aafa6d70ad4e151f4590766c6b125621c5d007 threadpool.cpp: use WIN system call for popcount diff -r 69aafa6d70ad -r 9cbb2aadcca3 source/common/threadpool.cpp --- a/source/common/threadpool.cpp Wed May 02 15:15:05 2018 +0530 +++ b/source/common/threadpool.cpp Thu May 03 11:57:19 2018 +0530 @@ -71,21 +71,6 @@ # define strcasecmp _stricmp #endif -#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 -const uint64_t m1 = 0x; //binary: 0101... -const uint64_t m2 = 0x; //binary: 00110011.. -const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary: 4 zeros, 4 ones ... -const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of 0,1,2,3... - -static int popCount(uint64_t x) -{ -x -= (x >> 1) & m1; -x = (x & m2) + ((x >> 2) & m2); -x = (x + (x >> 4)) & m3; -return (x * h01) >> 56; -} -#endif - namespace X265_NS { // x265 private namespace @@ -274,7 +259,7 @@ for (int i = 0; i < numNumaNodes; i++) { GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer); -cpusPerNode[i] = popCount(groupAffinityPointer->Mask); +cpusPerNode[i] = __popcnt(static_cast(groupAffinityPointer->Mask)); } delete groupAffinityPointer; #elif HAVE_LIBNUMA @@ -623,7 +608,7 @@ for (int i = 0; i < numNumaNodes; i++) { GetNumaNodeProcessorMaskEx((UCHAR)i, ); -cpus += popCount(groupAffinity.Mask); +cpus += __popcnt(static_cast(groupAffinity.Mask)); } return cpus; #elif _WIN32 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH 000 of 307 ] AVX-512 implementation in x265: breaks 32-bit compilation
Thanks for reporting, we are looking at the issue, will send a fix soon. Regards, Praveen Tiwari On Thu, Apr 12, 2018 at 2:31 AM, Mario Rohkrämer <cont...@ligh.de> wrote: > Am 07.04.2018, 04:29 Uhr, schrieb <mythr...@multicorewareinc.com>: > > This series of patches enables AVX-512 in x265. USe CLI option --asm >> avx512 to enable AVX-512 kernels. >> ___ >> x265-devel mailing list >> x265-devel@videolan.org >> https://mailman.videolan.org/listinfo/x265-devel >> > > > Compiling x265 for Win32 target (here in MSYS2/MinGW32) is not possible > anymore. > > Assembler code was still available for 8-bit depth core, at least. But: > > + > [ 13%] Building ASM_NASM object common/CMakeFiles/common.dir/x > 86/pixel-util8.asm.obj > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1867: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1880: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1880: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1880: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1880: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1941: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > 
H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode 
and operands > H:/development/media-autobuild_suite-master/build/x265-hg/ > source/common/x86/pixel-util8.asm:1954: error: invalid combination of > opcode and operands > make[2]: *** [common/CMakeFiles/common.dir/build.make:159: > common/CMakeFiles/common.dir/x86/pixel-util8.asm.obj] Error 1 > make[1]: *** [CMakeFiles/Makefile2:449: common/CMakeFiles/common.dir/all] > Error 2 > make: *** [Makefile:130: all] Error 2 > + > > Trying to compile AVX-512 instructions may have to be avoided in 32-bit > architecture mode (because there is surely no 32-bit only CPU supporting > this instruction set extension). > > -- > > Fun and success! &g
Re: [x265] [PATCH 000 of 307 ] AVX-512 implementation in x265
We are working on your request and will share the performance-related details soon. Thanks. Regards, Praveen Tiwari On Fri, Apr 6, 2018 at 9:36 PM, Vittorio Giovara <vittorio.giov...@gmail.com > wrote: > just curious, what kind of general speed improvement does this give? > I could have missed them in the series, but it would be nice to have some > sort of benchmarks > thanks > Vittorio > > On Sat, Apr 7, 2018 at 4:29 AM, <mythr...@multicorewareinc.com> wrote: > >> This series of patches enables AVX-512 in x265. Use CLI option --asm >> avx512 to enable AVX-512 kernels. >> ___ >> x265-devel mailing list >> x265-devel@videolan.org >> https://mailman.videolan.org/listinfo/x265-devel >> > > > > -- > Vittorio > > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] x86: split ipfilter8 kernels into two different source file
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1516343663 -19800 # Fri Jan 19 12:04:23 2018 +0530 # Node ID 55a15ecc1110f206199db1b0f997272b5f7ddc82 # Parent 52782aeb20818273cbf749d221647a254b26c4a4 x86: split ipfilter8 kernels into two different source file This patch implements infrastructure to split ipfiletr8 asm source file into two different files in order to avoid longer build time. It moves interp_8tap_horizontal kernels to the newly created file. diff -r 52782aeb2081 -r 55a15ecc1110 source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Fri Feb 16 11:40:59 2018 +0530 +++ b/source/common/CMakeLists.txt Fri Jan 19 12:04:23 2018 +0530 @@ -56,17 +56,15 @@ endif() set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES}) source_group(Intrinsics FILES ${VEC_PRIMITIVES}) - -set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h seaintegral.h) +set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h h-ipfilter8.h loopfilter.h seaintegral.h) set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm mc-a2.asm pixel-util8.asm blockcopy8.asm pixeladd8.asm dct8.asm seaintegral.asm) if(HIGH_BIT_DEPTH) set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm loopfilter.asm) else() -set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm ipfilter8.asm loopfilter.asm) +set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm h-ipfilter8.asm ipfilter8.asm loopfilter.asm) endif() - if(NOT X64) set(A_SRCS ${A_SRCS} pixel-32.asm) endif() diff -r 52782aeb2081 -r 55a15ecc1110 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Feb 16 11:40:59 2018 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jan 19 12:04:23 2018 +0530 @@ -115,8 +115,8 @@ #include "intrapred.h" #include "dct8.h" #include "seaintegral.h" +#include "h-ipfilter8.h" } - #define ALL_LUMA_CU_TYPED(prim, fncdef, fname, cpu) \ p.cu[BLOCK_8x8].prim = 
fncdef PFX(fname ## _8x8_ ## cpu); \ p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \ diff -r 52782aeb2081 -r 55a15ecc1110 source/common/x86/h-ipfilter8.asm --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/x86/h-ipfilter8.asm Fri Jan 19 12:04:23 2018 +0530 @@ -0,0 +1,267 @@ +;* +;* Copyright (C) 2013-2017 MulticoreWare, Inc +;* +;* Authors: Min Chen <chenm...@163.com> +;* Nabajit Deka <naba...@multicorewareinc.com> +;* Praveen Kumar Tiwari <prav...@multicorewareinc.com> +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;*/ + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +const h_tabw_LumaCoeff, dw 0, 0, 0, 64, 0, 0, 0, 0 + dw -1, 4, -10, 58, 17, -5, 1, 0 + dw -1, 4, -11, 40, 40, -11, 4, -1 + dw 0, 1, -5, 17, 58, -10, 4, -1 + +SECTION .text + +cextern pw_32 +cextern pw_2000 + +%macro FILTER_H8_W8_sse2 0 +movhm1, [r0 + x - 3] +movhm4, [r0 + x - 2] +punpcklbw m1, m6 +punpcklbw m4, m6 +movhm5, [r0 + x - 1] +movhm0, [r0 + x] +punpcklbw m5, m6 +punpcklbw m0, m6 +pmaddwd m1, m3 +pmaddwd m4, m3 +pmaddwd m5, m3 +pmaddwd m0, m3 +packssdwm1, m4 +packssdwm5, m0 +pshuflw m4, m1, q2301 +pshufhw m4, m4, q2301 +pshuflw m0, m5, q2301 +pshufhw m0, m0, q2301 +paddw m1, m4 +paddw m5, m0 +psrldq m1, 2 +psrldq m5, 2 +pshufd m1, m1, q3120 +pshufd m5, m5, q3120 +punpcklqdq m1, m5 +movhm7, [r0 + x + 1] +movhm4, [r0 + x + 2] +punpcklbw m7, m6 +punpcklbw m4, m6 +mo
[x265] [PATCH] dct32 AVX512 Kernel
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1512003711 28800 # Wed Nov 29 17:01:51 2017 -0800 # Branch avx-512 # Node ID 96c57dd05464126451ae2100efe4c4b759390311 # Parent 82a58ec0b04a870dac11ae253c30a15a3002419e dct32 AVX512 Kernel diff -r 82a58ec0b04a -r 96c57dd05464 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Nov 28 13:51:06 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Nov 29 17:01:51 2017 -0800 @@ -5005,8 +5005,9 @@ p.pu[LUMA_64x32].luma_vpp = PFX(interp_8tap_vert_pp_64x32_avx512); p.pu[LUMA_64x16].luma_vpp = PFX(interp_8tap_vert_pp_64x16_avx512); -p.cu[BLOCK_8x8].dct = PFX(dct8_avx512); -p.cu[BLOCK_8x8].idct = PFX(idct8_avx512); +p.cu[BLOCK_8x8].dct= PFX(dct8_avx512); +p.cu[BLOCK_32x32].dct = PFX(dct32_avx512); +p.cu[BLOCK_8x8].idct = PFX(idct8_avx512); p.cu[BLOCK_16x16].idct = PFX(idct16_avx512); p.cu[BLOCK_32x32].idct = PFX(idct32_avx512); p.quant = PFX(quant_avx512); diff -r 82a58ec0b04a -r 96c57dd05464 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asmTue Nov 28 13:51:06 2017 +0530 +++ b/source/common/x86/dct8.asmWed Nov 29 17:01:51 2017 -0800 @@ -30,16 +30,61 @@ %include "x86util.asm" SECTION_RODATA 64 +tab_dct32: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 +dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 +dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 +dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 +dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 +dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 
38, 73, 90, 85, 61, 22, -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 +dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 +dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 +dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 +dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 +dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 +dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 +dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 +dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 +dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 +dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 +dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 +dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 +dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 +dw 54, -85, -4, 88, 
-46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 +dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 +dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 +dw 43, -90,
[x265] [PATCH] quant.cpp: use 'rdoQuant_c' primitive to optimize rdoQuant path
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1511851222 -19800 # Tue Nov 28 12:10:22 2017 +0530 # Node ID d732ca2095defdbf42748327006083befb30a89e # Parent 4d242c555d14ca8214d9da89cef41c4418af4dca quant.cpp: use 'rdoQuant_c' primitive to optimize rdoQuant path diff -r 4d242c555d14 -r d732ca2095de source/common/quant.cpp --- a/source/common/quant.cpp Tue Nov 28 11:43:00 2017 +0530 +++ b/source/common/quant.cpp Tue Nov 28 12:10:22 2017 +0530 @@ -803,20 +803,14 @@ if (usePsyMask) { -// TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA +// Expected to work faster by FMA SIMD +primitives.rdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, , , psyScale, blkPos, log2TrSize); +blkPos = codeParams.scan[scanPosBase]; + for (int y = 0; y < MLS_CG_SIZE; y++) { for (int x = 0; x < MLS_CG_SIZE; x++) { -int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ -int predictedCoef= m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ -costUncoded[blkPos + x] = static_cast(((int64_t)signCoef * signCoef) << scaleBits); -/* when no residual coefficient is coded, predicted coef == recon coef */ -costUncoded[blkPos + x] -= PSYVALUE(predictedCoef); - -totalUncodedCost += costUncoded[blkPos + x]; -totalRdCost += costUncoded[blkPos + x]; - const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; X265_CHECK(trSize > 4, "trSize check failure\n"); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] quant.cpp: use 'nonPsyRdoQuant_c' primitive to optimize rdoQuant path
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1511855234 -19800 # Tue Nov 28 13:17:14 2017 +0530 # Node ID 85970193df47aa5da685efc27aaef04d9f7f21a0 # Parent d732ca2095defdbf42748327006083befb30a89e quant.cpp: use 'nonPsyRdoQuant_c' primitive to optimize rdoQuant path diff -r d732ca2095de -r 85970193df47 source/common/quant.cpp --- a/source/common/quant.cpp Tue Nov 28 12:10:22 2017 +0530 +++ b/source/common/quant.cpp Tue Nov 28 13:17:14 2017 +0530 @@ -824,16 +824,14 @@ } else { -// non-psy path +// non-psy path - expected to work faster by FMA SIMD +primitives.nonPsyRdoQuant(m_resiDctCoeff, costUncoded, , , blkPos, log2TrSize); +blkPos = codeParams.scan[scanPosBase]; + for (int y = 0; y < MLS_CG_SIZE; y++) { for (int x = 0; x < MLS_CG_SIZE; x++) { -int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ -costUncoded[blkPos + x] = static_cast(((int64_t)signCoef * signCoef) << scaleBits); -totalUncodedCost += costUncoded[blkPos + x]; -totalRdCost += costUncoded[blkPos + x]; - const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; X265_CHECK(trSize > 4, "trSize check failure\n"); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] quant.cpp: 'nonPsyRdoQuant_c' primitive for SIMD optimization
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1511849580 -19800 # Tue Nov 28 11:43:00 2017 +0530 # Node ID 4d242c555d14ca8214d9da89cef41c4418af4dca # Parent dfd4951a93744f3d732cb4645abd2fd87eded750 quant.cpp: 'nonPsyRdoQuant_c' primitive for SIMD optimization This particular section of code appears to be bottleneck in many profiles, as it involves 64-bit multiplication operations. For SIMD optimization we need to convert few buffer/variables to double. diff -r dfd4951a9374 -r 4d242c555d14 source/common/dct.cpp --- a/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530 +++ b/source/common/dct.cpp Tue Nov 28 11:43:00 2017 +0530 @@ -1010,6 +1010,26 @@ } } +static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, double *costUncoded, double *totalUncodedCost, double *totalRdCost, uint32_t blkPos, uint32_t log2TrSize) +{ +const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ +const int scaleBits = SCALE_BITS - 2 * transformShift; +const uint32_t trSize = 1 << log2TrSize; + +for (int y = 0; y < MLS_CG_SIZE; y++) +{ +for (int x = 0; x < MLS_CG_SIZE; x++) +{ +int signCoef = m_resiDctCoeff[blkPos + x];/* pre-quantization DCT coeff */ +costUncoded[blkPos + x] = static_cast(((int64_t)signCoef * signCoef) << scaleBits); + +*totalUncodedCost += costUncoded[blkPos + x]; +*totalRdCost += costUncoded[blkPos + x]; +} +blkPos += trSize; +} +} + namespace X265_NS { // x265 private namespace void setupDCTPrimitives_c(EncoderPrimitives& p) @@ -1019,6 +1039,7 @@ p.quant = quant_c; p.nquant = nquant_c; p.rdoQuant = rdoQuant_c; +p.nonPsyRdoQuant = nonPsyRdoQuant_c; p.dst4x4 = dst4_c; p.cu[BLOCK_4x4].dct = dct4_c; p.cu[BLOCK_8x8].dct = dct8_c; diff -r dfd4951a9374 -r 4d242c555d14 source/common/primitives.h --- a/source/common/primitives.hMon Nov 20 14:17:36 2017 +0530 +++ b/source/common/primitives.hTue Nov 28 11:43:00 2017 +0530 @@ -216,6 +216,8 @@ typedef void (*integralv_t)(uint32_t 
*sum, intptr_t stride); typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride); typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize); +typedef void (*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, double *costUncoded, double *totalUncodedCost, double *totalRdCost, uint32_t blkPos, uint32_t log2TrSize); + /* Function pointers to optimized encoder primitives. Each pointer can reference * either an assembly routine, a SIMD intrinsic primitive, or a C function */ struct EncoderPrimitives @@ -303,6 +305,7 @@ quant_t quant; nquant_t nquant; rdoQuant_trdoQuant; +nonPsyRdoQuant_t nonPsyRdoQuant; dequant_scaling_t dequant_scaling; dequant_normal_t dequant_normal; denoiseDct_t denoiseDct; diff -r dfd4951a9374 -r 4d242c555d14 source/common/quant.cpp --- a/source/common/quant.cpp Mon Nov 20 14:17:36 2017 +0530 +++ b/source/common/quant.cpp Tue Nov 28 11:43:00 2017 +0530 @@ -737,17 +737,7 @@ uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); uint32_t blkPos = codeParams.scan[scanPosBase]; -for (int y = 0; y < MLS_CG_SIZE; y++) -{ -for (int x = 0; x < MLS_CG_SIZE; x++) -{ -int signCoef = m_resiDctCoeff[blkPos + x];/* pre-quantization DCT coeff */ -costUncoded[blkPos + x] = static_cast(((int64_t)signCoef * signCoef) << scaleBits); -totalUncodedCost += costUncoded[blkPos + x]; -totalRdCost += costUncoded[blkPos + x]; -} -blkPos += trSize; -} +primitives.nonPsyRdoQuant(m_resiDctCoeff, costUncoded, , , blkPos, log2TrSize); } } ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] quant.cpp: 'rdoQuant_c' primitive for SIMD optimization
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1511167656 -19800 # Mon Nov 20 14:17:36 2017 +0530 # Node ID dfd4951a93744f3d732cb4645abd2fd87eded750 # Parent 17bb240012fe990635be621ac261bfd7c9b2d0ba quant.cpp: 'rdoQuant_c' primitive for SIMD optimization This particular section of code appears to be bottleneck in many profiles, as it involves 64-bit multiplication operations. For SIMD optimization we need to convert few buffer/variables to double. diff -r 17bb240012fe -r dfd4951a9374 source/common/dct.cpp --- a/source/common/dct.cpp Fri Nov 24 17:23:59 2017 +0100 +++ b/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530 @@ -984,15 +984,41 @@ return (sum & 0x00FF) + (c1 << 26) + (firstC2Idx << 28); } +static void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize) +{ +const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ +const int scaleBits = SCALE_BITS - 2 * transformShift; +const uint32_t trSize = 1 << log2TrSize; +int max = X265_MAX(0, (2 * transformShift + 1)); + +for (int y = 0; y < MLS_CG_SIZE; y++) +{ +for (int x = 0; x < MLS_CG_SIZE; x++) +{ +int64_t signCoef = m_resiDctCoeff[blkPos + x];/* pre-quantization DCT coeff */ +int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ + +costUncoded[blkPos + x] = static_cast((signCoef * signCoef) << scaleBits); + +/* when no residual coefficient is coded, predicted coef == recon coef */ +costUncoded[blkPos + x] -= static_cast((psyScale * (predictedCoef)) >> max); + +*totalUncodedCost += costUncoded[blkPos + x]; +*totalRdCost += costUncoded[blkPos + x]; +} +blkPos += trSize; +} +} + namespace X265_NS { // x265 private namespace - void setupDCTPrimitives_c(EncoderPrimitives& p) { p.dequant_scaling = dequant_scaling_c; 
p.dequant_normal = dequant_normal_c; p.quant = quant_c; p.nquant = nquant_c; +p.rdoQuant = rdoQuant_c; p.dst4x4 = dst4_c; p.cu[BLOCK_4x4].dct = dct4_c; p.cu[BLOCK_8x8].dct = dct8_c; diff -r 17bb240012fe -r dfd4951a9374 source/common/primitives.h --- a/source/common/primitives.hFri Nov 24 17:23:59 2017 +0100 +++ b/source/common/primitives.hMon Nov 20 14:17:36 2017 +0530 @@ -213,10 +213,9 @@ typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ); typedef void (*pelFilterChroma_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ); - typedef void (*integralv_t)(uint32_t *sum, intptr_t stride); typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride); - +typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize); /* Function pointers to optimized encoder primitives. 
Each pointer can reference * either an assembly routine, a SIMD intrinsic primitive, or a C function */ struct EncoderPrimitives @@ -301,9 +300,9 @@ * the CU arrays */ dct_t dst4x4; idct_tidst4x4; - quant_t quant; nquant_t nquant; +rdoQuant_trdoQuant; dequant_scaling_t dequant_scaling; dequant_normal_t dequant_normal; denoiseDct_t denoiseDct; diff -r 17bb240012fe -r dfd4951a9374 source/common/quant.cpp --- a/source/common/quant.cpp Fri Nov 24 17:23:59 2017 +0100 +++ b/source/common/quant.cpp Mon Nov 20 14:17:36 2017 +0530 @@ -661,11 +661,9 @@ #define SIGCOST(bits) ((lambda2 * (bits)) >> 8) #define RDCOST(d, bits) int64_t)d * d) << scaleBits) + SIGCOST(bits)) #define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1))) - int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */ -int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0*/ +double costUncoded[trSize * trSize]; /* d*d + lambda * 0*/ int64_t costSig[trSize * trSize]; /* lambda * bits */ - int rateIncUp[trSize * trSize]; /* signal overhead of increasing level */ int rateIncDown[trSize * trSize];/* signal overhead of decreasing level */ int sigRateDelta[trSize * trSize]; /* signal difference between zero and non-zero */ @@ -675,15 +673,12 @@ const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */ bool bIsLuma = ttype == TEXT_LUMA; - /* total rate distortion cost of transform block, as
Re: [x265] [PATCH] quant.cpp: 'rdoQuant_c' primitive for SIMD optimization
Please ignore this patch I messed an update. I will resend this soon. Thanks On Mon, Nov 27, 2017 at 5:11 PM, <prav...@multicorewareinc.com> wrote: > # HG changeset patch > # User Praveen Tiwari <prav...@multicorewareinc.com> > # Date 1511167656 -19800 > # Mon Nov 20 14:17:36 2017 +0530 > # Node ID dffb056e5ad0e2298b0dd65d048f4f16d8508566 > # Parent b24454f3ff6de650aab6835e291837fc4e2a4466 > quant.cpp: 'rdoQuant_c' primitive for SIMD optimization > > This particular section of code appears to be bottleneck in many profiles, > as it > involves 64-bit multiplication operations. For SIMD optimization we need > to convert > few buffer/variables to double. > > diff -r b24454f3ff6d -r dffb056e5ad0 source/common/dct.cpp > --- a/source/common/dct.cpp Wed Nov 22 22:00:48 2017 +0530 > +++ b/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530 > @@ -984,6 +984,32 @@ > return (sum & 0x00FF) + (c1 << 26) + (firstC2Idx << 28); > } > > +void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* > costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t > psyScale, uint32_t blkPos, uint32_t log2TrSize) > +{ > +const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - > log2TrSize; /* Represents scaling through forward transform */ > +const int scaleBits = SCALE_BITS - 2 * transformShift; > +const uint32_t trSize = 1 << log2TrSize; > +int max = X265_MAX(0, (2 * transformShift + 1)); > + > +for (int y = 0; y < MLS_CG_SIZE; y++) > +{ > +for (int x = 0; x < MLS_CG_SIZE; x++) > +{ > +int64_t signCoef = m_resiDctCoeff[blkPos + x];/* > pre-quantization DCT coeff */ > +int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - > signCoef; /* predicted DCT = source DCT - residual DCT*/ > + > +costUncoded[blkPos + x] = static_cast((signCoef * > signCoef) << scaleBits); > + > +/* when no residual coefficient is coded, predicted coef == > recon coef */ > +costUncoded[blkPos + x] -= static_cast((psyScale * > (predictedCoef)) >> max); > + > +*totalUncodedCost += 
costUncoded[blkPos + x]; > +*totalRdCost += costUncoded[blkPos + x]; > +} > +blkPos += trSize; > +} > +} > + > namespace X265_NS { > // x265 private namespace > > @@ -993,6 +1019,7 @@ > p.dequant_normal = dequant_normal_c; > p.quant = quant_c; > p.nquant = nquant_c; > +p.rdoQuant = rdoQuant_c; > p.dst4x4 = dst4_c; > p.cu[BLOCK_4x4].dct = dct4_c; > p.cu[BLOCK_8x8].dct = dct8_c; > diff -r b24454f3ff6d -r dffb056e5ad0 source/common/primitives.h > --- a/source/common/primitives.hWed Nov 22 22:00:48 2017 +0530 > +++ b/source/common/primitives.hMon Nov 20 14:17:36 2017 +0530 > @@ -216,6 +216,7 @@ > > typedef void (*integralv_t)(uint32_t *sum, intptr_t stride); > typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride); > +typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* > m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* > totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize); > > /* Function pointers to optimized encoder primitives. 
Each pointer can > reference > * either an assembly routine, a SIMD intrinsic primitive, or a C > function */ > @@ -304,6 +305,7 @@ > > quant_t quant; > nquant_t nquant; > +rdoQuant_trdoQuant; > dequant_scaling_t dequant_scaling; > dequant_normal_t dequant_normal; > denoiseDct_t denoiseDct; > diff -r b24454f3ff6d -r dffb056e5ad0 source/common/quant.cpp > --- a/source/common/quant.cpp Wed Nov 22 22:00:48 2017 +0530 > +++ b/source/common/quant.cpp Mon Nov 20 14:17:36 2017 +0530 > @@ -663,7 +663,7 @@ > #define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * > transformShift + 1))) > > int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */ > -int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0*/ > +double costUncoded[trSize * trSize]; /* d*d + lambda * 0*/ > int64_t costSig[trSize * trSize]; /* lambda * bits */ > > int rateIncUp[trSize * trSize]; /* signal overhead of increasing > level */ > @@ -677,12 +677,12 @@ > bool bIsLuma = ttype == TEXT_LUMA; > > /* total rate distortion cost of transform block, as CBF=0 */ > -int64_t totalUncodedCost = 0; > +double totalUncodedCost = 0; > > /* Total rate distortion cost of this transform block, counting te > di
[x265] [PATCH] quant.cpp: 'rdoQuant_c' primitive for SIMD optimization
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1511167656 -19800 # Mon Nov 20 14:17:36 2017 +0530 # Node ID dffb056e5ad0e2298b0dd65d048f4f16d8508566 # Parent b24454f3ff6de650aab6835e291837fc4e2a4466 quant.cpp: 'rdoQuant_c' primitive for SIMD optimization This particular section of code appears to be bottleneck in many profiles, as it involves 64-bit multiplication operations. For SIMD optimization we need to convert few buffer/variables to double. diff -r b24454f3ff6d -r dffb056e5ad0 source/common/dct.cpp --- a/source/common/dct.cpp Wed Nov 22 22:00:48 2017 +0530 +++ b/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530 @@ -984,6 +984,32 @@ return (sum & 0x00FF) + (c1 << 26) + (firstC2Idx << 28); } +void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize) +{ +const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ +const int scaleBits = SCALE_BITS - 2 * transformShift; +const uint32_t trSize = 1 << log2TrSize; +int max = X265_MAX(0, (2 * transformShift + 1)); + +for (int y = 0; y < MLS_CG_SIZE; y++) +{ +for (int x = 0; x < MLS_CG_SIZE; x++) +{ +int64_t signCoef = m_resiDctCoeff[blkPos + x];/* pre-quantization DCT coeff */ +int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ + +costUncoded[blkPos + x] = static_cast((signCoef * signCoef) << scaleBits); + +/* when no residual coefficient is coded, predicted coef == recon coef */ +costUncoded[blkPos + x] -= static_cast((psyScale * (predictedCoef)) >> max); + +*totalUncodedCost += costUncoded[blkPos + x]; +*totalRdCost += costUncoded[blkPos + x]; +} +blkPos += trSize; +} +} + namespace X265_NS { // x265 private namespace @@ -993,6 +1019,7 @@ p.dequant_normal = dequant_normal_c; p.quant = quant_c; p.nquant = nquant_c; +p.rdoQuant 
= rdoQuant_c; p.dst4x4 = dst4_c; p.cu[BLOCK_4x4].dct = dct4_c; p.cu[BLOCK_8x8].dct = dct8_c; diff -r b24454f3ff6d -r dffb056e5ad0 source/common/primitives.h --- a/source/common/primitives.hWed Nov 22 22:00:48 2017 +0530 +++ b/source/common/primitives.hMon Nov 20 14:17:36 2017 +0530 @@ -216,6 +216,7 @@ typedef void (*integralv_t)(uint32_t *sum, intptr_t stride); typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride); +typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize); /* Function pointers to optimized encoder primitives. Each pointer can reference * either an assembly routine, a SIMD intrinsic primitive, or a C function */ @@ -304,6 +305,7 @@ quant_t quant; nquant_t nquant; +rdoQuant_trdoQuant; dequant_scaling_t dequant_scaling; dequant_normal_t dequant_normal; denoiseDct_t denoiseDct; diff -r b24454f3ff6d -r dffb056e5ad0 source/common/quant.cpp --- a/source/common/quant.cpp Wed Nov 22 22:00:48 2017 +0530 +++ b/source/common/quant.cpp Mon Nov 20 14:17:36 2017 +0530 @@ -663,7 +663,7 @@ #define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1))) int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */ -int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0*/ +double costUncoded[trSize * trSize]; /* d*d + lambda * 0*/ int64_t costSig[trSize * trSize]; /* lambda * bits */ int rateIncUp[trSize * trSize]; /* signal overhead of increasing level */ @@ -677,12 +677,12 @@ bool bIsLuma = ttype == TEXT_LUMA; /* total rate distortion cost of transform block, as CBF=0 */ -int64_t totalUncodedCost = 0; +double totalUncodedCost = 0; /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks, * the distortion and signal cost of coded blocks, and the coding cost of significant * coefficient and coefficient group bitmaps */ -int64_t totalRdCost = 0; 
+double totalRdCost = 0; TUEntropyCodingParameters codeParams; cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma); @@ -729,24 +729,9 @@ uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); uint32_t blkPos = codeParams.scan[scanPosBase]; -// TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA -for (int y = 0; y < MLS_CG
[x265] [PATCH] encoder.cpp: fix encoder crash for --analysis-reuse-level=10
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1511338280 -19800 # Wed Nov 22 13:41:20 2017 +0530 # Branch stable # Node ID dd11aa99f40a1af59065984afa9b699d2eb1162e # Parent 752ed1108fce1b475e0458b70f92503d6343818b encoder.cpp: fix encoder crash for --analysis-reuse-level=10 diff -r 752ed1108fce -r dd11aa99f40a source/encoder/encoder.cpp --- a/source/encoder/encoder.cppTue Nov 21 09:50:45 2017 +0530 +++ b/source/encoder/encoder.cppWed Nov 22 13:41:20 2017 +0530 @@ -1155,6 +1155,8 @@ pic_out->analysisData.wt = outFrame->m_analysisData.wt; pic_out->analysisData.interData = outFrame->m_analysisData.interData; pic_out->analysisData.intraData = outFrame->m_analysisData.intraData; +pic_out->analysisData.modeFlag[0] = outFrame->m_analysisData.modeFlag[0]; +pic_out->analysisData.modeFlag[1] = outFrame->m_analysisData.modeFlag[1]; if (m_param->bDisableLookahead) { int factor = 1; @@ -3067,6 +3069,7 @@ CHECKED_MALLOC(interData->mvpIdx[dir], uint8_t, analysis->numPartitions * analysis->numCUsInFrame); CHECKED_MALLOC(interData->refIdx[dir], int8_t, analysis->numPartitions * analysis->numCUsInFrame); CHECKED_MALLOC(interData->mv[dir], MV, analysis->numPartitions * analysis->numCUsInFrame); +CHECKED_MALLOC(analysis->modeFlag[dir], uint8_t, analysis->numPartitions * analysis->numCUsInFrame); } /* Allocate intra in inter */ @@ -3146,7 +3149,11 @@ X265_FREE(((analysis_inter_data*)analysis->interData)->mvpIdx[dir]); X265_FREE(((analysis_inter_data*)analysis->interData)->refIdx[dir]); X265_FREE(((analysis_inter_data*)analysis->interData)->mv[dir]); -X265_FREE(analysis->modeFlag[dir]); +if (analysis->modeFlag[dir] != NULL) +{ +X265_FREE(analysis->modeFlag[dir]); +analysis->modeFlag[dir] = NULL; +} } } else ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH 2 of 2] x86: Change assembler from YASM to NASM
Yes, that's true. Looking at future prospects, we have decided to move support to NASM. It comes with additional advantages, as Andrey mentioned above, but we understand the concern about changing assembler support, and we will make the transition as smooth as possible. Thanks. Regards, Praveen Tiwari ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] Fwd: [PATCH] intra: sse4 version of strong intra smoothing
-- Forwarded message -- From: chenDate: Tue, Nov 21, 2017 at 10:07 AM Subject: Re: [x265] [PATCH] intra: sse4 version of strong intra smoothing To: Development for x265 >diff -r a7c2f80c18af -r 973560d58dfb source/common/x86/intrapred8.asm >--- a/source/common/x86/intrapred8.asm Mon Nov 20 14:31:22 2017 +0530 >+++ b/source/common/x86/intrapred8.asm Tue Nov 21 03:10:14 2017 +0800 >@@ -22313,11 +22313,144 @@ > mov [r1 + 64], r3b ; LeftLast > RET > >-INIT_XMM sse4 >-cglobal intra_filter_32x32, 2,4,6 >-mov r2b, byte [r0 + 64]; topLast >-mov r3b, byte [r0 + 128]; LeftLast >- >+; this function add strong intra filter >+ INIT_XMM sse4 >+cglobal intra_filter_32x32, 3,8,7 >+xor r3d, r3d ; R9 >+xor r4d, r4d ; R10 >+mov r3b, byte [r0 + 64] ; topLast >+mov r4b, byte [r0 + 128] ; LeftLast xor+mov = movzx, the xor (clear to zero) does not spending cycle, but affect instruction decode rate >+ >+; strong intra filter is diabled >+cmp r2m, byte 0 >+jz .normal_filter32 >+; decide to do strong intra filter >+xor r5d, r5d ; R11 >+xor r6d, r6d ; RAX >+xor r7d, r7d ; RDI >+mov r5b, byte [r0] ; topLeft >+mov r6b, byte [r0 + 96] ; leftMiddle >+mov r7b, byte [r0 + 32] ; topMiddle >+ >+; threshold = 8 >+mov r2d, r3d ; R8 >+add r2d, r5d ; (topLast + topLeft) >+shl r7d, 1 ; 2 * topMiddle >+sub r2d, r7d (A+B) - 2 * C <==> (A-C) + (B-C) >+mov r7d, r2d ; backup r2d >+sar r7d, 31 >+xor r2d, r7d >+sub r2d, r7d ; abs(r2d) >+cmp r2d, 8 ; how about this or instruction cdq? 
; abs(x-y) mov eax, X sub eax, Y sub Y, X cmovg eax, Y >+; bilinearAbove is false >+jns .normal_filter32 >+ >+mov r2d, r5d >+add r2d, r4d >+shl r6d, 1 >+sub r2d, r6d >+mov r6d, r2d >+sar r6d, 31 >+xor r2d, r6d >+sub r2d, r6d >+cmp r2d, 8 >+; bilinearLeft is false >+jns .normal_filter32 >+ >+; do strong intra filter shift = 6 >+mov r2d, r5d >+shl r2d, 6 >+add r2d, 32 ; init >+mov r6d, r4d >+sub r6w, r5w ; deltaL size is word partial register may stall in here >+mov r7d, r3d >+sub r7w, r5w ; deltaR size is word >+movdxmm0, r2d >+ vpbroadcastwxmm0, xmm0 SSE4? This is an AVX2 instruction, so the initialization on top is wrong. We generally don't prefix xmm, ymm; for the native version, m0, m1 will be better. >+movaxmm4, xmm0 >+ ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] analysis: use AVC CU analysis-info for HEVC mode analysis
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1510926794 -19800 # Fri Nov 17 19:23:14 2017 +0530 # Node ID 6b248ccb14169d2b0d5b84d50d94a153bd8f3b4f # Parent 9723e8812e63ce51e38ede41f7d5edf73cad0849 analysis: use AVC CU analysis-info for HEVC mode analysis This patch work implements the functionality for anlysis-reuselevel 7, here we want to use AVC analysis-info for HEVC mode decision and use the depth from offload for AVC sizes diff -r 9723e8812e63 -r 6b248ccb1416 source/common/cudata.cpp --- a/source/common/cudata.cpp Fri Nov 17 14:16:31 2017 +0530 +++ b/source/common/cudata.cpp Fri Nov 17 19:23:14 2017 +0530 @@ -201,6 +201,8 @@ m_cuDepth= charBuf; charBuf += m_numPartitions; m_predMode = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */ m_partSize = charBuf; charBuf += m_numPartitions; +m_skipFlag[0]= charBuf; charBuf += m_numPartitions; +m_skipFlag[1]= charBuf; charBuf += m_numPartitions; m_mergeFlag = charBuf; charBuf += m_numPartitions; m_interDir = charBuf; charBuf += m_numPartitions; m_mvpIdx[0] = charBuf; charBuf += m_numPartitions; @@ -239,6 +241,8 @@ m_cuDepth= charBuf; charBuf += m_numPartitions; m_predMode = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */ m_partSize = charBuf; charBuf += m_numPartitions; +m_skipFlag[0]= charBuf; charBuf += m_numPartitions; +m_skipFlag[1]= charBuf; charBuf += m_numPartitions; m_mergeFlag = charBuf; charBuf += m_numPartitions; m_interDir = charBuf; charBuf += m_numPartitions; m_mvpIdx[0] = charBuf; charBuf += m_numPartitions; diff -r 9723e8812e63 -r 6b248ccb1416 source/common/cudata.h --- a/source/common/cudata.hFri Nov 17 14:16:31 2017 +0530 +++ b/source/common/cudata.hFri Nov 17 19:23:14 2017 +0530 @@ -199,13 +199,14 @@ uint8_t* m_predMode; // array of prediction modes uint8_t* m_partSize; // array of partition sizes uint8_t* m_mergeFlag;// array of merge flags +uint8_t* 
m_skipFlag[2]; uint8_t* m_interDir; // array of inter directions uint8_t* m_mvpIdx[2];// array of motion vector predictor candidates or merge candidate indices [0] uint8_t* m_tuDepth; // array of transform indices uint8_t* m_transformSkip[3]; // array of transform skipping flags per plane uint8_t* m_cbf[3]; // array of coded block flags (CBF) per plane uint8_t* m_chromaIntraDir; // array of intra directions (chroma) -enum { BytesPerPartition = 21 }; // combined sizeof() of all per-part data +enum { BytesPerPartition = 23 }; // combined sizeof() of all per-part data sse_t*m_distortion; coeff_t* m_trCoeff[3]; // transformed coefficient buffer per plane diff -r 9723e8812e63 -r 6b248ccb1416 source/common/framedata.h --- a/source/common/framedata.h Fri Nov 17 14:16:31 2017 +0530 +++ b/source/common/framedata.h Fri Nov 17 19:23:14 2017 +0530 @@ -195,6 +195,7 @@ uint8_t*mvpIdx[2]; int8_t* refIdx[2]; MV* mv[2]; + int64_t* sadCost; }; struct analysis2PassFrameData diff -r 9723e8812e63 -r 6b248ccb1416 source/encoder/analysis.cpp --- a/source/encoder/analysis.cpp Fri Nov 17 14:16:31 2017 +0530 +++ b/source/encoder/analysis.cpp Fri Nov 17 19:23:14 2017 +0530 @@ -75,6 +75,10 @@ m_reuseInterDataCTU = NULL; m_reuseRef = NULL; m_bHD = false; +m_modeFlag[0] = false; +m_modeFlag[1] = false; +m_checkMergeAndSkipOnly[0] = false; +m_checkMergeAndSkipOnly[1] = false; m_evaluateInter = 0; } @@ -247,6 +251,9 @@ memcpy(ctu.m_cuDepth, >depth[posCTU], sizeof(uint8_t) * numPartition); memcpy(ctu.m_predMode, >modes[posCTU], sizeof(uint8_t) * numPartition); memcpy(ctu.m_partSize, >partSize[posCTU], sizeof(uint8_t) * numPartition); +for (int list = 0; list < m_slice->isInterB() + 1; list++) +memcpy(ctu.m_skipFlag[list], _frame->m_analysisData.modeFlag[list][posCTU], sizeof(uint8_t) * numPartition); + if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !m_param->bMVType) { analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData; @@ -1162,7 
+1169,11 @@ PicYuv& reconPic = *m_frame->m_reconPic; SplitData splitCUData; -if ((m_param->bMVType && cuGeom.numPartitions > 16) || !m_param->bMVType) +bool bHEVCBlockAnalysis = (m_param-
[x265] [PATCH] analysis: update analysisReuseLevel 7 for analysis sharing
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1510561483 -19800 # Mon Nov 13 13:54:43 2017 +0530 # Node ID 4b6af5ba01f8244aec5862dc5fccc0019c44d0c8 # Parent 2fec493b990ee3066e5ffc853d83669955f0ee3c analysis: update analysisReuseLevel 7 for analysis sharing diff -r 2fec493b990e -r 4b6af5ba01f8 doc/reST/cli.rst --- a/doc/reST/cli.rst Mon Nov 13 12:20:50 2017 +0530 +++ b/doc/reST/cli.rst Mon Nov 13 13:54:43 2017 +0530 @@ -887,17 +887,19 @@ Note that --analysis-reuse-level must be paired with analysis-reuse-mode. - ++-+ - | Level | Description | - ++=+ - | 1 | Lookahead information | - ++-+ - | 2 to 4 | Level 1 + intra/inter modes, ref's | - ++-+ - | 5 to 9 | Level 2 + rect-amp | - ++-+ - | 10 | Level 5 + Full CU analysis-info | - ++-+ ++--+--+ +| Level| Description | ++==+==+ +| 1| Lookahead information| ++--+--+ +| 2 to 4 | Level 1 + intra/inter modes, ref's | ++--+--+ +| 5,6,8 and 9 | Level 2 + rect-amp | ++--+--+ +| 7| Level 5 + AVC size CU refinement | ++--+--+ +| 10 | Level 5 + Full CU analysis-info | ++--+--+ .. option:: --refine-mv-type diff -r 2fec493b990e -r 4b6af5ba01f8 source/encoder/analysis.cpp --- a/source/encoder/analysis.cpp Mon Nov 13 12:20:50 2017 +0530 +++ b/source/encoder/analysis.cpp Mon Nov 13 13:54:43 2017 +0530 @@ -2301,7 +2301,7 @@ for (uint32_t part = 0; part < numPU; part++) { PredictionUnit pu(mode.cu, cuGeom, part); -if (m_param->analysisReuseLevel == 10) +if (m_param->analysisReuseLevel >= 7) { analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData; int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx; @@ -2407,7 +2407,7 @@ if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth) nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom)); -int lamdaQP = m_param->analysisReuseLevel == 10 ? nextQP : lqp; +int lamdaQP = (m_param->analysisReuseLevel >= 7) ? nextQP : lqp; if (split) m_param->rdLevel > 4 ? 
compressInterCU_rd5_6(parentCTU, childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP); diff -r 2fec493b990e -r 4b6af5ba01f8 source/encoder/encoder.cpp --- a/source/encoder/encoder.cppMon Nov 13 12:20:50 2017 +0530 +++ b/source/encoder/encoder.cppMon Nov 13 13:54:43 2017 +0530 @@ -3036,7 +3036,7 @@ CHECKED_MALLOC(interData->mergeFlag, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); } -if (m_param->analysisReuseLevel == 10) +if (m_param->analysisReuseLevel >= 7) { CHECKED_MALLOC(interData->interDir, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); for (int dir = 0; dir < numDir; dir++) @@ -3113,7 +3113,7 @@ X265_FREE(((analysis_inter_data*)analysis->interData)->mergeFlag); X265_FREE(((analysis_inter_data*)analysis->interData)->partSize); } -if (m_param->analysisReuseLevel == 10) +if (m_param->analysisReuseLevel >= 7) { X265_FREE(((analysis_inter_data*)analysis->interData)->interDir); int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] API: 'x265_set_analysis_data' to capture analysis information
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1510555850 -19800 # Mon Nov 13 12:20:50 2017 +0530 # Node ID 2fec493b990ee3066e5ffc853d83669955f0ee3c # Parent dd9772385d152528201d335efbc6f75fdc43b08c API: 'x265_set_analysis_data' to capture analysis information diff -r dd9772385d15 -r 2fec493b990e doc/reST/api.rst --- a/doc/reST/api.rst Tue Nov 14 11:00:09 2017 +0530 +++ b/doc/reST/api.rst Mon Nov 13 12:20:50 2017 +0530 @@ -215,6 +215,13 @@ *the encoder will wait for this copy to complete if enabled. */ +**x265_set_analysis_data()** may be used to recive analysis information from external application:: + +/* x265_set_analysis_data: + * set the analysis data, + * returns negative on error, 0 access unit were output. */ + int x265_set_analysis_data(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes); + Pictures diff -r dd9772385d15 -r 2fec493b990e source/CMakeLists.txt --- a/source/CMakeLists.txt Tue Nov 14 11:00:09 2017 +0530 +++ b/source/CMakeLists.txt Mon Nov 13 12:20:50 2017 +0530 @@ -29,7 +29,7 @@ option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 145) +set(X265_BUILD 146) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" diff -r dd9772385d15 -r 2fec493b990e source/encoder/api.cpp --- a/source/encoder/api.cppTue Nov 14 11:00:09 2017 +0530 +++ b/source/encoder/api.cppMon Nov 13 12:20:50 2017 +0530 @@ -365,6 +365,18 @@ return encoder->getRefFrameList((PicYuv**)l0, (PicYuv**)l1, sliceType, poc); } +int x265_set_analysis_data(x265_encoder *enc, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes) +{ +if (!enc) +return -1; + +Encoder *encoder = static_cast<Encoder*>(enc); +if (!encoder->setAnalysisData(analysis_data, poc, cuBytes)) 
+return 0; + +return -1; +} + void x265_cleanup(void) { BitCost::destroy(); @@ -444,6 +456,7 @@ _csvlog_frame, _csvlog_encode, _dither_image, +_set_analysis_data }; typedef const x265_api* (*api_get_func)(int bitDepth); diff -r dd9772385d15 -r 2fec493b990e source/encoder/encoder.cpp --- a/source/encoder/encoder.cppTue Nov 14 11:00:09 2017 +0530 +++ b/source/encoder/encoder.cppMon Nov 13 12:20:50 2017 +0530 @@ -574,6 +574,88 @@ return 0; } +int Encoder::setAnalysisData(x265_analysis_data *analysis_data, int poc, uint32_t cuBytes) +{ +uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; +uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + +Frame* curFrame = m_dpb->m_picList.getPOC(poc); +if (curFrame != NULL) +{ +curFrame->m_analysisData = (*analysis_data); +curFrame->m_analysisData.numCUsInFrame = widthInCU * heightInCU; +curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions; +allocAnalysis(>m_analysisData); +if (m_param->maxCUSize == 16) +{ +if (analysis_data->sliceType == X265_TYPE_IDR || analysis_data->sliceType == X265_TYPE_I) +{ +curFrame->m_analysisData.sliceType = X265_TYPE_I; +if (m_param->analysisReuseLevel < 2) +return -1; + +curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions; +size_t count = 0; +analysis_intra_data * currIntraData = (analysis_intra_data *)curFrame->m_analysisData.intraData; +analysis_intra_data * intraData = (analysis_intra_data *)analysis_data->intraData; +for (uint32_t d = 0; d < cuBytes; d++) +{ +int bytes = curFrame->m_analysisData.numPartitions >> ((intraData)->depth[d] * 2); +memset(&(currIntraData)->depth[count], (intraData)->depth[d], bytes); +memset(&(currIntraData)->chromaModes[count], (intraData)->chromaModes[d], bytes); +memset(&(currIntraData)->partSizes[count], (intraData)->partSizes[d], bytes); +memset(&(currIntraData)->partSizes[count], (intraData)->partSizes[d], bytes); +count += bytes; +} 
+memcpy(&(currIntraData)->modes, (intraData)->modes, curFrame->m_analysisData.numPartitions * analysis_data->numCUsInFrame); +} +else +{ +uint
[x265] [PATCH] analysis: update analysisReuseLevel 7 for analysis sharing
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1510561483 -19800 # Mon Nov 13 13:54:43 2017 +0530 # Node ID 02f21c4eafe13b52966f3fc1f925bb34070d647d # Parent 65eff30eb192d836b52edb5c3e2e6deae06dabf7 analysis: update analysisReuseLevel 7 for analysis sharing diff -r 65eff30eb192 -r 02f21c4eafe1 source/encoder/analysis.cpp --- a/source/encoder/analysis.cpp Mon Nov 13 12:20:50 2017 +0530 +++ b/source/encoder/analysis.cpp Mon Nov 13 13:54:43 2017 +0530 @@ -2301,7 +2301,7 @@ for (uint32_t part = 0; part < numPU; part++) { PredictionUnit pu(mode.cu, cuGeom, part); -if (m_param->analysisReuseLevel == 10) +if (m_param->analysisReuseLevel >= 7) { analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData; int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx; @@ -2407,7 +2407,7 @@ if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth) nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom)); -int lamdaQP = m_param->analysisReuseLevel == 10 ? nextQP : lqp; +int lamdaQP = m_param->analysisReuseLevel >= 7 ? nextQP : lqp; if (split) m_param->rdLevel > 4 ? 
compressInterCU_rd5_6(parentCTU, childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP); diff -r 65eff30eb192 -r 02f21c4eafe1 source/encoder/encoder.cpp --- a/source/encoder/encoder.cppMon Nov 13 12:20:50 2017 +0530 +++ b/source/encoder/encoder.cppMon Nov 13 13:54:43 2017 +0530 @@ -3038,7 +3038,7 @@ CHECKED_MALLOC(interData->mergeFlag, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); } -if (m_param->analysisReuseLevel == 10) +if (m_param->analysisReuseLevel >= 7) { CHECKED_MALLOC(interData->interDir, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); for (int dir = 0; dir < numDir; dir++) @@ -3115,7 +3115,7 @@ X265_FREE(((analysis_inter_data*)analysis->interData)->mergeFlag); X265_FREE(((analysis_inter_data*)analysis->interData)->partSize); } -if (m_param->analysisReuseLevel == 10) +if (m_param->analysisReuseLevel >= 7) { X265_FREE(((analysis_inter_data*)analysis->interData)->interDir); int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] API: 'x265_set_analysis_data' to capture analysis information
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1510555850 -19800 # Mon Nov 13 12:20:50 2017 +0530 # Node ID 65eff30eb192d836b52edb5c3e2e6deae06dabf7 # Parent 563cbe1f4a21dcfe2117ccaa874b713d94434f92 API: 'x265_set_analysis_data' to capture analysis information diff -r 563cbe1f4a21 -r 65eff30eb192 source/CMakeLists.txt --- a/source/CMakeLists.txt Wed Nov 08 17:08:18 2017 +0530 +++ b/source/CMakeLists.txt Mon Nov 13 12:20:50 2017 +0530 @@ -29,7 +29,7 @@ option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 143) +set(X265_BUILD 144) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" diff -r 563cbe1f4a21 -r 65eff30eb192 source/encoder/api.cpp --- a/source/encoder/api.cppWed Nov 08 17:08:18 2017 +0530 +++ b/source/encoder/api.cppMon Nov 13 12:20:50 2017 +0530 @@ -365,6 +365,18 @@ return encoder->getRefFrameList((PicYuv**)l0, (PicYuv**)l1, sliceType, poc); } +int x265_set_analysis_data(x265_encoder *enc, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes) +{ +if (!enc) +return -1; + +Encoder *encoder = static_cast<Encoder*>(enc); +if (!encoder->setAnalysisData(analysis_data, poc, cuBytes)) +return 0; + +return -1; +} + void x265_cleanup(void) { BitCost::destroy(); @@ -444,6 +456,7 @@ _csvlog_frame, _csvlog_encode, _dither_image, +_set_analysis_data }; typedef const x265_api* (*api_get_func)(int bitDepth); diff -r 563cbe1f4a21 -r 65eff30eb192 source/encoder/encoder.cpp --- a/source/encoder/encoder.cppWed Nov 08 17:08:18 2017 +0530 +++ b/source/encoder/encoder.cppMon Nov 13 12:20:50 2017 +0530 @@ -576,6 +576,88 @@ return 0; } +int Encoder::setAnalysisData(x265_analysis_data *analysis_data, int poc, uint32_t cuBytes) +{ +uint32_t widthInCU = (m_param->sourceWidth + 
m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; +uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + +Frame* curFrame = m_dpb->m_picList.getPOC(poc); +if (curFrame != NULL) +{ +curFrame->m_analysisData = (*analysis_data); +curFrame->m_analysisData.numCUsInFrame = widthInCU * heightInCU; +curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions; +allocAnalysis(>m_analysisData); +if (m_param->maxCUSize == 16) +{ +if (analysis_data->sliceType == X265_TYPE_IDR || analysis_data->sliceType == X265_TYPE_I) +{ +curFrame->m_analysisData.sliceType = X265_TYPE_I; +if (m_param->analysisReuseLevel < 2) +return -1; + +curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions; +size_t count = 0; +analysis_intra_data * currIntraData = (analysis_intra_data *)curFrame->m_analysisData.intraData; +analysis_intra_data * intraData = (analysis_intra_data *)analysis_data->intraData; +for (uint32_t d = 0; d < cuBytes; d++) +{ +int bytes = curFrame->m_analysisData.numPartitions >> ((intraData)->depth[d] * 2); +memset(&(currIntraData)->depth[count], (intraData)->depth[d], bytes); +memset(&(currIntraData)->chromaModes[count], (intraData)->chromaModes[d], bytes); +memset(&(currIntraData)->partSizes[count], (intraData)->partSizes[d], bytes); +memset(&(currIntraData)->partSizes[count], (intraData)->partSizes[d], bytes); +count += bytes; +} +memcpy(&(currIntraData)->modes, (intraData)->modes, curFrame->m_analysisData.numPartitions * analysis_data->numCUsInFrame); +} +else +{ +uint32_t numDir = analysis_data->sliceType == X265_TYPE_P ? 
1 : 2; +if (m_param->analysisReuseLevel < 2) +return -1; + +curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions; +size_t count = 0; +analysis_inter_data * currInterData = (analysis_inter_data *)curFrame->m_analysisData.interData; +analysis_inter_data * interData = (analysis_inter_data *)analysis_data->interData; +for (uint32_t d = 0; d < cuBytes; d++) +{ +int bytes = curFrame->m_analysi
[x265] [PATCH] analysis: update analysisReuseLevel 7 for analysis sharing
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1510561483 -19800 # Mon Nov 13 13:54:43 2017 +0530 # Node ID 67f2dd6203ff8a1e2271ef0ac052ac5f1ad99110 # Parent 5ea4fdbdea99a8bdd91d0d7961bcf50764d445b6 analysis: update analysisReuseLevel 7 for analysis sharing diff -r 5ea4fdbdea99 -r 67f2dd6203ff source/encoder/analysis.cpp --- a/source/encoder/analysis.cpp Mon Nov 13 12:20:50 2017 +0530 +++ b/source/encoder/analysis.cpp Mon Nov 13 13:54:43 2017 +0530 @@ -2301,7 +2301,7 @@ for (uint32_t part = 0; part < numPU; part++) { PredictionUnit pu(mode.cu, cuGeom, part); -if (m_param->analysisReuseLevel == 10) +if (m_param->analysisReuseLevel >= 7) { analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData; int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx; @@ -2407,7 +2407,7 @@ if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth) nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom)); -int lamdaQP = m_param->analysisReuseLevel == 10 ? nextQP : lqp; +int lamdaQP = m_param->analysisReuseLevel >= 7 ? nextQP : lqp; if (split) m_param->rdLevel > 4 ? 
compressInterCU_rd5_6(parentCTU, childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP); diff -r 5ea4fdbdea99 -r 67f2dd6203ff source/encoder/encoder.cpp --- a/source/encoder/encoder.cppMon Nov 13 12:20:50 2017 +0530 +++ b/source/encoder/encoder.cppMon Nov 13 13:54:43 2017 +0530 @@ -3035,7 +3035,7 @@ CHECKED_MALLOC(interData->mergeFlag, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); } -if (m_param->analysisReuseLevel == 10) +if (m_param->analysisReuseLevel >= 7) { CHECKED_MALLOC(interData->interDir, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); for (int dir = 0; dir < numDir; dir++) @@ -3112,7 +3112,7 @@ X265_FREE(((analysis_inter_data*)analysis->interData)->mergeFlag); X265_FREE(((analysis_inter_data*)analysis->interData)->partSize); } -if (m_param->analysisReuseLevel == 10) +if (m_param->analysisReuseLevel >= 7) { X265_FREE(((analysis_inter_data*)analysis->interData)->interDir); int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] API: 'x265_set_analysis_data' to capture analysis information
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1510555850 -19800 # Mon Nov 13 12:20:50 2017 +0530 # Node ID 5ea4fdbdea99a8bdd91d0d7961bcf50764d445b6 # Parent 563cbe1f4a21dcfe2117ccaa874b713d94434f92 API: 'x265_set_analysis_data' to capture analysis information diff -r 563cbe1f4a21 -r 5ea4fdbdea99 source/CMakeLists.txt --- a/source/CMakeLists.txt Wed Nov 08 17:08:18 2017 +0530 +++ b/source/CMakeLists.txt Mon Nov 13 12:20:50 2017 +0530 @@ -29,7 +29,7 @@ option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 143) +set(X265_BUILD 144) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" diff -r 563cbe1f4a21 -r 5ea4fdbdea99 source/encoder/api.cpp --- a/source/encoder/api.cppWed Nov 08 17:08:18 2017 +0530 +++ b/source/encoder/api.cppMon Nov 13 12:20:50 2017 +0530 @@ -365,6 +365,18 @@ return encoder->getRefFrameList((PicYuv**)l0, (PicYuv**)l1, sliceType, poc); } +int x265_set_analysis_data(x265_encoder *enc, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes) +{ +if (!enc) +return -1; + +Encoder *encoder = static_cast<Encoder*>(enc); +if (!encoder->setAnalysisData(analysis_data, poc, cuBytes)) +return 0; + +return -1; +} + void x265_cleanup(void) { BitCost::destroy(); @@ -444,6 +456,7 @@ _csvlog_frame, _csvlog_encode, _dither_image, +_set_analysis_data }; typedef const x265_api* (*api_get_func)(int bitDepth); diff -r 563cbe1f4a21 -r 5ea4fdbdea99 source/encoder/encoder.cpp --- a/source/encoder/encoder.cppWed Nov 08 17:08:18 2017 +0530 +++ b/source/encoder/encoder.cppMon Nov 13 12:20:50 2017 +0530 @@ -576,6 +576,85 @@ return 0; } +int Encoder::setAnalysisData(x265_analysis_data *analysis_data, int poc, uint32_t cuBytes) +{ +uint32_t widthInCU = (m_param->sourceWidth + 
m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; +uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + +Frame* curFrame = m_dpb->m_picList.getPOC(poc); +if (curFrame != NULL) +{ +curFrame->m_analysisData = (*analysis_data); +curFrame->m_analysisData.numCUsInFrame = widthInCU * heightInCU; +curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions; +allocAnalysis(>m_analysisData); +if (m_param->maxCUSize == 16) +{ +if (analysis_data->sliceType == X265_TYPE_IDR || analysis_data->sliceType == X265_TYPE_I) +{ +curFrame->m_analysisData.sliceType = X265_TYPE_I; +if (m_param->analysisReuseLevel < 2) +return -1; + +curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions; +size_t count = 0; +for (uint32_t d = 0; d < cuBytes; d++) +{ +int bytes = curFrame->m_analysisData.numPartitions >> (((analysis_intra_data *)analysis_data->intraData)->depth[d] * 2); +memset(&((analysis_intra_data *)curFrame->m_analysisData.intraData)->depth[count], ((analysis_intra_data *)analysis_data->intraData)->depth[d], bytes); +memset(&((analysis_intra_data *)curFrame->m_analysisData.intraData)->chromaModes[count], ((analysis_intra_data *)analysis_data->intraData)->chromaModes[d], bytes); +memset(&((analysis_intra_data *)curFrame->m_analysisData.intraData)->partSizes[count], ((analysis_intra_data *)analysis_data->intraData)->partSizes[d], bytes); +memset(&((analysis_intra_data *)curFrame->m_analysisData.intraData)->partSizes[count], ((analysis_intra_data *)analysis_data->intraData)->partSizes[d], bytes); +count += bytes; +} +memcpy(&((analysis_intra_data *)curFrame->m_analysisData.intraData)->modes, ((analysis_intra_data *)analysis_data->intraData)->modes, curFrame->m_analysisData.numPartitions * analysis_data->numCUsInFrame); +} +else +{ +uint32_t numDir = analysis_data->sliceType == X265_TYPE_P ? 
1 : 2; +if (m_param->analysisReuseLevel < 2) +return -1; + +curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions; +size_t count = 0; +for (uint32_t d = 0; d < cuBytes; d++) +{ +int bytes =
[x265] [PATCH] api: 'x265_get_ref_frame_list' to get forward and backward reference list
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1509446629 -19800 # Tue Oct 31 16:13:49 2017 +0530 # Node ID 6ad93877ffe19cd6cf285f0cc8189f41dce606b8 # Parent de91aae2db5353e4e548d002e2dce530a6c8078d api: 'x265_get_ref_frame_list' to get forward and backward refrence list diff -r de91aae2db53 -r 6ad93877ffe1 doc/reST/api.rst --- a/doc/reST/api.rst Tue Oct 31 13:57:37 2017 +0530 +++ b/doc/reST/api.rst Tue Oct 31 16:13:49 2017 +0530 @@ -201,6 +201,13 @@ * This API must be called after(poc >= lookaheadDepth + bframes + 2) condition check. */ int x265_get_slicetype_poc_and_scenecut(x265_encoder *encoder, int *slicetype, int *poc, int* sceneCut); +**x265_get_ref_frame_list()** may be used to fetch forward and backward refrence list:: + +/* x265_get_ref_frame_list: + * returns negative on error, 0 when access unit were output. + * This API must be called after(poc >= lookaheadDepth + bframes + 2) condition check */ + int x265_get_ref_frame_list(x265_encoder *encoder, x265_picyuv**, x265_picyuv**, int, int); + **x265_encoder_ctu_info** /* x265_encoder_ctu_info: *Copy CTU information such as ctu address and ctu partition structure of all diff -r de91aae2db53 -r 6ad93877ffe1 source/CMakeLists.txt --- a/source/CMakeLists.txt Tue Oct 31 13:57:37 2017 +0530 +++ b/source/CMakeLists.txt Tue Oct 31 16:13:49 2017 +0530 @@ -29,7 +29,7 @@ option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 137) +set(X265_BUILD 138) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" diff -r de91aae2db53 -r 6ad93877ffe1 source/common/frame.h --- a/source/common/frame.h Tue Oct 31 13:57:37 2017 +0530 +++ b/source/common/frame.h Tue Oct 31 16:13:49 2017 +0530 @@ -98,6 +98,7 @@ float* m_quantOffsets; // points to 
quantOffsets in x265_picture x265_sei m_userSEI; +Event m_reconEncoded; /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */ ThreadSafeInteger* m_reconRowFlag; // flag of CTU rows completely reconstructed and extended for motion reference diff -r de91aae2db53 -r 6ad93877ffe1 source/common/picyuv.h --- a/source/common/picyuv.hTue Oct 31 13:57:37 2017 +0530 +++ b/source/common/picyuv.hTue Oct 31 16:13:49 2017 +0530 @@ -27,6 +27,7 @@ #include "common.h" #include "md5.h" #include "x265.h" +struct x265_picyuv {}; namespace X265_NS { // private namespace @@ -34,7 +35,7 @@ class ShortYuv; struct SPS; -class PicYuv +class PicYuv : public x265_picyuv { public: diff -r de91aae2db53 -r 6ad93877ffe1 source/encoder/api.cpp --- a/source/encoder/api.cppTue Oct 31 13:57:37 2017 +0530 +++ b/source/encoder/api.cppTue Oct 31 16:13:49 2017 +0530 @@ -350,6 +350,15 @@ return -1; } +int x265_get_ref_frame_list(x265_encoder *enc, x265_picyuv** l0, x265_picyuv** l1, int sliceType, int poc) +{ +if (!enc) +return -1; + +Encoder *encoder = static_cast<Encoder*>(enc); +return encoder->getRefFrameList((PicYuv**)l0, (PicYuv**)l1, sliceType, poc); +} + void x265_cleanup(void) { BitCost::destroy(); @@ -424,6 +433,7 @@ _encoder_intra_refresh, _encoder_ctu_info, _get_slicetype_poc_and_scenecut, +_get_ref_frame_list, }; typedef const x265_api* (*api_get_func)(int bitDepth); diff -r de91aae2db53 -r 6ad93877ffe1 source/encoder/encoder.cpp --- a/source/encoder/encoder.cppTue Oct 31 13:57:37 2017 +0530 +++ b/source/encoder/encoder.cppTue Oct 31 16:13:49 2017 +0530 @@ -446,6 +446,47 @@ return 0; } +int Encoder::getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc) +{ +if (!(IS_X265_TYPE_I(sliceType))) +{ +Frame *framePtr = m_dpb->m_picList.getPOC(poc); +if (framePtr != NULL) +{ +for (int j = 0; j < framePtr->m_encData->m_slice->m_numRefIdx[0]; j++)// check only for --ref=n number of frames. 
+{ +if (framePtr->m_encData->m_slice->m_refFrameList[0][j] && framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_reconPic != NULL) +{ +int l0POC = framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_poc; +Frame* l0Fp = m_dpb->m_picList.getPOC(l0POC); +if (l0Fp->m_reconPic->m_picOrg[0] == NULL) +l0Fp->m_reconEncoded.wait(); /* If recon is not ready, current frame encoder need to wait. */ +
[x265] [PATCH] api: 'x265_get_slicetype_poc_and_scenecut' to fetch slicetype, poc and scenecut information
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1509438457 -19800 # Tue Oct 31 13:57:37 2017 +0530 # Node ID de91aae2db5353e4e548d002e2dce530a6c8078d # Parent 6a310b24c6a2d831ef08bbda1bdcf9d929daa308 api: 'x265_get_slicetype_poc_and_scenecut' to fetch slicetype, poc and scenecut information diff -r 6a310b24c6a2 -r de91aae2db53 doc/reST/api.rst --- a/doc/reST/api.rst Thu Nov 02 12:17:29 2017 +0530 +++ b/doc/reST/api.rst Tue Oct 31 13:57:37 2017 +0530 @@ -192,6 +192,15 @@ * presets is not recommended without a more fine-grained breakdown of * parameters to take this into account. */ int x265_encoder_reconfig(x265_encoder *, x265_param *); + +**x265_get_slicetype_poc_and_scenecut()** may be used to fetch slice type, poc and scene cut information mid-encode:: + +/* x265_get_slicetype_poc_and_scenecut: + * get the slice type, poc and scene cut information for the current frame, + * returns negative on error, 0 on success. + * This API must be called after(poc >= lookaheadDepth + bframes + 2) condition check. 
*/ + int x265_get_slicetype_poc_and_scenecut(x265_encoder *encoder, int *slicetype, int *poc, int* sceneCut); + **x265_encoder_ctu_info** /* x265_encoder_ctu_info: *Copy CTU information such as ctu address and ctu partition structure of all diff -r 6a310b24c6a2 -r de91aae2db53 source/CMakeLists.txt --- a/source/CMakeLists.txt Thu Nov 02 12:17:29 2017 +0530 +++ b/source/CMakeLists.txt Tue Oct 31 13:57:37 2017 +0530 @@ -29,7 +29,7 @@ option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 136) +set(X265_BUILD 137) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" diff -r 6a310b24c6a2 -r de91aae2db53 source/common/piclist.cpp --- a/source/common/piclist.cpp Thu Nov 02 12:17:29 2017 +0530 +++ b/source/common/piclist.cpp Tue Oct 31 13:57:37 2017 +0530 @@ -117,6 +117,15 @@ return NULL; } +Frame* PicList::getCurFrame(void) +{ +Frame *curFrame = m_start; +if (curFrame != NULL) +return curFrame; +else +return NULL; +} + void PicList::remove(Frame& curFrame) { #if _DEBUG diff -r 6a310b24c6a2 -r de91aae2db53 source/common/piclist.h --- a/source/common/piclist.h Thu Nov 02 12:17:29 2017 +0530 +++ b/source/common/piclist.h Tue Oct 31 13:57:37 2017 +0530 @@ -62,6 +62,9 @@ /** Find frame with specified POC */ Frame* getPOC(int poc); +/** Get the current Frame from the list **/ +Frame* getCurFrame(void); + /** Remove picture from list */ void remove(Frame& pic); diff -r 6a310b24c6a2 -r de91aae2db53 source/encoder/api.cpp --- a/source/encoder/api.cppThu Nov 02 12:17:29 2017 +0530 +++ b/source/encoder/api.cppTue Oct 31 13:57:37 2017 +0530 @@ -340,6 +340,16 @@ return 0; } +int x265_get_slicetype_poc_and_scenecut(x265_encoder *enc, int *slicetype, int *poc, int *sceneCut) +{ +if (!enc) +return -1; +Encoder *encoder = 
static_cast<Encoder*>(enc); +if (!encoder->copySlicetypePocAndSceneCut(slicetype, poc, sceneCut)) +return 0; +return -1; +} + void x265_cleanup(void) { BitCost::destroy(); @@ -413,6 +423,7 @@ sizeof(x265_frame_stats), _encoder_intra_refresh, _encoder_ctu_info, +_get_slicetype_poc_and_scenecut, }; typedef const x265_api* (*api_get_func)(int bitDepth); diff -r 6a310b24c6a2 -r de91aae2db53 source/encoder/encoder.cpp --- a/source/encoder/encoder.cppThu Nov 02 12:17:29 2017 +0530 +++ b/source/encoder/encoder.cppTue Oct 31 13:57:37 2017 +0530 @@ -429,6 +429,23 @@ } } +int Encoder::copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut) +{ +Frame *FramePtr = m_dpb->m_picList.getCurFrame(); +if (FramePtr != NULL) +{ +*slicetype = FramePtr->m_lowres.sliceType; +*poc = FramePtr->m_encData->m_slice->m_poc; +*sceneCut = FramePtr->m_lowres.bScenecut; +} +else +{ +x265_log(NULL, X265_LOG_WARNING, "Frame is still in lookahead pipeline, this API must be called after (poc >= lookaheadDepth + bframes + 2) condition check\n"); +return -1; +} +return 0; +} + void Encoder::destroy() { #if ENABLE_HDR10_PLUS diff -r 6a310b24c6a2 -r de91aae2db53 source/encoder/encoder.h --- a/source/encoder/encoder.h Thu Nov 02 12:17:29 2017 +0530 +++ b/source/encoder/encoder.h Tue Oct 31 13:57:37 2017 +0530 @@ -205,6 +205,8 @@ void copyCtuInfo(x265_ctu_info_t** frameCtuInfo, int poc); +int copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *scene
[x265] [PATCH] avx2: integral_init4h -> added 'INTEGRAL_FOUR_HORIZONTAL_4' macro to reduce data movement for '4' element case
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1497422024 -19800 # Wed Jun 14 12:03:44 2017 +0530 # Node ID 65e038ecbbf63a2f449ccf52358c5fbbec408b27 # Parent 28bbc349d17035a3c1fcdfbdca3b8e21ae6b669b avx2: integral_init4h -> added 'INTEGRAL_FOUR_HORIZONTAL_4' macro to reduce data movement for '4' element case diff -r 28bbc349d170 -r 65e038ecbbf6 source/common/x86/seaintegral.asm --- a/source/common/x86/seaintegral.asm Wed Jun 07 17:06:57 2017 +0530 +++ b/source/common/x86/seaintegral.asm Wed Jun 14 12:03:44 2017 +0530 @@ -148,11 +148,6 @@ jnz .loop RET -;- -;static void integral_init4h_c(uint32_t *sum, pixel *pix, intptr_t stride) -;- -INIT_YMM avx2 - %macro INTEGRAL_FOUR_HORIZONTAL_16 0 pmovzxbw m0, [r1] pmovzxbw m1, [r1 + 1] @@ -163,6 +158,24 @@ paddw m0, m1 %endmacro +%macro INTEGRAL_FOUR_HORIZONTAL_4 0 +movd xm0, [r1] +movd xm1, [r1 + 1] +pmovzxbw xm0, xm0 +pmovzxbw xm1, xm1 +paddw xm0, xm1 +movd xm1, [r1 + 2] +pmovzxbw xm1, xm1 +paddw xm0, xm1 +movd xm1, [r1 + 3] +pmovzxbw xm1, xm1 +paddw xm0, xm1 +%endmacro + +;- +;static void integral_init4h(uint32_t *sum, pixel *pix, intptr_t stride) +;- +INIT_YMM avx2 cglobal integral4h, 3, 5, 3 lear3, [4 * r2] subr0, r3 @@ -205,7 +218,7 @@ jmp .end .loop_4: -INTEGRAL_FOUR_HORIZONTAL_16 +INTEGRAL_FOUR_HORIZONTAL_4 pmovzxwd xm0, xm0 movu xm1, [r0] paddd xm0, xm1 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] avx2: integral_init4h -> added 'INTEGRAL_FOUR_HORIZONTAL_4' macro to reduce data movement for '4' element case
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1497417160 -19800 # Wed Jun 14 10:42:40 2017 +0530 # Node ID f6f920ab5be6e00b9c32ac225959fc6b9f68d36b # Parent 28bbc349d17035a3c1fcdfbdca3b8e21ae6b669b avx2: integral_init4h -> added 'INTEGRAL_FOUR_HORIZONTAL_4' macro to reduce data movement for '4' element case diff -r 28bbc349d170 -r f6f920ab5be6 source/common/x86/seaintegral.asm --- a/source/common/x86/seaintegral.asm Wed Jun 07 17:06:57 2017 +0530 +++ b/source/common/x86/seaintegral.asm Wed Jun 14 10:42:40 2017 +0530 @@ -148,11 +148,6 @@ jnz .loop RET -;- -;static void integral_init4h_c(uint32_t *sum, pixel *pix, intptr_t stride) -;- -INIT_YMM avx2 - %macro INTEGRAL_FOUR_HORIZONTAL_16 0 pmovzxbw m0, [r1] pmovzxbw m1, [r1 + 1] @@ -163,6 +158,23 @@ paddw m0, m1 %endmacro +%macro INTEGRAL_FOUR_HORIZONTAL_4 0 +movd xm0, [r1] +movd xm1, [r1 + 1] +pmovzxbw xm0, xm0 +pmovzxbw xm1, xm1 +paddw xm0, xm1 +movd xm1, [r1 + 2] +pmovzxbw xm1, xm1 +paddw xm0, xm1 +movd xm1, [r1 + 3] +paddw xm0, xm1 +%endmacro + +;- +;static void integral_init4h(uint32_t *sum, pixel *pix, intptr_t stride) +;- +INIT_YMM avx2 cglobal integral4h, 3, 5, 3 lear3, [4 * r2] subr0, r3 @@ -205,7 +217,7 @@ jmp .end .loop_4: -INTEGRAL_FOUR_HORIZONTAL_16 +INTEGRAL_FOUR_HORIZONTAL_4 pmovzxwd xm0, xm0 movu xm1, [r0] paddd xm0, xm1 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] avx2: 'integral4v' asm code -> 7.48x faster than 'C' version
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1493905428 -19800 # Thu May 04 19:13:48 2017 +0530 # Node ID 41611825c2f4661536500e1306db7d8c4bf7fd07 # Parent 48502979a4b21f6982dcdacbf7796bf5d9fb395c avx2: 'integral4v' asm code -> 7.48x faster than 'C' version integral_init4v 7.48x202.53 1515.14 diff -r 48502979a4b2 -r 41611825c2f4 source/common/x86/seaintegral.asm --- a/source/common/x86/seaintegral.asm Wed May 03 11:26:26 2017 +0530 +++ b/source/common/x86/seaintegral.asm Thu May 04 19:13:48 2017 +0530 @@ -32,8 +32,19 @@ ;void integral_init4v_c(uint32_t *sum4, intptr_t stride) ;- INIT_YMM avx2 -cglobal integral4v, 2, 2, 0 - +cglobal integral4v, 2, 3, 2 +mov r2, r1 +shl r2, 4 + +.loop +movum0, [r0] +movum1, [r0 + r2] +psubd m1, m0 +movu[r0], m1 +add r0, 32 +sub r1, 8 +cmp r1, 0 +jnz .loop RET ;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] Fwd: [PATCH 3 of 3] SEA motion search:integralv functions avx2 implementation
-- Forwarded message -- From:Date: Tue, May 2, 2017 at 3:16 PM Subject: [x265] [PATCH 3 of 3] SEA motion search:integralv functions avx2 implementation To: x265-devel@videolan.org # HG changeset patch # User Vignesh Vijayakumar # Date 1493121121 -19800 # Tue Apr 25 17:22:01 2017 +0530 # Node ID e5ee88d08fcedee83efa63869a5a346c711a0e3d # Parent 1afc127e62b4502c8f052ee989843c64b45ffc56 SEA motion search:integralv functions avx2 implementation diff -r 1afc127e62b4 -r e5ee88d08fce source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Fri Apr 28 11:22:29 2017 +0530 +++ b/source/common/CMakeLists.txt Tue Apr 25 17:22:01 2017 +0530 @@ -57,10 +57,10 @@ set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES}) source_group(Intrinsics FILES ${VEC_PRIMITIVES}) -set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h) +set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h seaintegral.h) set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm mc-a2.asm pixel-util8.asm blockcopy8.asm - pixeladd8.asm dct8.asm) + pixeladd8.asm dct8.asm seaintegral.asm) if(HIGH_BIT_DEPTH) set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm loopfilter.asm) else() diff -r 1afc127e62b4 -r e5ee88d08fce source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Apr 28 11:22:29 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Apr 25 17:22:01 2017 +0530 @@ -2158,6 +2158,13 @@ p.fix8Unpack = PFX(cutree_fix8_unpack_avx2); p.fix8Pack = PFX(cutree_fix8_pack_avx2); +p.integral_init4v = PFX(integral4v_avx2); +p.integral_init8v = PFX(integral8v_avx2); +p.integral_init12v = PFX(integral12v_avx2); +p.integral_init16v = PFX(integral16v_avx2); +p.integral_init24v = PFX(integral24v_avx2); +p.integral_init32v = PFX(integral32v_avx2); + /* TODO: This kernel needs to be modified to work with HIGH_BIT_DEPTH only p.planeClipAndMax = PFX(planeClipAndMax_avx2); */ @@ -2178,6 +2185,7 @@ 
p.costCoeffNxN = PFX(costCoeffNxN_avx2_bmi2); } } + } #else // if HIGH_BIT_DEPTH @@ -3696,6 +3704,13 @@ p.fix8Unpack = PFX(cutree_fix8_unpack_avx2); p.fix8Pack = PFX(cutree_fix8_pack_avx2); +p.integral_init4v = PFX(integral4v_avx2); +p.integral_init8v = PFX(integral8v_avx2); +p.integral_init12v = PFX(integral12v_avx2); +p.integral_init16v = PFX(integral16v_avx2); +p.integral_init24v = PFX(integral24v_avx2); +p.integral_init32v = PFX(integral32v_avx2); + } #endif } diff -r 1afc127e62b4 -r e5ee88d08fce source/common/x86/seaintegral.asm --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/x86/seaintegral.asm Tue Apr 25 17:22:01 2017 +0530 @@ -0,0 +1,155 @@ +;** *** +;* Copyright (C) 2013-2017 MulticoreWare, Inc +;* +;* Authors: Jayashri Murugan +;* Vignesh V Menon +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. +;** ***/ + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION .text + +;-- --- +;void integral_init4v_c(uint32_t *sum4, intptr_t stride) +;-- --- +INIT_YMM avx2 +cglobal integral4v, 2, 4, 2 + +mov r2, 0 xor will be faster method of clearing a register. 
+mov r3, r1 What are the possible values of stride here: is it an arbitrary number or a multiple of a specific number? +shl r3, 4 + +.loop: +movu m0, [r0] +movu m1, [r0 + r3] +psubd m0, m1, m0 +movu [r0], m0 +add r2, 8 +add r0, 32 +cmp r2, r1 +jl .loop +RET +
[x265] Fwd: [PATCH 2 of 3] SEA motion search:Add testbench for integralv functions
-- Forwarded message -- From:Date: 2017-05-02 15:16 GMT+05:30 Subject: [x265] [PATCH 2 of 3] SEA motion search:Add testbench for integralv functions To: x265-devel@videolan.org # HG changeset patch # User Vignesh Vijayakumar # Date 1493358749 -19800 # Fri Apr 28 11:22:29 2017 +0530 # Node ID 1afc127e62b4502c8f052ee989843c64b45ffc56 # Parent cb67dffd0e2a596c8d3c6d042b8e6c532487d427 SEA motion search:Add testbench for integralv functions diff -r cb67dffd0e2a -r 1afc127e62b4 source/test/pixelharness.cpp --- a/source/test/pixelharness.cpp Tue May 02 09:58:13 2017 +0530 +++ b/source/test/pixelharness.cpp Fri Apr 28 11:22:29 2017 +0530 @@ -2003,6 +2003,228 @@ return true; } +bool PixelHarness::check_integral_init4v(integral4v_t ref, integral4v_t opt) +{ +intptr_t srcStep = 64; +int j = 0; >> +uint32_t sum_ans[BUFFSIZE] = { 0 }; >> +uint32_t sum_ans1[BUFFSIZE] = { 0 }; Better names please, check existing naming conventions. + +for (int i = 0; i < 64; i++) +{ +sum_ans[i] = pixel_test_buff[0][i]; +sum_ans1[i] = pixel_test_buff[0][i]; +} +for (int i = 0, k = 0; i < BUFFSIZE; i++) +{ +if (i % 64 == 0) +k++; +sum_ans[i] = sum_ans[i % 64] + k; +sum_ans1[i] = sum_ans1[i % 64] + k; +} +int padx = 4; +int pady = 4; +uint32_t *sum_ans_ptr = sum_ans + srcStep * pady + padx; +uint32_t *sum_ans1_ptr = sum_ans1 + srcStep * pady + padx; +for (int i = 0; i < ITERS; i++) +{ +ref(sum_ans_ptr, srcStep); +checked(opt, sum_ans1_ptr, srcStep); + +if (memcmp(sum_ans, sum_ans1, sizeof(uint32_t) * BUFFSIZE)) +return false; + +reportfail() +j += INCR; +} +return true; +} + +bool PixelHarness::check_integral_init8v(integral8v_t ref, integral8v_t opt) + { +intptr_t srcStep = 64; +int j = 0; +uint32_t sum_ans[BUFFSIZE] = { 0 }; +uint32_t sum_ans1[BUFFSIZE] = { 0 }; + +for (int i = 0; i < 64; i++) +{ +sum_ans[i] = pixel_test_buff[0][i]; +sum_ans1[i] = pixel_test_buff[0][i]; +} +for (int i = 0, k = 0; i < BUFFSIZE; i++) +{ +if (i % 64 == 0) +k++; +sum_ans[i] = sum_ans[i % 64] + k; +sum_ans1[i] = 
sum_ans1[i % 64] + k; +} +int padx = 4; +int pady = 4; +uint32_t *sum_ans_ptr = sum_ans + srcStep * pady + padx; +uint32_t *sum_ans1_ptr = sum_ans1 + srcStep * pady + padx; +for (int i = 0; i < ITERS; i++) +{ +ref(sum_ans_ptr, srcStep); +checked(opt, sum_ans1_ptr, srcStep); + +if (memcmp(sum_ans, sum_ans1, sizeof(uint32_t) * BUFFSIZE)) +return false; + +reportfail() +j += INCR; +} +return true; +} + +bool PixelHarness::check_integral_init12v(integral12v_t ref, integral12v_t opt) + { +intptr_t srcStep = 64; +int j = 0; +uint32_t sum_ans[BUFFSIZE] = { 0 }; +uint32_t sum_ans1[BUFFSIZE] = { 0 }; + +for (int i = 0; i < 64; i++) +{ +sum_ans[i] = pixel_test_buff[0][i]; +sum_ans1[i] = pixel_test_buff[0][i]; +} +for (int i = 0, k = 0; i < BUFFSIZE; i++) +{ +if (i % 64 == 0) +k++; +sum_ans[i] = sum_ans[i % 64] + k; +sum_ans1[i] = sum_ans1[i % 64] + k; +} +int padx = 4; +int pady = 4; +uint32_t *sum_ans_ptr = sum_ans + srcStep * pady + padx; +uint32_t *sum_ans1_ptr = sum_ans1 + srcStep * pady + padx; +for (int i = 0; i < ITERS; i++) +{ +ref(sum_ans_ptr, srcStep); +checked(opt, sum_ans1_ptr, srcStep); + +if (memcmp(sum_ans, sum_ans1, sizeof(uint32_t) * BUFFSIZE)) +return false; + +reportfail() +j += INCR; +} +return true; +} + +bool PixelHarness::check_integral_init16v(integral16v_t ref, integral16v_t opt) +{ +intptr_t srcStep = 64; +int j = 0; +uint32_t sum_ans[BUFFSIZE] = { 0 }; +uint32_t sum_ans1[BUFFSIZE] = { 0 }; + +for (int i = 0; i < 64; i++) +{ +sum_ans[i] = pixel_test_buff[0][i]; +sum_ans1[i] = pixel_test_buff[0][i]; +} +for (int i = 0, k = 0; i < BUFFSIZE; i++) +{ +if (i % 64 == 0) +k++; +sum_ans[i] = sum_ans[i % 64] + k; +sum_ans1[i] = sum_ans1[i % 64] + k; +} +int padx = 4; +int pady = 4; +uint32_t *sum_ans_ptr = sum_ans + srcStep * pady + padx; +uint32_t *sum_ans1_ptr = sum_ans1 + srcStep * pady + padx; +for (int i = 0; i < ITERS; i++) +{ +ref(sum_ans_ptr, srcStep); +checked(opt, sum_ans1_ptr, srcStep); + +if (memcmp(sum_ans, sum_ans1, sizeof(uint32_t) * 
BUFFSIZE)) +return false; + +reportfail() +j += INCR; +} +return true; +} + +bool PixelHarness::check_integral_init24v(integral24v_t ref, integral24v_t opt) +{ +intptr_t srcStep = 64; +int j = 0; +uint32_t
[x265] Fwd: [PATCH 1 of 3] SEA motion search:Setup asm primitives for integral calculation
-- Forwarded message -- From:Date: Tue, May 2, 2017 at 3:16 PM Subject: [x265] [PATCH 1 of 3] SEA motion search:Setup asm primitives for integral calculation To: x265-devel@videolan.org # HG changeset patch # User Vignesh Vijayakumar # Date 1493699293 -19800 # Tue May 02 09:58:13 2017 +0530 # Node ID cb67dffd0e2a596c8d3c6d042b8e6c532487d427 # Parent 5bc5e73760cdb61d2674e74cc52149fa0603af8a SEA motion search:Setup asm primitives for integral calculation diff -r 5bc5e73760cd -r cb67dffd0e2a source/common/primitives.cpp --- a/source/common/primitives.cpp Sat Apr 22 17:00:28 2017 -0700 +++ b/source/common/primitives.cpp Tue May 02 09:58:13 2017 +0530 @@ -57,6 +57,7 @@ void setupIntraPrimitives_c(EncoderPrimitives ); void setupLoopFilterPrimitives_c(EncoderPrimitives ); void setupSaoPrimitives_c(EncoderPrimitives ); +void setupSeaIntegralPrimitives_c(EncoderPrimitives ); void setupCPrimitives(EncoderPrimitives ) { @@ -66,6 +67,7 @@ setupIntraPrimitives_c(p); // intrapred.cpp setupLoopFilterPrimitives_c(p); // loopfilter.cpp setupSaoPrimitives_c(p);// sao.cpp +setupSeaIntegralPrimitives_c(p); // framefilter.cpp } void setupAliasPrimitives(EncoderPrimitives ) diff -r 5bc5e73760cd -r cb67dffd0e2a source/common/primitives.h --- a/source/common/primitives.hSat Apr 22 17:00:28 2017 -0700 +++ b/source/common/primitives.hTue May 02 09:58:13 2017 +0530 @@ -202,6 +202,18 @@ typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ); typedef void (*pelFilterChroma_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ); >> + typedef void(*integral4h_t)(uint32_t *sum, pixel *pix, intptr_t stride); >> +typedef void(*integral8h_t)(uint32_t *sum, pixel *pix, intptr_t stride); >> +typedef void(*integral12h_t)(uint32_t *sum, pixel *pix, intptr_t stride); >> +typedef void(*integral16h_t)(uint32_t *sum, pixel *pix, intptr_t stride); >> +typedef void(*integral24h_t)(uint32_t *sum, pixel *pix, intptr_t 
stride); >> +typedef void(*integral32h_t)(uint32_t *sum, pixel *pix, intptr_t stride); >> + typedef void(*integral4v_t)(uint32_t *sum, intptr_t stride); >> +typedef void(*integral8v_t)(uint32_t *sum, intptr_t stride); >> +typedef void(*integral12v_t)(uint32_t *sum, intptr_t stride); >> +typedef void(*integral16v_t)(uint32_t *sum, intptr_t stride); >> +typedef void(*integral24v_t)(uint32_t *sum, intptr_t stride); >> +typedef void(*integral32v_t)(uint32_t *sum, intptr_t stride); Only two typedefs are needed here, one for horizontal and one for vertical; the rest of the typedefs are redundant. /* Function pointers to optimized encoder primitives. Each pointer can reference * either an assembly routine, a SIMD intrinsic primitive, or a C function */ @@ -342,6 +354,19 @@ pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1 pelFilterChroma_t pelFilterChroma[2]; // EDGE_VER = 0, EDGE_HOR = 1 >> +integral4h_t integral_init4h; >> +integral8h_t integral_init8h; >> +integral12h_t integral_init12h; >> +integral16h_t integral_init16h; >> +integral24h_t integral_init24h; >> +integral32h_t integral_init32h; >> +integral4v_t integral_init4v; >> +integral8v_t integral_init8v; >> +integral12v_t integral_init12v; >> +integral16v_t integral_init16v; >> +integral24v_t integral_init24v; >> +integral32v_t integral_init32v; >> + Use an array of appropriate size for horizontal and another for vertical. /* There is one set of chroma primitives per color space. An encoder will * have just a single color space and thus it will only ever use one entry * in this array. 
However we always fill all entries in the array in case diff -r 5bc5e73760cd -r cb67dffd0e2a source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Sat Apr 22 17:00:28 2017 -0700 +++ b/source/common/x86/asm-primitives.cpp Tue May 02 09:58:13 2017 +0530 @@ -114,6 +114,7 @@ #include "blockcopy8.h" #include "intrapred.h" #include "dct8.h" +#include "seaintegral.h" } #define ALL_LUMA_CU_TYPED(prim, fncdef, fname, cpu) \ diff -r 5bc5e73760cd -r cb67dffd0e2a source/common/x86/seaintegral.h --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/x86/seaintegral.h Tue May 02 09:58:13 2017 +0530 @@ -0,0 +1,41 @@ +/** *** +* Copyright (C) 2013-2017 MulticoreWare, Inc +* +* Authors: Vignesh V Menon +* Jayashri Murugan +* +* This program is free software; you can redistribute it and/or modify +* it under the
Re: [x265] Interested in fast popcnt substitute below SSE4.2?
Hi Mario, Sorry for the late reply; you have shared interesting and useful information. Currently we are doing some experimental refactoring of the ASM code base, so it might take some time. Hoping to receive more posts like this. Regards, Praveen Tiwari On Wed, Mar 1, 2017 at 8:21 PM, Mario *LigH* Rohkrämer <cont...@ligh.de> wrote: > Apparently not interesting... > > > > Am 23.02.2017, 10:05 Uhr, schrieb Mario *LigH* Rohkrämer <cont...@ligh.de > >: > > Another point of view on this matter: >> >> http://danluu.com/assembly-intrinsics/ >> >> Seems to relativate the impact. >> >> I don't know if you already knew about all this before... >> >> >> Am 22.02.2017, 13:39 Uhr, schrieb Mario *LigH* Rohkrämer <cont...@ligh.de >> >: >> >> http://wm.ite.pl/articles/sse-popcount.html >>> >>> May even be faster than the popcnt instruction implemented in a >>> supporting CPU! >>> >>> Found via a German "conspiracy news" blog (no, that's not at all meant >>> seriously) which sometimes also mentions computer security issues and >>> interesting programming challenges: https://blog.fefe.de/?ts=a653b91f >>> >>> >> >> > > -- > > Fun and success! > Mario *LigH* Rohkrämer > mailto:cont...@ligh.de > > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] Threadpool-fix: correctly detect 'mac' os
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1486449333 -19800 # Tue Feb 07 12:05:33 2017 +0530 # Node ID 816af3e011a6194ca62bd24f03b514feffc3493c # Parent 20141129537b00b09d66f50082059a91b0b7e7f1 Threadpool-fix: correctly detect 'mac' os diff -r 20141129537b -r 816af3e011a6 source/CMakeLists.txt --- a/source/CMakeLists.txt Fri Feb 03 17:30:27 2017 +0530 +++ b/source/CMakeLists.txt Tue Feb 07 12:05:33 2017 +0530 @@ -122,7 +122,7 @@ set(XCODE 1) endif() if(APPLE) - add_definitions(-DMACOS) + add_definitions(-DMACOS=1) endif() if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") diff -r 20141129537b -r 816af3e011a6 source/common/threadpool.cpp --- a/source/common/threadpool.cpp Fri Feb 03 17:30:27 2017 +0530 +++ b/source/common/threadpool.cpp Tue Feb 07 12:05:33 2017 +0530 @@ -57,7 +57,10 @@ #endif -#if MACOS +/* TODO FIX: Macro __MACH__ ideally should be part of MACOS definition, but adding to Cmake + behaving is not as expected, need to fix this. */ + +#if MACOS && __MACH__ #include #include #endif @@ -617,7 +620,7 @@ return sysconf(_SC_NPROCESSORS_CONF); #elif __unix__ return sysconf(_SC_NPROCESSORS_ONLN); -#elif MACOS +#elif MACOS && __MACH__ int nm[2]; size_t len = 4; uint32_t count; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 9] pcs: update design to have 'm_achievedFps' for every PCS Instance
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1479128885 -19800 # Mon Nov 14 18:38:05 2016 +0530 # Branch stable # Node ID 8defd4e7b2e4875247e4ec95e0dd3b9630983526 # Parent bdf273f9521784ceeda868222d415303a0bcf58b pcs: update design to have 'm_achivedFps' for every PCS Instance diff -r bdf273f95217 -r 8defd4e7b2e4 source/api-uhdkit.cpp --- a/source/api-uhdkit.cpp Tue Nov 08 14:20:24 2016 +0530 +++ b/source/api-uhdkit.cpp Mon Nov 14 18:38:05 2016 +0530 @@ -206,8 +206,6 @@ return -1; if (numEncoded > 0) { -uhdkitEnc->m_achievedFps = numEncoded * 100.0 / (double)(endTime - startTime); -uhdkitEnc->m_achievedFps = uhdkitEnc->m_achievedFps / uhdkitEnc->m_param->gops; // Achieved fps for each gop encoder uhdkitEnc->m_encodedFrameCount += numEncoded; controllerIndex = ((uhdkitEnc->m_encodedFrameCount - 1) / uhdkitEnc->m_param->x265Param->keyframeMax) % uhdkitEnc->m_param->gops; X265_CHECK(controllerIndex >= 0 && controllerIndex < uhdkitEnc->m_param->gops, "Invalid controllerIndex: %d, must be between 0 and %d\n", controllerIndex, uhdkitEnc->m_param->gops); diff -r bdf273f95217 -r 8defd4e7b2e4 source/pcs/api-pcs.cpp --- a/source/pcs/api-pcs.cppTue Nov 08 14:20:24 2016 +0530 +++ b/source/pcs/api-pcs.cppMon Nov 14 18:38:05 2016 +0530 @@ -211,6 +211,7 @@ m_pcsParam->statusPrintInterval = param->statusPrintInterval; m_curTimeStamp = m_lastTimeStamp = X265_NS::x265_mdate(); m_framesWindow = 1; +m_achievedFps = 0.0; m_outFrameCountOfLastAccumulatorReset = 0; time(_lastStatusOutputTime); @@ -289,11 +290,11 @@ int64_t elapsedEncTime = m_curTimeStamp - m_lastTimeStamp; int controllerIndex = ((uhdkitEnc->m_encodedFrameCount - 1) / uhdkitEnc->m_param->x265Param->keyframeMax) % uhdkitEnc->m_param->gops; X265_CHECK(controllerIndex >= 0 && controllerIndex < uhdkitEnc->m_param->gops, "Invalid controllerIndex: %d, must be between 0 and %d\n", controllerIndex, uhdkitEnc->m_param->gops); -if (((m_bScenecut == 1) && elapsedEncTime > 0) || elapsedEncTime >= 
30 || uhdkitEnc->m_achievedFps < m_pcsParam->fpsSetPoint) +if (((m_bScenecut == 1) && elapsedEncTime > 0) || elapsedEncTime >= 30 || m_achievedFps < m_pcsParam->fpsSetPoint) { // Don't allow outrageously high frame rate measurements to skew the controller. -uhdkitEnc->m_achievedFps = X265_MIN(uhdkitEnc->m_achievedFps, 4 * m_pcsParam->fpsSetPoint); -error = (m_pcsParam->fpsSetPoint - uhdkitEnc->m_achievedFps) / m_pcsParam->fpsSetPoint; +m_achievedFps = X265_MIN(m_achievedFps, 4 * m_pcsParam->fpsSetPoint); +error = (m_pcsParam->fpsSetPoint - m_achievedFps) / m_pcsParam->fpsSetPoint; if (m_pcsParam->integralReset > 0) { @@ -308,7 +309,7 @@ { double lowerBound = (m_pcsParam->fpsSetPoint * SATURATION_RANGE_MIN) / 100.0; /* Lower bound, 3% of set-point */ double upperBound = (m_pcsParam->fpsSetPoint * SATURATION_RANGE_MAX) / 100.0; /* Upper bound, 10% of set-point */ -double fpsDiff = (uhdkitEnc->m_achievedFps - m_pcsParam->fpsSetPoint); +double fpsDiff =(m_achievedFps - m_pcsParam->fpsSetPoint); resetErrorAccumulater = (fpsDiff >= lowerBound && fpsDiff <= upperBound) || m_bScenecut; /* Steady state, or scenecut */ } @@ -388,7 +389,7 @@ m_outFrameCountOfLastAccumulatorReset = uhdkitEnc->m_encodedFrameCount; m_lastTimeStamp = m_curTimeStamp; if (uhdkitEnc->m_reconfigParam->logLevel == UHDKIT_LOG_INFO) - uhdkit_pcs_printStatus(>m_reconfigParam[controllerIndex], uhdkitEnc->m_achievedFps); + uhdkit_pcs_printStatus(>m_reconfigParam[controllerIndex], m_achievedFps); } return true; } @@ -398,6 +399,11 @@ m_bScenecut = pic->frameData.bScenecut; } +void pcs::uhdkit_pcs_update_fps(int64_t startTime, int64_t endTime, int numEncoded) +{ +m_achievedFps = numEncoded * 100.0 / (double)(endTime - startTime); +} + int pcs::uhdkit_pcs_getControlParamValue(const x265_param *param, int index) { int controlParamValue[NUM_CONTROLLER] = { param->bEnableFastIntra, param->bEnableEarlySkip, param->bEnableRectInter, diff -r bdf273f95217 -r 8defd4e7b2e4 source/pcs/pcs.h --- a/source/pcs/pcs.h Tue 
Nov 08 14:20:24 2016 +0530 +++ b/source/pcs/pcs.h Mon Nov 14 18:38:05 2016 +0530 @@ -32,6 +32,7 @@ /* variables handled by the PCS Instance */ pcs_param* m_pcsParam;
Re: [x265] [PATCH 1 of 9] pcs: update design to have 'm_achievedFps' for every PCS Instance
Please, ignore this patch. Thanks. On Thu, Nov 17, 2016 at 8:51 PM, <prav...@multicorewareinc.com> wrote: > # HG changeset patch > # User Praveen Tiwari <prav...@multicorewareinc.com> > # Date 1479128885 -19800 > # Mon Nov 14 18:38:05 2016 +0530 > # Branch stable > # Node ID 8defd4e7b2e4875247e4ec95e0dd3b9630983526 > # Parent bdf273f9521784ceeda868222d415303a0bcf58b > pcs: update design to have 'm_achivedFps' for every PCS Instance > > diff -r bdf273f95217 -r 8defd4e7b2e4 source/api-uhdkit.cpp > --- a/source/api-uhdkit.cpp Tue Nov 08 14:20:24 2016 +0530 > +++ b/source/api-uhdkit.cpp Mon Nov 14 18:38:05 2016 +0530 > @@ -206,8 +206,6 @@ > return -1; > if (numEncoded > 0) > { > -uhdkitEnc->m_achievedFps = numEncoded * 100.0 / > (double)(endTime - startTime); > -uhdkitEnc->m_achievedFps = uhdkitEnc->m_achievedFps / > uhdkitEnc->m_param->gops; // Achieved fps for each gop encoder > uhdkitEnc->m_encodedFrameCount += numEncoded; > controllerIndex = ((uhdkitEnc->m_encodedFrameCount - 1) / > uhdkitEnc->m_param->x265Param->keyframeMax) % uhdkitEnc->m_param->gops; > X265_CHECK(controllerIndex >= 0 && controllerIndex < > uhdkitEnc->m_param->gops, "Invalid controllerIndex: %d, must be between 0 > and %d\n", controllerIndex, uhdkitEnc->m_param->gops); > diff -r bdf273f95217 -r 8defd4e7b2e4 source/pcs/api-pcs.cpp > --- a/source/pcs/api-pcs.cppTue Nov 08 14:20:24 2016 +0530 > +++ b/source/pcs/api-pcs.cppMon Nov 14 18:38:05 2016 +0530 > @@ -211,6 +211,7 @@ > m_pcsParam->statusPrintInterval = param->statusPrintInterval; > m_curTimeStamp = m_lastTimeStamp = X265_NS::x265_mdate(); > m_framesWindow = 1; > +m_achievedFps = 0.0; > m_outFrameCountOfLastAccumulatorReset = 0; > time(_lastStatusOutputTime); > > @@ -289,11 +290,11 @@ > int64_t elapsedEncTime = m_curTimeStamp - m_lastTimeStamp; > int controllerIndex = ((uhdkitEnc->m_encodedFrameCount - 1) / > uhdkitEnc->m_param->x265Param->keyframeMax) % uhdkitEnc->m_param->gops; > X265_CHECK(controllerIndex >= 0 && controllerIndex < > 
uhdkitEnc->m_param->gops, "Invalid controllerIndex: %d, must be between 0 > and %d\n", controllerIndex, uhdkitEnc->m_param->gops); > -if (((m_bScenecut == 1) && elapsedEncTime > 0) || elapsedEncTime > >= 30 || uhdkitEnc->m_achievedFps < m_pcsParam->fpsSetPoint) > +if (((m_bScenecut == 1) && elapsedEncTime > 0) || elapsedEncTime > >= 30 || m_achievedFps < m_pcsParam->fpsSetPoint) > { > // Don't allow outrageously high frame rate measurements to > skew the controller. > -uhdkitEnc->m_achievedFps = X265_MIN(uhdkitEnc->m_achievedFps, > 4 * m_pcsParam->fpsSetPoint); > -error = (m_pcsParam->fpsSetPoint - uhdkitEnc->m_achievedFps) > / m_pcsParam->fpsSetPoint; > +m_achievedFps = X265_MIN(m_achievedFps, 4 * > m_pcsParam->fpsSetPoint); > +error = (m_pcsParam->fpsSetPoint - m_achievedFps) / > m_pcsParam->fpsSetPoint; > > if (m_pcsParam->integralReset > 0) > { > @@ -308,7 +309,7 @@ > { > double lowerBound = (m_pcsParam->fpsSetPoint * > SATURATION_RANGE_MIN) / 100.0; /* Lower bound, 3% of set-point */ > double upperBound = (m_pcsParam->fpsSetPoint * > SATURATION_RANGE_MAX) / 100.0; /* Upper bound, 10% of set-point */ > -double fpsDiff = (uhdkitEnc->m_achievedFps - > m_pcsParam->fpsSetPoint); > +double fpsDiff =(m_achievedFps - > m_pcsParam->fpsSetPoint); > resetErrorAccumulater = (fpsDiff >= lowerBound && fpsDiff > <= upperBound) || m_bScenecut; /* Steady state, or scenecut */ > } > > @@ -388,7 +389,7 @@ > m_outFrameCountOfLastAccumulatorReset = uhdkitEnc->m_ > encodedFrameCount; > m_lastTimeStamp = m_curTimeStamp; > if (uhdkitEnc->m_reconfigParam->logLevel == UHDKIT_LOG_INFO) > - > uhdkit_pcs_printStatus(>m_reconfigParam[controllerIndex], > uhdkitEnc->m_achievedFps); > + > uhdkit_pcs_printStatus(>m_reconfigParam[controllerIndex], > m_achievedFps); > } > return true; > } > @@ -398,6 +399,11 @@ > m_bScenecut = pic->frameData.bScenecut; > } > > +void pcs::uhdkit_pcs_update_fps(int64_t startTime, int64_t endTime, int &
[x265] [PATCH] encoder.cpp: print reconfigure params for debug purpose
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1479392046 -19800 # Thu Nov 17 19:44:06 2016 +0530 # Node ID 64dc12e9aae9acacaaab5f7875d01fb09d4156d6 # Parent 4c1652f3884fba9fab4c589dd057b12e6bf33d5b encoder.cpp: print reconfigure params for debug purpose diff -r 4c1652f3884f -r 64dc12e9aae9 source/encoder/encoder.cpp --- a/source/encoder/encoder.cppTue Nov 15 11:16:04 2016 +0530 +++ b/source/encoder/encoder.cppThu Nov 17 19:44:06 2016 +0530 @@ -2433,10 +2433,10 @@ x265_param* oldParam = m_param; x265_param* newParam = m_latestParam; -x265_log(newParam, X265_LOG_INFO, "Reconfigured param options, input Frame: %d\n", m_pocLast + 1); +x265_log(newParam, X265_LOG_DEBUG, "Reconfigured param options, input Frame: %d\n", m_pocLast + 1); char tmp[40]; -#define TOOLCMP(COND1, COND2, STR) if (COND1 != COND2) { sprintf(tmp, STR, COND1, COND2); x265_log(newParam, X265_LOG_INFO, tmp); } +#define TOOLCMP(COND1, COND2, STR) if (COND1 != COND2) { sprintf(tmp, STR, COND1, COND2); x265_log(newParam, X265_LOG_DEBUG, tmp); } TOOLCMP(oldParam->maxNumReferences, newParam->maxNumReferences, "ref=%d to %d\n"); TOOLCMP(oldParam->bEnableFastIntra, newParam->bEnableFastIntra, "fast-intra=%d to %d\n"); TOOLCMP(oldParam->bEnableEarlySkip, newParam->bEnableEarlySkip, "early-skip=%d to %d\n"); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH] [multi-lib] Support 8+10+12 bits in single DLL (Workaround)
Hi Min, Can you please verify for VC12 ? I double checked on this I am getting different output for this patch. 8-bit encoded file size is same but different binary (compared using beyond compare), 10 and 12 bit size and binary both are different. I applied you patch build once (like 8 bit build) and collected all depth outputs (8, 10 and 12), compared with three builds of x265 i.e 8 bit, 10 bit and 12 bit. Regards, Praveen On Fri, Sep 23, 2016 at 2:47 AM, chen <chenm...@163.com> wrote: > Hi Praveen, > > I test your cmdlind on my VS2008 build. > I build three bit-depth version and compare with one bit-depth version, > but the output are still matched in both 10 and 12 bit. > > Regards, > Min > > At 2016-09-22 14:39:50,"Praveen Tiwari" <prav...@multicorewareinc.com> > wrote: > > Hi Min, > > After this patch outputs are changing, tested for following command line > for 10-bit and 12-bit outputs. > > --input=NebutaFestival_2560x1600_60_10bit_crop.yuv --input-res=2560x1600 > --fps=60 --numa-pools="NULL" --output-depth=12 --hash=1 -o NFOut12.hevc > > > > > Regards, > Praveen > > On Thu, Sep 15, 2016 at 1:55 AM, chen <chenm...@163.com> wrote: > >> From ea50e494473623ed0dbff2907194aaf268dc449a Mon Sep 17 00:00:00 2001 >> From: Min Chen <min.c...@multicorewareinc.com> >> Date: Wed, 14 Sep 2016 15:23:38 -0500 >> Subject: [PATCH] [multi-lib] Support 8+10+12 bits in single DLL >> (Workaround) >> >> --- >> source/CMakeLists.txt | 40 +++- >> 1 files changed, 39 insertions(+), 1 deletions(-) >> >> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt >> index dd19d28..c2c2f7f 100644 >> --- a/source/CMakeLists.txt >> +++ b/source/CMakeLists.txt >> @@ -36,6 +36,7 @@ configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" >> configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" >> "${PROJECT_BINARY_DIR}/x265_config.h") >> >> + >> SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" >> "${CMAKE_MODULE_PATH}") >> >> # System architecture detection >> @@ -396,6 +397,39 @@ if(WIN32) 
>> endif(WINXP_SUPPORT) >> endif() >> >> + >> +if(ENABLE_SHARED AND LINKED_10BIT AND LINKED_12BIT) >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?setParamAspectRatio@x265 >> @@YAXPEAUx265_param@@HH@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?getParamAspectRatio@x265 >> @@YAXPEAUx265_param@@AEAH1@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?general_log_file@x265 >> @@YAXPEBUx265_param@@PEBDH1ZZ\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?general_log@x265 >> @@YAXPEBUx265_param@@PEBDH1ZZ\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def >> "?x265_api_get_94@x265_10bit@@YAPEBUx265_api@@H@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def >> "?x265_api_get_94@x265_12bit@@YAPEBUx265_api@@H@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def >> "?x265_api_query@x265_10bit@@YAPEBUx265_api@@HHPEAH@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def >> "?x265_api_query@x265_12bit@@YAPEBUx265_api@@HHPEAH@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_mdate@x265 >> @@YA_JXZ\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def >> "?x265_picturePlaneSize@x265@@YAI@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_ssim2dB@x265 >> @@YANN@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_ssim2dB@x265 >> @@YANN@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_report_simd@x265 >> @@YAXPEAUx265_param@@@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_fopen@x265 >> @@YAPEAU_iobuf@@PEBD0@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_malloc@x265 >> @@YAPEAX_K@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_free@x265 >> @@YAXPEAX@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_atoi@x265 >> @@YAHPEBDAEA_N@Z\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?start@Thread@x265@ >> @QEAA_NXZ\n") >> +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?stop@Thread@x265@ >> @QEAAXXZ\n") >> +file(APPEND 
${PROJECT_BINARY_DIR}/x265.def "??0Thre
Re: [x265] [PATCH] [multi-lib] Support 8+10+12 bits in single DLL (Workaround)
Hi Min, After this patch outputs are changing, tested for following command line for 10-bit and 12-bit outputs. --input=NebutaFestival_2560x1600_60_10bit_crop.yuv --input-res=2560x1600 --fps=60 --numa-pools="NULL" --output-depth=12 --hash=1 -o NFOut12.hevc Regards, Praveen On Thu, Sep 15, 2016 at 1:55 AM, chen <chenm...@163.com> wrote: > From ea50e494473623ed0dbff2907194aaf268dc449a Mon Sep 17 00:00:00 2001 > From: Min Chen <min.c...@multicorewareinc.com> > Date: Wed, 14 Sep 2016 15:23:38 -0500 > Subject: [PATCH] [multi-lib] Support 8+10+12 bits in single DLL > (Workaround) > > --- > source/CMakeLists.txt | 40 +++- > 1 files changed, 39 insertions(+), 1 deletions(-) > > diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt > index dd19d28..c2c2f7f 100644 > --- a/source/CMakeLists.txt > +++ b/source/CMakeLists.txt > @@ -36,6 +36,7 @@ configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" > configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" > "${PROJECT_BINARY_DIR}/x265_config.h") > > + > SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" > "${CMAKE_MODULE_PATH}") > > # System architecture detection > @@ -396,6 +397,39 @@ if(WIN32) > endif(WINXP_SUPPORT) > endif() > > + > +if(ENABLE_SHARED AND LINKED_10BIT AND LINKED_12BIT) > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?setParamAspectRatio@x265 > @@YAXPEAUx265_param@@HH@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?getParamAspectRatio@x265 > @@YAXPEAUx265_param@@AEAH1@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?general_log_file@x265@@ > YAXPEBUx265_param@@PEBDH1ZZ\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?general_log@x265@@ > YAXPEBUx265_param@@PEBDH1ZZ\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def > "?x265_api_get_94@x265_10bit@@YAPEBUx265_api@@H@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def > "?x265_api_get_94@x265_12bit@@YAPEBUx265_api@@H@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_api_query@x265_10bit 
> @@YAPEBUx265_api@@HHPEAH@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_api_query@x265_12bit > @@YAPEBUx265_api@@HHPEAH@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_mdate@x265 > @@YA_JXZ\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def > "?x265_picturePlaneSize@x265@@YAI@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_ssim2dB@x265 > @@YANN@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_ssim2dB@x265 > @@YANN@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_report_simd@x265@@ > YAXPEAUx265_param@@@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_fopen@x265@@YAPEAU_ > iobuf@@PEBD0@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_malloc@x265 > @@YAPEAX_K@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_free@x265 > @@YAXPEAX@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?x265_atoi@x265 > @@YAHPEBDAEA_N@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?start@Thread@x265@ > @QEAA_NXZ\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?stop@Thread@x265@ > @QEAAXXZ\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "??0Thread@x265@@QEAA@XZ > \n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "??1Thread@x265@@UEAA@XZ > \n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?g_maxCUDepth@x265 > @@3IA\n") > +if(WINXP_SUPPORT) > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?cond_init@x265@@ > YAHPEAUConditionVariable@1@@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?cond_wait@x265@@ > YAHPEAUConditionVariable@1@PEAU_RTL_CRITICAL_SECTION@@K@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?cond_destroy@x265@@ > YAXPEAUConditionVariable@1@@Z\n") > +file(APPEND ${PROJECT_BINARY_DIR}/x265.def "?cond_broadcast@x265 > @@YAXPEAUConditionVariable@1@@Z\n") > +endif() > +endif() > + > include(version) # determine X265_VERSION and X265_LATEST_TAG > include_directories(. 
common encoder "${PROJECT_BINARY_DIR}") > > @@ -608,7 +642,11 @@ if(ENABLE_CLI) > if(WIN32 OR NOT ENABLE_SHARED OR INTEL_CXX) > # The CLI cannot link to the shared library on Windows, it > # requires internal APIs not exported fr
[x265] [PATCH] threadpool.cpp: fix default pool param behaviour for window systems, if NULL or "*" or "" (default) x265 will use all available threads on each NUMA node
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1473370883 25200 # Thu Sep 08 14:41:23 2016 -0700 # Branch stable # Node ID 6e301b63952bc77f9e4710b6f46a6409e093d9c7 # Parent 6a9b6a828f791902a02fbf700caee2d3f32129c0 threadpool.cpp: fix default pool param behaviour for window systems, if NULL or "*" or "" (default) x265 will use all available threads on each NUMA node diff -r 6a9b6a828f79 -r 6e301b63952b source/common/threadpool.cpp --- a/source/common/threadpool.cpp Wed Jul 13 19:24:23 2016 +0530 +++ b/source/common/threadpool.cpp Thu Sep 08 14:41:23 2016 -0700 @@ -299,8 +299,8 @@ * For windows because threads can't be allocated to live across sockets * changing the default behavior to be per-socket pools -- FIXME */ #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 - if (!p->numaPools) - { +if (!p->numaPools || (strcmp(p->numaPools, "NULL") == 0 || strcmp(p->numaPools, "*") == 0 || strcmp(p->numaPools, "") == 0)) +{ char poolString[50] = ""; for (int i = 0; i < numNumaNodes; i++) { ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH] threadpool.cpp: fix default pool param behaviour, if NULL or “” (default) x265 will use all available threads on each NUMA node
Please ignore this; this behaviour is not required for Linux systems. Thanks. Regards, Praveen On Wed, Sep 7, 2016 at 5:19 PM, <prav...@multicorewareinc.com> wrote: > # HG changeset patch > # User Praveen Tiwari <prav...@multicorewareinc.com> > # Date 1473246754 -19800 > # Wed Sep 07 16:42:34 2016 +0530 > # Node ID 9587a394ba58a2c3a579db5fb3f7531daf49559b > # Parent df559450949bd085b0fc5e01332aa8458af2fa43 > threadpool.cpp: fix default pool param behaviour, if NULL or “” (default) > x265 will use all available threads on each NUMA node > > diff -r df559450949b -r 9587a394ba58 source/common/threadpool.cpp > --- a/source/common/threadpool.cpp Wed Aug 10 13:26:18 2016 +0530 > +++ b/source/common/threadpool.cpp Wed Sep 07 16:42:34 2016 +0530 > @@ -330,8 +330,8 @@ > { > for (int j = i; j < numNumaNodes; j++) > { > -threadsPerPool[numNumaNodes] += cpusPerNode[j]; > -nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << j); > +threadsPerPool[j] += cpusPerNode[j]; > +nodeMaskPerPool[j] |= ((uint64_t)1 << j); > } > break; > } > @@ -366,8 +366,8 @@ > { > for (int i = 0; i < numNumaNodes; i++) > { > -threadsPerPool[numNumaNodes] += cpusPerNode[i]; > -nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i); > +threadsPerPool[i] += cpusPerNode[i]; > +nodeMaskPerPool[i] |= ((uint64_t)1 << i); > } > } > > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] threadpool.cpp: fix default pool param behaviour, if NULL or “” (default) x265 will use all available threads on each NUMA node
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1473246754 -19800 # Wed Sep 07 16:42:34 2016 +0530 # Node ID 9587a394ba58a2c3a579db5fb3f7531daf49559b # Parent df559450949bd085b0fc5e01332aa8458af2fa43 threadpool.cpp: fix default pool param behaviour, if NULL or “” (default) x265 will use all available threads on each NUMA node diff -r df559450949b -r 9587a394ba58 source/common/threadpool.cpp --- a/source/common/threadpool.cpp Wed Aug 10 13:26:18 2016 +0530 +++ b/source/common/threadpool.cpp Wed Sep 07 16:42:34 2016 +0530 @@ -330,8 +330,8 @@ { for (int j = i; j < numNumaNodes; j++) { -threadsPerPool[numNumaNodes] += cpusPerNode[j]; -nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << j); +threadsPerPool[j] += cpusPerNode[j]; +nodeMaskPerPool[j] |= ((uint64_t)1 << j); } break; } @@ -366,8 +366,8 @@ { for (int i = 0; i < numNumaNodes; i++) { -threadsPerPool[numNumaNodes] += cpusPerNode[i]; -nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i); +threadsPerPool[i] += cpusPerNode[i]; +nodeMaskPerPool[i] |= ((uint64_t)1 << i); } } ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: replace mova by movu to avoid crashing x265_denoise_dct_sse4() 32-bit version
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1471952630 -19800 # Tue Aug 23 17:13:50 2016 +0530 # Node ID 1c646739b6628a3a8e308a22c948a4495a157140 # Parent 49a0d1176aef5bc6330fcfd39b4589616c174f0a asm: replace mova by movu to avoid crashing x265_denoise_dct_sse4() 32-bit version diff -r 49a0d1176aef -r 1c646739b662 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asm Wed Jul 27 21:47:20 2016 +0200 +++ b/source/common/x86/dct8.asm Tue Aug 23 17:13:50 2016 +0530 @@ -2112,7 +2112,7 @@ pxor m5, m5 shr r3d, 3 .loop: -mova m0, [r0] +movu m0, [r0] pabsw m1, m0 movu m2, [r1] @@ -2130,7 +2130,7 @@ pcmpgtw m4, m1, m5 pand m1, m4 psignw m1, m0 -mova [r0], m1 +movu [r0], m1 add r0, 16 add r1, 32 add r2, 16 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH] threadpool: fix warning: ‘int popCount(uint64_t)’ defined but not used [-Wunused-function]
I remember that some NUMA functionality requires Windows 7 at minimum; it is not supported on earlier versions of Windows. Regards, Praveen On Mon, May 30, 2016 at 6:43 PM, Mateusz <mateu...@poczta.onet.pl> wrote: > There is a serious bug in threadpool code that prevent working in Windows > XP/Vista. > VS 2015 error when compiling for 32-bit Windows XP: > (ClCompile target) -> > I:\vs\x265\source\common\threadpool.cpp(590): error C3861: > 'GetNumaNodeProcessorMaskEx': identifier not found [I:\vs\x265\ma\ > 8-b\common\common.vcxproj] > > Did you see patch https://patches.videolan.org/patch/13495/ (it fixes > also this warning)? > > > W dniu 2016-05-30 o 14:45, prav...@multicorewareinc.com pisze: > > # HG changeset patch > > # User Praveen Tiwari <prav...@multicorewareinc.com> > > # Date 1464585837 -19800 > > # Mon May 30 10:53:57 2016 +0530 > > # Node ID b8dbe8d7c09e7fc0b7cce236569fc5df2eb70b1e > > # Parent aeade2e8d8688ebffb8455b8948d89d6a72e2c38 > > threadpool: fix warning: ‘int popCount(uint64_t)’ defined but not used > [-Wunused-function] > > static int popCount(uint64_t x) > > > > diff -r aeade2e8d868 -r b8dbe8d7c09e source/common/threadpool.cpp > > --- a/source/common/threadpool.cpp Thu May 26 16:45:09 2016 +0530 > > +++ b/source/common/threadpool.cpp Mon May 30 10:53:57 2016 +0530 > > @@ -68,6 +68,7 @@ > > # define strcasecmp _stricmp > > #endif > > > > +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 > > const uint64_t m1 = 0x5555555555555555; //binary: 0101... > > const uint64_t m2 = 0x3333333333333333; //binary: 00110011.. > > const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary: 4 zeros, 4 ones ... 
> > @@ -80,6 +81,7 @@ > > x = (x + (x >> 4)) & m3; > > return (x * h01) >> 56; > > } > > +#endif > > > > namespace X265_NS { > > // x265 private namespace > > > > > > > > ___ > > x265-devel mailing list > > x265-devel@videolan.org > > https://mailman.videolan.org/listinfo/x265-devel > > > > > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] threadpool: fix warning: ‘int popCount(uint64_t)’ defined but not used [-Wunused-function]
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1464585837 -19800 # Mon May 30 10:53:57 2016 +0530 # Node ID b8dbe8d7c09e7fc0b7cce236569fc5df2eb70b1e # Parent aeade2e8d8688ebffb8455b8948d89d6a72e2c38 threadpool: fix warning: ‘int popCount(uint64_t)’ defined but not used [-Wunused-function] static int popCount(uint64_t x) diff -r aeade2e8d868 -r b8dbe8d7c09e source/common/threadpool.cpp --- a/source/common/threadpool.cpp Thu May 26 16:45:09 2016 +0530 +++ b/source/common/threadpool.cpp Mon May 30 10:53:57 2016 +0530 @@ -68,6 +68,7 @@ # define strcasecmp _stricmp #endif +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 const uint64_t m1 = 0x5555555555555555; //binary: 0101... const uint64_t m2 = 0x3333333333333333; //binary: 00110011.. const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary: 4 zeros, 4 ones ... @@ -80,6 +81,7 @@ x = (x + (x >> 4)) & m3; return (x * h01) >> 56; } +#endif namespace X265_NS { // x265 private namespace ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] threadpool: fix memory leak
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1464004220 -19800 # Mon May 23 17:20:20 2016 +0530 # Node ID 75d8cadc3f1acbffbbbc651d26c597a96007167b # Parent 5af929bc0ed0827ae0be018c1c8edc10d8650406 threadpool: fix memory leak diff -r 5af929bc0ed0 -r 75d8cadc3f1a source/common/threadpool.cpp --- a/source/common/threadpool.cpp Mon May 23 15:47:38 2016 +0530 +++ b/source/common/threadpool.cpp Mon May 23 17:20:20 2016 +0530 @@ -3,6 +3,7 @@ * * Authors: Steve Borho <st...@borho.org> * Min Chen <chenm...@163.com> + * Praveen Kumar Tiwari <prav...@multicorewareinc.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -420,6 +421,7 @@ if ((nodeMaskPerPool[node] >> j) & 1) len += sprintf(nodesstr + len, ",%d", j); x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on numa nodes %s\n", i, numThreads, nodesstr + 1); +delete[] nodesstr; } else x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", numThreads); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 2] threadpool-fix: utilize all available NUMA nodes for threadpool distribution for windows system,
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1463997405 -19800 # Mon May 23 15:26:45 2016 +0530 # Node ID 2f8a373347649f29953ca9f434eec329e1339aca # Parent 4723933fdec920debefe606d50a9a312f7bc7f6b threadpool-fix: utilize all available NUMA nodes for threadpool distribution for windows system, linux threadpool configuration info, match with windows -> clean logic diff -r 4723933fdec9 -r 2f8a37334764 source/common/threadpool.cpp --- a/source/common/threadpool.cpp Fri May 13 09:32:11 2016 +0530 +++ b/source/common/threadpool.cpp Mon May 23 15:26:45 2016 +0530 @@ -28,6 +28,10 @@ #include +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 +#include +#endif + #if X86_64 #ifdef __GNUC__ @@ -64,6 +68,19 @@ # define strcasecmp _stricmp #endif +const uint64_t m1 = 0x; //binary: 0101... +const uint64_t m2 = 0x; //binary: 00110011.. +const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary: 4 zeros, 4 ones ... +const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of 0,1,2,3... 
+ +int popCount(uint64_t x) +{ +x -= (x >> 1) & m1; +x = (x & m2) + ((x >> 2) & m2); +x = (x + (x >> 4)) & m3; +return (x * h01) >> 56; +} + namespace X265_NS { // x265 private namespace @@ -238,7 +255,6 @@ memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool)); int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM); -int cpuCount = getCpuCount(); bool bNumaSupport = false; #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 @@ -248,26 +264,54 @@ #endif -for (int i = 0; i < cpuCount; i++) +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 +PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY; +for (int i = 0; i < numNumaNodes; i++) { -#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 -UCHAR node; -if (GetNumaProcessorNode((UCHAR)i, )) -cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++; -else +GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer); +cpusPerNode[i] = popCount(groupAffinityPointer->Mask); +} +delete groupAffinityPointer; #elif HAVE_LIBNUMA -if (bNumaSupport >= 0) -cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++; -else +if (bNumaSupport >= 0) +{ +for (int i = 0; i < numNumaNodes; i++) +{ +struct bitmask* bitMask = numa_allocate_cpumask(); +int ret = numa_node_to_cpus(i, bitMask); +if (!ret) +cpusPerNode[i] = numa_num_possible_cpus(); +else +x265_log(p, X265_LOG_ERROR, "Failed to genrate CPU mask\n"); +numa_free_cpumask(bitMask); +} +} +#else // NUMA not supported +cpusPerNode[0] = getCpuCount(); #endif -cpusPerNode[0]++; -} if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG) -for (int i = 0; i < numNumaNodes; i++) -x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]); - -/* limit threads based on param->numaPools */ +for (int i = 0; i < numNumaNodes; i++) +x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]); +/* limit threads based on param->numaPools + * For windows because threads can't be allocated to 
live across sockets + * changing the default behavior to be per-socket pools -- FIXME */ +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 + if (!p->numaPools) + { + char poolString[50] = ""; + for (int i = 0; i < numNumaNodes; i++) + { + char nextCount[10] = ""; + if (i) + sprintf(nextCount, ",%d", cpusPerNode[i]); + else + sprintf(nextCount, "%d", cpusPerNode[i]); + strcat(poolString, nextCount); + } + x265_param_parse(p, "pools", poolString); + } +#endif if (p->numaPools && *p->numaPools) { const char *nodeStr = p->numaPools; @@ -389,16 +433,15 @@ X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n"); #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 -m_winCpuMask = 0x0; -GROUP_AFFINITY groupAffinity; +memset(_groupAffinity, 0, sizeof(GROUP_AFFINITY)); for (int i = 0; i < getNumaNodeCount(); i++) { int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1; if (numaNode != -1) -if (GetNumaNodeProcessorMaskEx((USHORT)
Re: [x265] [PATCH 1 of 7] threadpool.cpp: get correct CPU count for multisocket machines -> windows system fix
Hi, I am combining these patches into a single patch along with some updates, so please ignore these patches. On top of this I will update Mateusz's patch (CLI: new logic for '--pools ' option) to avoid merge conflicts. Thanks. Regards, Praveen On Fri, May 20, 2016 at 4:31 PM, <prav...@multicorewareinc.com> wrote: > # HG changeset patch > # User Praveen Tiwari <prav...@multicorewareinc.com> > # Date 1463655478 -19800 > # Thu May 19 16:27:58 2016 +0530 > # Node ID 9a6ab28b736e1167ac26977d7da8ab2d23cc296f > # Parent aca781339b4c8dae94ff7da73f18cd4439757e87 > threadpool.cpp: get correct CPU count for multisocket machines -> windows > system fix > > diff -r aca781339b4c -r 9a6ab28b736e source/common/threadpool.cpp > --- a/source/common/threadpool.cpp Tue May 10 15:33:17 2016 +0530 > +++ b/source/common/threadpool.cpp Thu May 19 16:27:58 2016 +0530 > @@ -64,6 +64,19 @@ > # define strcasecmp _stricmp > #endif > > +const uint64_t m1 = 0x5555555555555555; //binary: 0101... > +const uint64_t m2 = 0x3333333333333333; //binary: 00110011.. > +const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary: 4 zeros, 4 ones ... > +const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of > 0,1,2,3... 
> + > +int popCount(uint64_t x) > +{ > +x -= (x >> 1) & m1; > +x = (x & m2) + ((x >> 2) & m2); > +x = (x + (x >> 4)) & m3; > +return (x * h01) >> 56; > +} > + > namespace X265_NS { > // x265 private namespace > > @@ -525,9 +538,17 @@ > int ThreadPool::getCpuCount() > { > #if _WIN32 > -SYSTEM_INFO sysinfo; > -GetSystemInfo(); > -return sysinfo.dwNumberOfProcessors; > +enum { MAX_NODE_NUM = 127 }; > +int cpus = 0; > +int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM); > +PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY; > +for (int i = 0; i < numNumaNodes; i++) > +{ > +GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer); > +cpus += popCount(groupAffinityPointer->Mask); > +} > +delete groupAffinityPointer; > +return cpus; > #elif __unix__ && X265_ARCH_ARM > /* Return the number of processors configured by OS. Because, most > embedded linux distributions > * uses only one processor as the scheduler doesn't have enough work > to utilize all processors */ > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 7] threadpool.cpp: get correct CPU count for multisocket machines -> windows system fix
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1463655478 -19800 # Thu May 19 16:27:58 2016 +0530 # Node ID 9a6ab28b736e1167ac26977d7da8ab2d23cc296f # Parent aca781339b4c8dae94ff7da73f18cd4439757e87 threadpool.cpp: get correct CPU count for multisocket machines -> windows system fix diff -r aca781339b4c -r 9a6ab28b736e source/common/threadpool.cpp --- a/source/common/threadpool.cpp Tue May 10 15:33:17 2016 +0530 +++ b/source/common/threadpool.cpp Thu May 19 16:27:58 2016 +0530 @@ -64,6 +64,19 @@ # define strcasecmp _stricmp #endif +const uint64_t m1 = 0x5555555555555555; //binary: 0101... +const uint64_t m2 = 0x3333333333333333; //binary: 00110011.. +const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary: 4 zeros, 4 ones ... +const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of 0,1,2,3... + +int popCount(uint64_t x) +{ +x -= (x >> 1) & m1; +x = (x & m2) + ((x >> 2) & m2); +x = (x + (x >> 4)) & m3; +return (x * h01) >> 56; +} + namespace X265_NS { // x265 private namespace @@ -525,9 +538,17 @@ int ThreadPool::getCpuCount() { #if _WIN32 -SYSTEM_INFO sysinfo; -GetSystemInfo(&sysinfo); -return sysinfo.dwNumberOfProcessors; +enum { MAX_NODE_NUM = 127 }; +int cpus = 0; +int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM); +PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY; +for (int i = 0; i < numNumaNodes; i++) +{ +GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer); +cpus += popCount(groupAffinityPointer->Mask); +} +delete groupAffinityPointer; +return cpus; #elif __unix__ && X265_ARCH_ARM /* Return the number of processors configured by OS. Because, most embedded linux distributions * uses only one processor as the scheduler doesn't have enough work to utilize all processors */ ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 7] threadpool.cpp: nicely populate detected NUMA node along with logical cores per node -> windows system fix
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1463738737 -19800 # Fri May 20 15:35:37 2016 +0530 # Node ID e988eee2f0dc962b9b94c8cef6f739522bce9afb # Parent 9a6ab28b736e1167ac26977d7da8ab2d23cc296f threadpool.cpp: nicely populate detected NUMA node along with logical cores per node -> windows system fix diff -r 9a6ab28b736e -r e988eee2f0dc source/common/threadpool.cpp --- a/source/common/threadpool.cpp Thu May 19 16:27:58 2016 +0530 +++ b/source/common/threadpool.cpp Fri May 20 15:35:37 2016 +0530 @@ -251,7 +251,6 @@ memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool)); int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM); -int cpuCount = getCpuCount(); bool bNumaSupport = false; #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 @@ -261,20 +260,24 @@ #endif +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 +PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY; +for (int i = 0; i < numNumaNodes; i++) +{ +GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer); +cpusPerNode[i] = popCount(groupAffinityPointer->Mask); +} +delete groupAffinityPointer; +#elif HAVE_LIBNUMA +int cpuCount = getCpuCount(); for (int i = 0; i < cpuCount; i++) { -#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 -UCHAR node; -if (GetNumaProcessorNode((UCHAR)i, )) -cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++; -else -#elif HAVE_LIBNUMA if (bNumaSupport >= 0) cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++; -else +} +#elif +cpusPerNode[0] = getCpuCount(); #endif -cpusPerNode[0]++; -} if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG) for (int i = 0; i < numNumaNodes; i++) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 7 of 7] threadpool.cpp: correctly set pool string -> windows system
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1463740562 -19800 # Fri May 20 16:06:02 2016 +0530 # Node ID 7be1c425db3030d901382ceb6c837f5e93014ca8 # Parent 521874f0eea07c26b6a9580302fe61fc7f223e02 threadpool.cpp: correctly set pool string -> windows system diff -r 521874f0eea0 -r 7be1c425db30 source/common/threadpool.cpp --- a/source/common/threadpool.cpp Fri May 20 15:44:40 2016 +0530 +++ b/source/common/threadpool.cpp Fri May 20 16:06:02 2016 +0530 @@ -284,10 +284,25 @@ #endif if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG) -for (int i = 0; i < numNumaNodes; i++) -x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]); - -/* limit threads based on param->numaPools */ +/* limit threads based on param->numaPools + * For windows because threads can't be allocated to live across sockets + * changing the default behavior to be per-socket pools -- FIXME */ +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 + if (!p->numaPools) + { + char poolString[50] = ""; + for (int i = 0; i < numNumaNodes; i++) + { + char nextCount[10] = ""; + if (i) + sprintf(nextCount, ",%d", cpusPerNode[i]); + else + sprintf(nextCount, "%d", cpusPerNode[i]); + strcat(poolString, nextCount); + } + x265_param_parse(p, "pools", poolString); + } +#endif if (p->numaPools && *p->numaPools) { const char *nodeStr = p->numaPools; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 6 of 7] threadpool.cpp: ensure for minimum window version -> _WIN32_WINNT_WIN7
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1463739280 -19800 # Fri May 20 15:44:40 2016 +0530 # Node ID 521874f0eea07c26b6a9580302fe61fc7f223e02 # Parent e96bb0aaec630d44ea318222ae221fe116f4f11a threadpool.cpp: ensure for minimum window version -> _WIN32_WINNT_WIN7 diff -r e96bb0aaec63 -r 521874f0eea0 source/common/threadpool.cpp --- a/source/common/threadpool.cpp Fri May 20 15:35:37 2016 +0530 +++ b/source/common/threadpool.cpp Fri May 20 15:44:40 2016 +0530 @@ -28,9 +28,9 @@ #include -#if _WIN32 +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 #include -#endif // _WIN32 +#endif #if X86_64 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 3 of 7] threadpool.cpp: utilize all available NUMA nodes for threadpool distribution -> windows system fix
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1463738737 -19800 # Fri May 20 15:35:37 2016 +0530 # Node ID ab1b27a1712d581c32b007f0f72e482f4a83905d # Parent e988eee2f0dc962b9b94c8cef6f739522bce9afb threadpool.cpp: utilize all available NUMA nodes for threadpool distribution -> windows system fix diff -r e988eee2f0dc -r ab1b27a1712d source/common/threadpool.cpp --- a/source/common/threadpool.cpp Fri May 20 15:35:37 2016 +0530 +++ b/source/common/threadpool.cpp Fri May 20 15:35:37 2016 +0530 @@ -27,6 +27,7 @@ #include "threading.h" #include +#include #if X86_64 @@ -405,16 +406,15 @@ X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n"); #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 -m_winCpuMask = 0x0; -GROUP_AFFINITY groupAffinity; +memset(_groupAffinity, 0, sizeof(GROUP_AFFINITY)); for (int i = 0; i < getNumaNodeCount(); i++) { int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1; if (numaNode != -1) -if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, )) -m_winCpuMask |= groupAffinity.Mask; +if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, _groupAffinity)) +break; } -m_numaMask = _winCpuMask; +m_numaMask = _groupAffinity.Mask; #elif HAVE_LIBNUMA if (numa_available() >= 0) { @@ -496,11 +496,16 @@ setThreadNodeAffinity(m_numaMask); } -/* static */ void ThreadPool::setThreadNodeAffinity(void *numaMask) { #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 -if (SetThreadAffinityMask(GetCurrentThread(), *((DWORD_PTR*)numaMask))) +UNREFERENCED_PARAMETER(numaMask); +GROUP_AFFINITY groupAffinity; +memset(, 0, sizeof(GROUP_AFFINITY)); +groupAffinity.Group = m_groupAffinity.Group; +groupAffinity.Mask = m_groupAffinity.Mask; +const PGROUP_AFFINITY affinityPointer = +if (SetThreadGroupAffinity(GetCurrentThread(), affinityPointer, NULL)) return; else x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity for NUMA node mask\n"); diff -r e988eee2f0dc -r 
ab1b27a1712d source/common/threadpool.h --- a/source/common/threadpool.hFri May 20 15:35:37 2016 +0530 +++ b/source/common/threadpool.hFri May 20 15:35:37 2016 +0530 @@ -85,7 +85,7 @@ int m_numWorkers; void* m_numaMask; // node mask in linux, cpu mask in windows #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 -DWORD_PTR m_winCpuMask; +GROUP_AFFINITY m_groupAffinity; #endif bool m_isActive; @@ -99,6 +99,7 @@ bool start(); void stopWorkers(); void setCurrentThreadAffinity(); +void setThreadNodeAffinity(void *numaMask); int tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t secondTryBitmap); int tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& master); @@ -106,7 +107,6 @@ static int getCpuCount(); static int getNumaNodeCount(); -static void setThreadNodeAffinity(void *numaMask); }; /* Any worker thread may enlist the help of idle worker threads from the same ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 4 of 7] threadpool.cpp: window specific header 'winnt.h'
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1463738737 -19800 # Fri May 20 15:35:37 2016 +0530 # Node ID 598afbba907e06563ac08c0058abdbd7ba50d5d8 # Parent ab1b27a1712d581c32b007f0f72e482f4a83905d threadpool.cpp: window specific header 'winnt.h' diff -r ab1b27a1712d -r 598afbba907e source/common/threadpool.cpp --- a/source/common/threadpool.cpp Fri May 20 15:35:37 2016 +0530 +++ b/source/common/threadpool.cpp Fri May 20 15:35:37 2016 +0530 @@ -27,7 +27,10 @@ #include "threading.h" #include + +#if _WIN32 #include +#endif // _WIN32 #if X86_64 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 5 of 7] threadpool.cpp: fix linux error: #elif with no expression
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1463738737 -19800 # Fri May 20 15:35:37 2016 +0530 # Node ID e96bb0aaec630d44ea318222ae221fe116f4f11a # Parent 598afbba907e06563ac08c0058abdbd7ba50d5d8 threadpool.cpp: fix linux error: #elif with no expression diff -r 598afbba907e -r e96bb0aaec63 source/common/threadpool.cpp --- a/source/common/threadpool.cpp Fri May 20 15:35:37 2016 +0530 +++ b/source/common/threadpool.cpp Fri May 20 15:35:37 2016 +0530 @@ -279,7 +279,7 @@ if (bNumaSupport >= 0) cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++; } -#elif +#else // NUMA not supported cpusPerNode[0] = getCpuCount(); #endif ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH] ThreadPool.cpp: fix getCpuCount function for windows systems
Please ignore this; I am sending an updated patch. Thanks. Regards, Praveen On Tue, May 17, 2016 at 7:17 PM, <prav...@multicorewareinc.com> wrote: > # HG changeset patch > # User Praveen Tiwari <prav...@multicorewareinc.com> > # Date 1463492830 -19800 > # Tue May 17 19:17:10 2016 +0530 > # Node ID cf3c2e0dce0997a499ae1d50fda6891cae83e685 > # Parent 372fc5b12ed6003f8784702956ccf7203ea68a2e > ThreadPool.cpp: fix getCpuCount function for windows systems > > diff -r 372fc5b12ed6 -r cf3c2e0dce09 source/common/threadpool.cpp > --- a/source/common/threadpool.cpp Tue May 17 19:06:36 2016 +0530 > +++ b/source/common/threadpool.cpp Tue May 17 19:17:10 2016 +0530 > @@ -545,9 +545,17 @@ > int ThreadPool::getCpuCount() > { > #if _WIN32 > -SYSTEM_INFO sysinfo; > -GetSystemInfo(&sysinfo); > -return sysinfo.dwNumberOfProcessors; > +enum { MAX_NODE_NUM = 127 }; > +int cpus = 0; > +int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM); > +PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY; > +for (int i = 0; i < numNumaNodes; i++) > +{ > +GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer); > +cpus += (int)bitCount(groupAffinityPointer->Mask); > +} > +delete groupAffinityPointer; > +return cpus; > #elif __unix__ && X265_ARCH_ARM > /* Return the number of processors configured by OS. Because, most > embedded linux distributions > * uses only one processor as the scheduler doesn't have enough work > to utilize all processors */ > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH] ThreadPool.cpp: fix core count for windows machines
Please ignore this sending updated patch. Thanks Regards, Praveen On Tue, May 17, 2016 at 8:01 PM, Pradeep Ramachandran < prad...@multicorewareinc.com> wrote: > > On Tue, May 17, 2016 at 7:07 PM, <prav...@multicorewareinc.com> wrote: > >> # HG changeset patch >> # User Praveen Tiwari <prav...@multicorewareinc.com> >> # Date 1463492196 -19800 >> # Tue May 17 19:06:36 2016 +0530 >> # Node ID 372fc5b12ed6003f8784702956ccf7203ea68a2e >> # Parent e5b5bdc3c154f908706fb75e006f9abf9b3de96f >> ThreadPool.cpp: fix core count for windows machines >> >> diff -r e5b5bdc3c154 -r 372fc5b12ed6 source/common/threadpool.cpp >> --- a/source/common/threadpool.cpp Sat May 14 07:29:46 2016 +0530 >> +++ b/source/common/threadpool.cpp Tue May 17 19:06:36 2016 +0530 >> @@ -27,6 +27,7 @@ >> #include "threading.h" >> >> #include >> +#include >> >> #if X86_64 >> >> @@ -64,6 +65,18 @@ >> # define strcasecmp _stricmp >> #endif >> >> +uint64_t bitCount(uint64_t value) >> +{ >> +uint64_t count = 0; >> +while (value > 0) // until all bits are zero >> +{ >> +if ((value & 1) == 1) // check lower bit >> +count++; >> +value >>= 1; // shift bits, removing lower bit >> +} >> +return count; >> +} >> + >> namespace X265_NS { >> // x265 private namespace >> >> @@ -238,7 +251,6 @@ >> memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool)); >> >> int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM); >> -int cpuCount = getCpuCount(); >> bool bNumaSupport = false; >> >> #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 >> @@ -248,20 +260,28 @@ >> #endif >> >> >> +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 >> +PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY; >> +for (int i = 0; i < numNumaNodes; i++) >> +{ >> +GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer); >> +cpusPerNode[i] = (int)bitCount(groupAffinityPointer->Mask); >> +} >> +delete groupAffinityPointer; >> +#elif HAVE_LIBNUMA >> +int cpuCount = getCpuCount(); >> > > Can we move to the cleaner 
implementation of not relying on CPU counts for > non-windows platforms also? > > >> for (int i = 0; i < cpuCount; i++) >> { >> -#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 >> -UCHAR node; >> -if (GetNumaProcessorNode((UCHAR)i, )) >> -cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++; >> -else >> -#elif HAVE_LIBNUMA >> if (bNumaSupport >= 0) >> cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++; >> -else >> +} >> +#elif >> +int cpuCount = getCpuCount(); >> +for (int i = 0; i < cpuCount; i++) >> +{ >> +cpusPerNode[0]++; >> +} >> > > How about cpusPerNode[0] = getCpuCount() here? The for loop is unnecessary. > > >> #endif >> -cpusPerNode[0]++; >> -} >> >> if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG) >> for (int i = 0; i < numNumaNodes; i++) >> ___ >> x265-devel mailing list >> x265-devel@videolan.org >> https://mailman.videolan.org/listinfo/x265-devel >> > > > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] ThreadPool.cpp: fix getCpuCount function for windows systems
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1463492830 -19800 # Tue May 17 19:17:10 2016 +0530 # Node ID cf3c2e0dce0997a499ae1d50fda6891cae83e685 # Parent 372fc5b12ed6003f8784702956ccf7203ea68a2e ThreadPool.cpp: fix getCpuCount function for windows systems diff -r 372fc5b12ed6 -r cf3c2e0dce09 source/common/threadpool.cpp --- a/source/common/threadpool.cpp Tue May 17 19:06:36 2016 +0530 +++ b/source/common/threadpool.cpp Tue May 17 19:17:10 2016 +0530 @@ -545,9 +545,17 @@ int ThreadPool::getCpuCount() { #if _WIN32 -SYSTEM_INFO sysinfo; -GetSystemInfo(&sysinfo); -return sysinfo.dwNumberOfProcessors; +enum { MAX_NODE_NUM = 127 }; +int cpus = 0; +int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM); +PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY; +for (int i = 0; i < numNumaNodes; i++) +{ +GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer); +cpus += (int)bitCount(groupAffinityPointer->Mask); +} +delete groupAffinityPointer; +return cpus; #elif __unix__ && X265_ARCH_ARM /* Return the number of processors configured by OS. Because, most embedded linux distributions * uses only one processor as the scheduler doesn't have enough work to utilize all processors */ ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] ThreadPool.cpp: fix core count for windows machines
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1463492196 -19800 # Tue May 17 19:06:36 2016 +0530 # Node ID 372fc5b12ed6003f8784702956ccf7203ea68a2e # Parent e5b5bdc3c154f908706fb75e006f9abf9b3de96f ThreadPool.cpp: fix core count for windows machines diff -r e5b5bdc3c154 -r 372fc5b12ed6 source/common/threadpool.cpp --- a/source/common/threadpool.cpp Sat May 14 07:29:46 2016 +0530 +++ b/source/common/threadpool.cpp Tue May 17 19:06:36 2016 +0530 @@ -27,6 +27,7 @@ #include "threading.h" #include +#include #if X86_64 @@ -64,6 +65,18 @@ # define strcasecmp _stricmp #endif +uint64_t bitCount(uint64_t value) +{ +uint64_t count = 0; +while (value > 0) // until all bits are zero +{ +if ((value & 1) == 1) // check lower bit +count++; +value >>= 1; // shift bits, removing lower bit +} +return count; +} + namespace X265_NS { // x265 private namespace @@ -238,7 +251,6 @@ memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool)); int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM); -int cpuCount = getCpuCount(); bool bNumaSupport = false; #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 @@ -248,20 +260,28 @@ #endif +#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 +PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY; +for (int i = 0; i < numNumaNodes; i++) +{ +GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer); +cpusPerNode[i] = (int)bitCount(groupAffinityPointer->Mask); +} +delete groupAffinityPointer; +#elif HAVE_LIBNUMA +int cpuCount = getCpuCount(); for (int i = 0; i < cpuCount; i++) { -#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 -UCHAR node; -if (GetNumaProcessorNode((UCHAR)i, )) -cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++; -else -#elif HAVE_LIBNUMA if (bNumaSupport >= 0) cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++; -else +} +#elif +int cpuCount = getCpuCount(); +for (int i = 0; i < cpuCount; i++) +{ +cpusPerNode[0]++; +} #endif -cpusPerNode[0]++; -} if 
(bNumaSupport && p->logLevel >= X265_LOG_DEBUG) for (int i = 0; i < numNumaNodes; i++) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH] motion.cpp: optimize 'X265_DIA_SEARCH' by eliminating costly branch instructions
Yes, this is for eliminating the if...else, so it performs a conditional assignment to preserve the correctness of the code. I will try to update the macro definition. Thanks. -Original Message- From: "chen" <chenm...@163.com> Sent: 09-03-2016 05:52 To: "Development for x265" <x265-devel@videolan.org> Subject: Re: [x265] [PATCH] motion.cpp: optimize 'X265_DIA_SEARCH' by eliminating costly branch instructions I suggest you modify the macro. Also, this patch depends on the side effect of a conditional statement, which is bad code style. At 2016-03-08 22:48:49,prav...@multicorewareinc.com wrote: ># HG changeset patch ># User Praveen Tiwari <prav...@multicorewareinc.com> ># Date 1457448163 -19800 ># Tue Mar 08 20:12:43 2016 +0530 ># Node ID 519441d72cf723dc3b279a91a6080f329729cb49 ># Parent 0e1b6472c05e3a53538d8e064e502d8a7508eb6e >motion.cpp: optimize 'X265_DIA_SEARCH' by eliminating costly branch >instructions > >diff -r 0e1b6472c05e -r 519441d72cf7 source/encoder/motion.cpp >--- a/source/encoder/motion.cppTue Mar 08 19:08:57 2016 +0530 >+++ b/source/encoder/motion.cppTue Mar 08 20:12:43 2016 +0530 >@@ -659,10 +659,10 @@ > do > { > COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs); >-COPY1_IF_LT(bcost, (costs[0] << 4) + 1); >-COPY1_IF_LT(bcost, (costs[1] << 4) + 3); >-COPY1_IF_LT(bcost, (costs[2] << 4) + 4); >-COPY1_IF_LT(bcost, (costs[3] << 4) + 12); >+(((costs[0] << 4) + 1) < bcost) && (bcost = ((costs[0] << 4) + >1)); // if ((y) < (x)) (x) = (y); >+(((costs[1] << 4) + 3) < bcost) && (bcost = ((costs[1] << 4) + >3)); >+(((costs[2] << 4) + 4) < bcost) && (bcost = ((costs[2] << 4) + >4)); >+(((costs[3] << 4) + 12) < bcost) && (bcost = ((costs[3] << 4) + >12)); > if (!(bcost & 15)) > break; > bmv.x -= (bcost << 28) >> 30; >___ >x265-devel mailing list >x265-devel@videolan.org >https://mailman.videolan.org/listinfo/x265-devel___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] motion.cpp: optimization, eliminate branching
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1457444337 -19800 # Tue Mar 08 19:08:57 2016 +0530 # Node ID 0e1b6472c05e3a53538d8e064e502d8a7508eb6e # Parent d50663b272c4f308de6f669afe1323990971e08d motion.cpp: optimization, eliminate branching diff -r d50663b272c4 -r 0e1b6472c05e source/encoder/motion.cpp --- a/source/encoder/motion.cpp Tue Mar 08 15:50:05 2016 +0530 +++ b/source/encoder/motion.cpp Tue Mar 08 19:08:57 2016 +0530 @@ -30,6 +30,7 @@ #if _MSC_VER #pragma warning(disable: 4127) // conditional expression is constant (macros use this construct) +#pragma warning (disable: 4706) // assignment within conditional expression #endif using namespace X265_NS; @@ -762,8 +763,7 @@ ucost2 = bcost; if (bmv.notZero() && bmv != pmv) DIA1_ITER(bmv.x, bmv.y); -if (bcost == ucost2) -cross_start = 3; +(bcost == ucost2) && (cross_start = 3); /* Early Termination */ omv = bmv; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] param: cleanup, print reconfigured param option along with its old and new value
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1457432405 -19800 # Tue Mar 08 15:50:05 2016 +0530 # Node ID d50663b272c4f308de6f669afe1323990971e08d # Parent 88aebc166fa8e16f91d5f0acce77690003be9d91 param: cleanup, print reconfigured param option along with its old and new value diff -r 88aebc166fa8 -r d50663b272c4 source/common/param.cpp --- a/source/common/param.cpp Fri Mar 04 16:59:45 2016 +0530 +++ b/source/common/param.cpp Tue Mar 08 15:50:05 2016 +0530 @@ -1373,36 +1373,32 @@ if (!param || !reconfiguredParam) return; -x265_log(param,X265_LOG_INFO, "Reconfigured param options :\n"); - -char buf[80] = { 0 }; char tmp[40]; -#define TOOLCMP(COND1, COND2, STR, VAL) if (COND1 != COND2) { sprintf(tmp, STR, VAL); appendtool(param, buf, sizeof(buf), tmp); } -TOOLCMP(param->maxNumReferences, reconfiguredParam->maxNumReferences, "ref=%d", reconfiguredParam->maxNumReferences); -TOOLCMP(param->maxTUSize, reconfiguredParam->maxTUSize, "max-tu-size=%d", reconfiguredParam->maxTUSize); -TOOLCMP(param->searchRange, reconfiguredParam->searchRange, "merange=%d", reconfiguredParam->searchRange); -TOOLCMP(param->subpelRefine, reconfiguredParam->subpelRefine, "subme= %d", reconfiguredParam->subpelRefine); -TOOLCMP(param->rdLevel, reconfiguredParam->rdLevel, "rd=%d", reconfiguredParam->rdLevel); -TOOLCMP(param->psyRd, reconfiguredParam->psyRd, "psy-rd=%.2lf", reconfiguredParam->psyRd); -TOOLCMP(param->rdoqLevel, reconfiguredParam->rdoqLevel, "rdoq=%d", reconfiguredParam->rdoqLevel); -TOOLCMP(param->psyRdoq, reconfiguredParam->psyRdoq, "psy-rdoq=%.2lf", reconfiguredParam->psyRdoq); -TOOLCMP(param->noiseReductionIntra, reconfiguredParam->noiseReductionIntra, "nr-intra=%d", reconfiguredParam->noiseReductionIntra); -TOOLCMP(param->noiseReductionInter, reconfiguredParam->noiseReductionInter, "nr-inter=%d", reconfiguredParam->noiseReductionInter); -TOOLCMP(param->bEnableTSkipFast, reconfiguredParam->bEnableTSkipFast, "tskip-fast=%d", 
reconfiguredParam->bEnableTSkipFast); -TOOLCMP(param->bEnableSignHiding, reconfiguredParam->bEnableSignHiding, "signhide=%d", reconfiguredParam->bEnableSignHiding); -TOOLCMP(param->bEnableFastIntra, reconfiguredParam->bEnableFastIntra, "fast-intra=%d", reconfiguredParam->bEnableFastIntra); -if (param->bEnableLoopFilter && (param->deblockingFilterBetaOffset != reconfiguredParam->deblockingFilterBetaOffset +#define TOOLCMP(COND1, COND2, STR, OLD_VAL, NEW_VAL) if (COND1 != COND2) { sprintf(tmp, STR, OLD_VAL, NEW_VAL); printf("\n%s\n", tmp);} +TOOLCMP(param->maxNumReferences, reconfiguredParam->maxNumReferences, "[x265] Reconfigure: ref=%d to %d", param->maxNumReferences, reconfiguredParam->maxNumReferences); +TOOLCMP(param->maxTUSize, reconfiguredParam->maxTUSize, "[x265] Reconfigure: max-tu-size=%d to %d", param->maxTUSize, reconfiguredParam->maxTUSize); +TOOLCMP(param->searchRange, reconfiguredParam->searchRange, "[x265] Reconfigure: merange=%d to %d", param->searchRange, reconfiguredParam->searchRange); +TOOLCMP(param->subpelRefine, reconfiguredParam->subpelRefine, "[x265] Reconfigure: subme=%d to %d", param->subpelRefine, reconfiguredParam->subpelRefine); +TOOLCMP(param->rdLevel, reconfiguredParam->rdLevel, "[x265] Reconfigure: rd=%d to %d", param->rdLevel, reconfiguredParam->rdLevel); +TOOLCMP(param->psyRd, reconfiguredParam->psyRd, "[x265] Reconfigure: psy-rd=%.2lf to %.2lf", param->psyRd, reconfiguredParam->psyRd); +TOOLCMP(param->rdoqLevel, reconfiguredParam->rdoqLevel, "[x265] Reconfigure: rdoq=%d to %d", param->rdoqLevel, reconfiguredParam->rdoqLevel); +TOOLCMP(param->psyRdoq, reconfiguredParam->psyRdoq, "[x265] Reconfigure: psy-rdoq=%.2lf to %.2lf", param->psyRdoq, reconfiguredParam->psyRdoq); +TOOLCMP(param->noiseReductionIntra, reconfiguredParam->noiseReductionIntra, "[x265] Reconfigure: nr-intra=%d to %d", param->noiseReductionIntra, reconfiguredParam->noiseReductionIntra); +TOOLCMP(param->noiseReductionInter, reconfiguredParam->noiseReductionInter, 
"[x265] Reconfigure: nr-inter=%d to %d", param->noiseReductionInter, reconfiguredParam->noiseReductionInter); +TOOLCMP(param->bEnableTSkipFast, reconfiguredParam->bEnableTSkipFast, "[x265] Reconfigure: tskip-fast=%d to %d", param->bEnableTSkipFast, reconfiguredParam->bEnableTSkipFast); +TOOLCMP(param->
Re: [x265] [PATCH] param: cleanup, print reconfigured param option along with its old and configured value
Please ignore the patch need to update. Thanks. Regards, Praveen On Tue, Mar 8, 2016 at 10:57 AM, <prav...@multicorewareinc.com> wrote: > # HG changeset patch > # User Praveen Tiwari <prav...@multicorewareinc.com> > # Date 1457356750 -19800 > # Mon Mar 07 18:49:10 2016 +0530 > # Node ID 6f7dbb1c901cb5b5b88cc20c3213906465021338 > # Parent 88aebc166fa8e16f91d5f0acce77690003be9d91 > param: cleanup, print reconfigured param option along with its old and > configured value > > diff -r 88aebc166fa8 -r 6f7dbb1c901c source/common/param.cpp > --- a/source/common/param.cpp Fri Mar 04 16:59:45 2016 +0530 > +++ b/source/common/param.cpp Mon Mar 07 18:49:10 2016 +0530 > @@ -1373,36 +1373,31 @@ > if (!param || !reconfiguredParam) > return; > > -x265_log(param,X265_LOG_INFO, "Reconfigured param options :\n"); > - > -char buf[80] = { 0 }; > char tmp[40]; > -#define TOOLCMP(COND1, COND2, STR, VAL) if (COND1 != COND2) { > sprintf(tmp, STR, VAL); appendtool(param, buf, sizeof(buf), tmp); } > -TOOLCMP(param->maxNumReferences, reconfiguredParam->maxNumReferences, > "ref=%d", reconfiguredParam->maxNumReferences); > -TOOLCMP(param->maxTUSize, reconfiguredParam->maxTUSize, > "max-tu-size=%d", reconfiguredParam->maxTUSize); > -TOOLCMP(param->searchRange, reconfiguredParam->searchRange, > "merange=%d", reconfiguredParam->searchRange); > -TOOLCMP(param->subpelRefine, reconfiguredParam->subpelRefine, "subme= > %d", reconfiguredParam->subpelRefine); > -TOOLCMP(param->rdLevel, reconfiguredParam->rdLevel, "rd=%d", > reconfiguredParam->rdLevel); > -TOOLCMP(param->psyRd, reconfiguredParam->psyRd, "psy-rd=%.2lf", > reconfiguredParam->psyRd); > -TOOLCMP(param->rdoqLevel, reconfiguredParam->rdoqLevel, "rdoq=%d", > reconfiguredParam->rdoqLevel); > -TOOLCMP(param->psyRdoq, reconfiguredParam->psyRdoq, "psy-rdoq=%.2lf", > reconfiguredParam->psyRdoq); > -TOOLCMP(param->noiseReductionIntra, > reconfiguredParam->noiseReductionIntra, "nr-intra=%d", > reconfiguredParam->noiseReductionIntra); > 
-TOOLCMP(param->noiseReductionInter, > reconfiguredParam->noiseReductionInter, "nr-inter=%d", > reconfiguredParam->noiseReductionInter); > -TOOLCMP(param->bEnableTSkipFast, reconfiguredParam->bEnableTSkipFast, > "tskip-fast=%d", reconfiguredParam->bEnableTSkipFast); > -TOOLCMP(param->bEnableSignHiding, > reconfiguredParam->bEnableSignHiding, "signhide=%d", > reconfiguredParam->bEnableSignHiding); > -TOOLCMP(param->bEnableFastIntra, reconfiguredParam->bEnableFastIntra, > "fast-intra=%d", reconfiguredParam->bEnableFastIntra); > -if (param->bEnableLoopFilter && (param->deblockingFilterBetaOffset != > reconfiguredParam->deblockingFilterBetaOffset > +#define TOOLCMP(COND1, COND2, STR, OLD_VAL, NEW_VAL) if (COND1 != COND2) > { sprintf(tmp, STR, OLD_VAL, NEW_VAL);} > +TOOLCMP(param->maxNumReferences, reconfiguredParam->maxNumReferences, > "[x265] Reconfigure: ref=%d to %d", param->maxNumReferences, > reconfiguredParam->maxNumReferences); > +TOOLCMP(param->maxTUSize, reconfiguredParam->maxTUSize, "[x265] > Reconfigure: max-tu-size=%d to %d", param->maxTUSize, > reconfiguredParam->maxTUSize); > +TOOLCMP(param->searchRange, reconfiguredParam->searchRange, "[x265] > Reconfigure: merange=%d to %d", param->searchRange, > reconfiguredParam->searchRange); > +TOOLCMP(param->subpelRefine, reconfiguredParam->subpelRefine, "[x265] > Reconfigure: subme=%d to %d", param->subpelRefine, > reconfiguredParam->subpelRefine); > +TOOLCMP(param->rdLevel, reconfiguredParam->rdLevel, "[x265] > Reconfigure: rd=%d to %d", param->rdLevel, reconfiguredParam->rdLevel); > +TOOLCMP(param->psyRd, reconfiguredParam->psyRd, "[x265] Reconfigure: > psy-rd=%.2lf to %.2lf", param->psyRd, reconfiguredParam->psyRd); > +TOOLCMP(param->rdoqLevel, reconfiguredParam->rdoqLevel, "[x265] > Reconfigure: rdoq=%d to %d", param->rdoqLevel, > reconfiguredParam->rdoqLevel); > +TOOLCMP(param->psyRdoq, reconfiguredParam->psyRdoq, "[x265] > Reconfigure: psy-rdoq=%.2lf to %.2lf", param->psyRdoq, > reconfiguredParam->psyRdoq); 
> +TOOLCMP(param->noiseReductionIntra, > reconfiguredParam->noiseReductionIntra, "[x265] Reconfigure: nr-intra=%d to > %d", param->noiseReductionIntra, reconf
[x265] [PATCH] param: cleanup, print reconfigured param option along with its old and configured value
# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1457356750 -19800 # Mon Mar 07 18:49:10 2016 +0530 # Node ID 6f7dbb1c901cb5b5b88cc20c3213906465021338 # Parent 88aebc166fa8e16f91d5f0acce77690003be9d91 param: cleanup, print reconfigured param option along with its old and configured value diff -r 88aebc166fa8 -r 6f7dbb1c901c source/common/param.cpp --- a/source/common/param.cpp Fri Mar 04 16:59:45 2016 +0530 +++ b/source/common/param.cpp Mon Mar 07 18:49:10 2016 +0530 @@ -1373,36 +1373,31 @@ if (!param || !reconfiguredParam) return; -x265_log(param,X265_LOG_INFO, "Reconfigured param options :\n"); - -char buf[80] = { 0 }; char tmp[40]; -#define TOOLCMP(COND1, COND2, STR, VAL) if (COND1 != COND2) { sprintf(tmp, STR, VAL); appendtool(param, buf, sizeof(buf), tmp); } -TOOLCMP(param->maxNumReferences, reconfiguredParam->maxNumReferences, "ref=%d", reconfiguredParam->maxNumReferences); -TOOLCMP(param->maxTUSize, reconfiguredParam->maxTUSize, "max-tu-size=%d", reconfiguredParam->maxTUSize); -TOOLCMP(param->searchRange, reconfiguredParam->searchRange, "merange=%d", reconfiguredParam->searchRange); -TOOLCMP(param->subpelRefine, reconfiguredParam->subpelRefine, "subme= %d", reconfiguredParam->subpelRefine); -TOOLCMP(param->rdLevel, reconfiguredParam->rdLevel, "rd=%d", reconfiguredParam->rdLevel); -TOOLCMP(param->psyRd, reconfiguredParam->psyRd, "psy-rd=%.2lf", reconfiguredParam->psyRd); -TOOLCMP(param->rdoqLevel, reconfiguredParam->rdoqLevel, "rdoq=%d", reconfiguredParam->rdoqLevel); -TOOLCMP(param->psyRdoq, reconfiguredParam->psyRdoq, "psy-rdoq=%.2lf", reconfiguredParam->psyRdoq); -TOOLCMP(param->noiseReductionIntra, reconfiguredParam->noiseReductionIntra, "nr-intra=%d", reconfiguredParam->noiseReductionIntra); -TOOLCMP(param->noiseReductionInter, reconfiguredParam->noiseReductionInter, "nr-inter=%d", reconfiguredParam->noiseReductionInter); -TOOLCMP(param->bEnableTSkipFast, reconfiguredParam->bEnableTSkipFast, "tskip-fast=%d", 
reconfiguredParam->bEnableTSkipFast); -TOOLCMP(param->bEnableSignHiding, reconfiguredParam->bEnableSignHiding, "signhide=%d", reconfiguredParam->bEnableSignHiding); -TOOLCMP(param->bEnableFastIntra, reconfiguredParam->bEnableFastIntra, "fast-intra=%d", reconfiguredParam->bEnableFastIntra); -if (param->bEnableLoopFilter && (param->deblockingFilterBetaOffset != reconfiguredParam->deblockingFilterBetaOffset +#define TOOLCMP(COND1, COND2, STR, OLD_VAL, NEW_VAL) if (COND1 != COND2) { sprintf(tmp, STR, OLD_VAL, NEW_VAL);} +TOOLCMP(param->maxNumReferences, reconfiguredParam->maxNumReferences, "[x265] Reconfigure: ref=%d to %d", param->maxNumReferences, reconfiguredParam->maxNumReferences); +TOOLCMP(param->maxTUSize, reconfiguredParam->maxTUSize, "[x265] Reconfigure: max-tu-size=%d to %d", param->maxTUSize, reconfiguredParam->maxTUSize); +TOOLCMP(param->searchRange, reconfiguredParam->searchRange, "[x265] Reconfigure: merange=%d to %d", param->searchRange, reconfiguredParam->searchRange); +TOOLCMP(param->subpelRefine, reconfiguredParam->subpelRefine, "[x265] Reconfigure: subme=%d to %d", param->subpelRefine, reconfiguredParam->subpelRefine); +TOOLCMP(param->rdLevel, reconfiguredParam->rdLevel, "[x265] Reconfigure: rd=%d to %d", param->rdLevel, reconfiguredParam->rdLevel); +TOOLCMP(param->psyRd, reconfiguredParam->psyRd, "[x265] Reconfigure: psy-rd=%.2lf to %.2lf", param->psyRd, reconfiguredParam->psyRd); +TOOLCMP(param->rdoqLevel, reconfiguredParam->rdoqLevel, "[x265] Reconfigure: rdoq=%d to %d", param->rdoqLevel, reconfiguredParam->rdoqLevel); +TOOLCMP(param->psyRdoq, reconfiguredParam->psyRdoq, "[x265] Reconfigure: psy-rdoq=%.2lf to %.2lf", param->psyRdoq, reconfiguredParam->psyRdoq); +TOOLCMP(param->noiseReductionIntra, reconfiguredParam->noiseReductionIntra, "[x265] Reconfigure: nr-intra=%d to %d", param->noiseReductionIntra, reconfiguredParam->noiseReductionIntra); +TOOLCMP(param->noiseReductionInter, reconfiguredParam->noiseReductionInter, "[x265] Reconfigure: 
nr-inter=%d to %d", param->noiseReductionInter, reconfiguredParam->noiseReductionInter); +TOOLCMP(param->bEnableTSkipFast, reconfiguredParam->bEnableTSkipFast, "[x265] Reconfigure: tskip-fast=%d to %d", param->bEnableTSkipFast, reconfiguredParam->bEnableTSkipFast); +TOOLCMP(param->bEnableSig
[x265] Fwd: [PATCH] asm: avx2 code for weight_sp() 16bpp
-- Forwarded message -- From: aasaipr...@multicorewareinc.com Date: Mon, Jun 29, 2015 at 4:51 PM Subject: [x265] [PATCH] asm: avx2 code for weight_sp() 16bpp To: x265-devel@videolan.org # HG changeset patch # User Aasaipriya Chandran aasaipr...@multicorewareinc.com # Date 1435562395 -19800 # Mon Jun 29 12:49:55 2015 +0530 # Node ID bebe4e496a432608cf0a9c495debd1970caa387e # Parent 9feee64efa440c25f016d15ae982789e5393a77e asm: avx2 code for weight_sp() 16bpp avx2: weight_sp 11.37x 4496.63 51139.20 sse4: weight_sp 6.48x8163.87 52870.36 diff -r 9feee64efa44 -r bebe4e496a43 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jun 26 15:29:51 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jun 29 12:49:55 2015 +0530 @@ -1517,6 +1517,7 @@ p.scale1D_128to64 = PFX(scale1D_128to64_avx2); p.scale2D_64to32 = PFX(scale2D_64to32_avx2); p.weight_pp = PFX(weight_pp_avx2); +p.weight_sp = PFX(weight_sp_avx2); p.sign = PFX(calSign_avx2); p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2); diff -r 9feee64efa44 -r bebe4e496a43 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Fri Jun 26 15:29:51 2015 +0530 +++ b/source/common/x86/pixel-util8.asm Mon Jun 29 12:49:55 2015 +0530 @@ -1674,8 +1674,128 @@ dec r5d jnz .loopH RET - -%if ARCH_X86_64 +%endif + + +%if HIGH_BIT_DEPTH +INIT_YMM avx2 +cglobal weight_sp, 6,7,9 +mova m1, [pw_1023] +mova m2, [pw_1] +mov r6d, r7m r7 is 8th register (0-7). so it should be cglobal weight_sp, 6, 8, 9 and ARCH_X86_64 only code. 
+shl r6d, 16 +orr6d, r6m +vpbroadcastd m3, r6d ; m3 = [round w0] +movd xm4, r8m ; m4 = [shift] +vpbroadcastd m5, r9m ; m5 = [offset] + +; correct row stride +add r3d, r3d +add r2d, r2d +mov r6d, r4d +and r6d, ~(mmsize / SIZEOF_PIXEL - 1) +sub r3d, r6d +sub r3d, r6d +sub r2d, r6d +sub r2d, r6d + +; generate partial width mask (MUST BE IN YMM0) +mov r6d, r4d +and r6d, (mmsize / SIZEOF_PIXEL - 1) +movd xm0, r6d +pshuflw m0, m0, 0 +punpcklqdqm0, m0 +vinserti128 m0, m0, xm0, 1 +pcmpgtw m0, [pw_0_15] + +.loopH: +mov r6d, r4d + +.loopW: +movu m6, [r0] +paddw m6, [pw_2000] + +punpcklwd m7, m6, m2 +pmaddwd m7, m3 ;(round w0) +psrad m7, xm4 ;(shift) +paddd m7, m5 ;(offset) + +punpckhwd m6, m2 +pmaddwd m6, m3 +psrad m6, xm4 +paddd m6, m5 + +packusdw m7, m6 +pminuwm7, m1 + +sub r6d, (mmsize / SIZEOF_PIXEL) +jl.width14 +movu [r1], m7 +lea r0, [r0 + mmsize] +lea r1, [r1 + mmsize] +je.nextH +jmp .loopW + +.width14: +add r6d, 16 +cmp r6d, 14 +jl.width12 +movu [r1], xm7 +vextracti128 xm8, m7, 1 +movq [r1 + 16], xm8 +pextrd[r1 + 24], xm8, 2 +je.nextH + +.width12: +cmp r6d, 12 +jl.width10 +movu [r1], xm7 +vextracti128 xm8, m7, 1 +movq [r1 + 16], xm8 +je.nextH + +.width10: +cmp r6d, 10 +jl.width8 +movu [r1], xm7 +vextracti128 xm8, m7, 1 +movd [r1 + 16], xm8 +je.nextH + +.width8: +cmp r6d, 8 +jl.width6 +movu [r1], xm7 +je.nextH + +.width6 +cmp r6d, 6 +jl.width4 +movq [r1], xm7 +pextrd[r1 + 8], xm7, 2 +je.nextH + +.width4: +cmp r6d, 4 +jl
Re: [x265] Fwd: [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp
You would like to visit 8bpp code as well. Regards, Praveen On Mon, Jun 29, 2015 at 11:24 AM, Rajesh Paulraj raj...@multicorewareinc.com wrote: We don't need to push this patch. I will improve sse version for the same size. We may not need avx2 code for this.(will make sure after rewriting sse2 code) On Mon, Jun 29, 2015 at 10:21 AM, Deepthi Nandakumar deep...@multicorewareinc.com wrote: This does not build for HBD disabled On Fri, Jun 26, 2015 at 5:40 PM, Rajesh Paulraj raj...@multicorewareinc.com wrote: yes. It looks like we need to optimize sse2 code. I will work on this. On Fri, Jun 26, 2015 at 5:31 PM, Praveen Tiwari prav...@multicorewareinc.com wrote: -- Forwarded message -- From: raj...@multicorewareinc.com Date: Fri, Jun 26, 2015 at 3:14 PM Subject: [x265] [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp To: x265-devel@videolan.org # HG changeset patch # User Rajesh Paulrajraj...@multicorewareinc.com # Date 1435311076 -19800 # Fri Jun 26 15:01:16 2015 +0530 # Node ID 956401f1a679f1e71181b704d64e4acdb6f1a93f # Parent d64227e54233d1646c55bcb4b0b831e5340009ed asm: pixelavg_pp[8xN] avx2 code for 10bpp avx2: avg_pp[ 8x4] 4.39x145.09 636.75 avg_pp[ 8x8] 5.33x215.27 1146.55 avg_pp[ 8x16] 6.50x336.88 2190.68 avg_pp[ 8x32] 7.71x579.86 4470.84 sse2: avg_pp[ 8x4] 2.31x287.63 663.94 avg_pp[ 8x8] 3.26x370.21 1205.26 avg_pp[ 8x16] 3.99x581.63 2323.25 avg_pp[ 8x32] 4.78x995.79 4755.58 Basically, our macro pixel_avg_8xN just SSE (just simple syntax conversion for avx2, not using 256 bit capability) so, fundamentally there should be no major improvement in speed. But improvements 287.63c - 145.09c, 370.21c - 215.27 etc are quite good. Does it means SSE2 codes are not optimize well ? Can you revisit SSE code using this algorithm? 
diff -r d64227e54233 -r 956401f1a679 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jun 25 16:25:51 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jun 26 15:01:16 2015 +0530 @@ -1362,6 +1362,10 @@ p.cu[BLOCK_32x32].intra_pred[33]= PFX(intra_pred_ang32_33_avx2); p.cu[BLOCK_32x32].intra_pred[34]= PFX(intra_pred_ang32_2_avx2); +p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx2); +p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx2); +p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx2); +p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx2); p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2); p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2); p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2); diff -r d64227e54233 -r 956401f1a679 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmThu Jun 25 16:25:51 2015 +0530 +++ b/source/common/x86/mc-a.asmFri Jun 26 15:01:16 2015 +0530 @@ -4490,6 +4490,88 @@ RET %endif +%macro pixel_avg_W8 0 +movuxm0, [r2] +movuxm1, [r4] +pavgw xm0, xm1 +movu[r0], xm0 +movuxm2, [r2 + r3] +movuxm3, [r4 + r5] +pavgw xm2, xm3 +movu[r0 + r1], xm2 + +movuxm0, [r2 + r3 * 2] +movuxm1, [r4 + r5 * 2] +pavgw xm0, xm1 +movu[r0 + r1 * 2], xm0 +movuxm2, [r2 + r6] +movuxm3, [r4 + r7] +pavgw xm2, xm3 +movu[r0 + r8], xm2 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +lea r4, [r4 + 4 * r5] +%endmacro + +;--- +;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) +;--- +%if ARCH_X86_64 +INIT_YMM avx2 +cglobal pixel_avg_8x4, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +pixel_avg_W8 +RET + +cglobal pixel_avg_8x8, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +mov r9d, 2 +.loop +pixel_avg_W8 +dec r9d +jnz .loop +RET + +cglobal pixel_avg_8x16, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 
3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +mov r9d, 4 +.loop +pixel_avg_W8 +dec r9d +jnz .loop +RET + +cglobal pixel_avg_8x32, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3
[x265] Fwd: [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp
-- Forwarded message -- From: raj...@multicorewareinc.com Date: Fri, Jun 26, 2015 at 3:14 PM Subject: [x265] [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp To: x265-devel@videolan.org # HG changeset patch # User Rajesh Paulrajraj...@multicorewareinc.com # Date 1435311076 -19800 # Fri Jun 26 15:01:16 2015 +0530 # Node ID 956401f1a679f1e71181b704d64e4acdb6f1a93f # Parent d64227e54233d1646c55bcb4b0b831e5340009ed asm: pixelavg_pp[8xN] avx2 code for 10bpp avx2: avg_pp[ 8x4] 4.39x145.09 636.75 avg_pp[ 8x8] 5.33x215.27 1146.55 avg_pp[ 8x16] 6.50x336.88 2190.68 avg_pp[ 8x32] 7.71x579.86 4470.84 sse2: avg_pp[ 8x4] 2.31x287.63 663.94 avg_pp[ 8x8] 3.26x370.21 1205.26 avg_pp[ 8x16] 3.99x581.63 2323.25 avg_pp[ 8x32] 4.78x995.79 4755.58 diff -r d64227e54233 -r 956401f1a679 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jun 25 16:25:51 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jun 26 15:01:16 2015 +0530 @@ -1362,6 +1362,10 @@ p.cu[BLOCK_32x32].intra_pred[33]= PFX(intra_pred_ang32_33_avx2); p.cu[BLOCK_32x32].intra_pred[34]= PFX(intra_pred_ang32_2_avx2); +p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx2); +p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx2); +p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx2); +p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx2); p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2); p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2); p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2); diff -r d64227e54233 -r 956401f1a679 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmThu Jun 25 16:25:51 2015 +0530 +++ b/source/common/x86/mc-a.asmFri Jun 26 15:01:16 2015 +0530 @@ -4490,6 +4490,88 @@ RET %endif +%macro pixel_avg_W8 0 +movuxm0, [r2] +movuxm1, [r4] +pavgw xm0, xm1 +movu[r0], xm0 +movuxm2, [r2 + r3] +movuxm3, [r4 + r5] +pavgw xm2, xm3 +movu[r0 + r1], xm2 + Your macro is not using avx2 capabilities, did you check the performance of two rows combined ? 
It will reduce your pavgw and movu instruction by half. You can use vinserti128 to combine two rows at a time. +movuxm0, [r2 + r3 * 2] +movuxm1, [r4 + r5 * 2] +pavgw xm0, xm1 +movu[r0 + r1 * 2], xm0 +movuxm2, [r2 + r6] +movuxm3, [r4 + r7] +pavgw xm2, xm3 +movu[r0 + r8], xm2 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +lea r4, [r4 + 4 * r5] +%endmacro + +;--- +;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) +;--- +%if ARCH_X86_64 +INIT_YMM avx2 +cglobal pixel_avg_8x4, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +pixel_avg_W8 +RET + +cglobal pixel_avg_8x8, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +mov r9d, 2 +.loop +pixel_avg_W8 +dec r9d +jnz .loop +RET + +cglobal pixel_avg_8x16, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +mov r9d, 4 +.loop +pixel_avg_W8 +dec r9d +jnz .loop +RET + +cglobal pixel_avg_8x32, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +mov r9d, 8 +.loop +pixel_avg_W8 +dec r9d +jnz .loop +RET +%endif + %macro pixel_avg_H4 0 movum0, [r2] movum1, [r4] ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] Fwd: [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp
ahh, width is just 8*16 = 128, two rows at a time will need vextracti128 as well while storing, which goes to port5, a bottleneck port. pavgw is much cheaper than it. You may try to combine 16XN sizes. Regards, Praveen On Fri, Jun 26, 2015 at 3:40 PM, Rajesh Paulraj raj...@multicorewareinc.com wrote: I tried using vinserti128. But that reduces the performance than this one. So i kept this version. On Fri, Jun 26, 2015 at 3:37 PM, Praveen Tiwari prav...@multicorewareinc.com wrote: -- Forwarded message -- From: raj...@multicorewareinc.com Date: Fri, Jun 26, 2015 at 3:14 PM Subject: [x265] [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp To: x265-devel@videolan.org # HG changeset patch # User Rajesh Paulrajraj...@multicorewareinc.com # Date 1435311076 -19800 # Fri Jun 26 15:01:16 2015 +0530 # Node ID 956401f1a679f1e71181b704d64e4acdb6f1a93f # Parent d64227e54233d1646c55bcb4b0b831e5340009ed asm: pixelavg_pp[8xN] avx2 code for 10bpp avx2: avg_pp[ 8x4] 4.39x145.09 636.75 avg_pp[ 8x8] 5.33x215.27 1146.55 avg_pp[ 8x16] 6.50x336.88 2190.68 avg_pp[ 8x32] 7.71x579.86 4470.84 sse2: avg_pp[ 8x4] 2.31x287.63 663.94 avg_pp[ 8x8] 3.26x370.21 1205.26 avg_pp[ 8x16] 3.99x581.63 2323.25 avg_pp[ 8x32] 4.78x995.79 4755.58 diff -r d64227e54233 -r 956401f1a679 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jun 25 16:25:51 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jun 26 15:01:16 2015 +0530 @@ -1362,6 +1362,10 @@ p.cu[BLOCK_32x32].intra_pred[33]= PFX(intra_pred_ang32_33_avx2); p.cu[BLOCK_32x32].intra_pred[34]= PFX(intra_pred_ang32_2_avx2); +p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx2); +p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx2); +p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx2); +p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx2); p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2); p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2); p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2); diff -r 
d64227e54233 -r 956401f1a679 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmThu Jun 25 16:25:51 2015 +0530 +++ b/source/common/x86/mc-a.asmFri Jun 26 15:01:16 2015 +0530 @@ -4490,6 +4490,88 @@ RET %endif +%macro pixel_avg_W8 0 +movuxm0, [r2] +movuxm1, [r4] +pavgw xm0, xm1 +movu[r0], xm0 +movuxm2, [r2 + r3] +movuxm3, [r4 + r5] +pavgw xm2, xm3 +movu[r0 + r1], xm2 + Your macro is not using avx2 capabilities, did you check the performance of two rows combined ? It will reduce your pavgw and movu instruction by half. You can use vinserti128 to combine two rows at a time. +movuxm0, [r2 + r3 * 2] +movuxm1, [r4 + r5 * 2] +pavgw xm0, xm1 +movu[r0 + r1 * 2], xm0 +movuxm2, [r2 + r6] +movuxm3, [r4 + r7] +pavgw xm2, xm3 +movu[r0 + r8], xm2 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +lea r4, [r4 + 4 * r5] +%endmacro + +;--- +;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) +;--- +%if ARCH_X86_64 +INIT_YMM avx2 +cglobal pixel_avg_8x4, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +pixel_avg_W8 +RET + +cglobal pixel_avg_8x8, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +mov r9d, 2 +.loop +pixel_avg_W8 +dec r9d +jnz .loop +RET + +cglobal pixel_avg_8x16, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +mov r9d, 4 +.loop +pixel_avg_W8 +dec r9d +jnz .loop +RET + +cglobal pixel_avg_8x32, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +mov r9d, 8 +.loop +pixel_avg_W8 +dec r9d +jnz .loop +RET +%endif + %macro pixel_avg_H4 0 movum0, [r2] movum1, [r4] ___ x265-devel mailing list x265-devel@videolan.org
[x265] Fwd: [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp
-- Forwarded message -- From: raj...@multicorewareinc.com Date: Fri, Jun 26, 2015 at 3:14 PM Subject: [x265] [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp To: x265-devel@videolan.org # HG changeset patch # User Rajesh Paulrajraj...@multicorewareinc.com # Date 1435311076 -19800 # Fri Jun 26 15:01:16 2015 +0530 # Node ID 956401f1a679f1e71181b704d64e4acdb6f1a93f # Parent d64227e54233d1646c55bcb4b0b831e5340009ed asm: pixelavg_pp[8xN] avx2 code for 10bpp avx2: avg_pp[ 8x4] 4.39x145.09 636.75 avg_pp[ 8x8] 5.33x215.27 1146.55 avg_pp[ 8x16] 6.50x336.88 2190.68 avg_pp[ 8x32] 7.71x579.86 4470.84 sse2: avg_pp[ 8x4] 2.31x287.63 663.94 avg_pp[ 8x8] 3.26x370.21 1205.26 avg_pp[ 8x16] 3.99x581.63 2323.25 avg_pp[ 8x32] 4.78x995.79 4755.58 Basically, our macro pixel_avg_8xN is just SSE code (a simple syntax conversion for AVX2, not using the 256-bit capability), so fundamentally there should be no major improvement in speed. But the improvements (287.63c -> 145.09c, 370.21c -> 215.27c, etc.) are quite good. Does this mean the SSE2 code is not well optimized? Can you revisit the SSE code using this algorithm? 
diff -r d64227e54233 -r 956401f1a679 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jun 25 16:25:51 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jun 26 15:01:16 2015 +0530 @@ -1362,6 +1362,10 @@ p.cu[BLOCK_32x32].intra_pred[33]= PFX(intra_pred_ang32_33_avx2); p.cu[BLOCK_32x32].intra_pred[34]= PFX(intra_pred_ang32_2_avx2); +p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx2); +p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx2); +p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx2); +p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx2); p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2); p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2); p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2); diff -r d64227e54233 -r 956401f1a679 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmThu Jun 25 16:25:51 2015 +0530 +++ b/source/common/x86/mc-a.asmFri Jun 26 15:01:16 2015 +0530 @@ -4490,6 +4490,88 @@ RET %endif +%macro pixel_avg_W8 0 +movuxm0, [r2] +movuxm1, [r4] +pavgw xm0, xm1 +movu[r0], xm0 +movuxm2, [r2 + r3] +movuxm3, [r4 + r5] +pavgw xm2, xm3 +movu[r0 + r1], xm2 + +movuxm0, [r2 + r3 * 2] +movuxm1, [r4 + r5 * 2] +pavgw xm0, xm1 +movu[r0 + r1 * 2], xm0 +movuxm2, [r2 + r6] +movuxm3, [r4 + r7] +pavgw xm2, xm3 +movu[r0 + r8], xm2 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +lea r4, [r4 + 4 * r5] +%endmacro + +;--- +;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) +;--- +%if ARCH_X86_64 +INIT_YMM avx2 +cglobal pixel_avg_8x4, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +pixel_avg_W8 +RET + +cglobal pixel_avg_8x8, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +mov r9d, 2 +.loop +pixel_avg_W8 +dec r9d +jnz .loop +RET + +cglobal pixel_avg_8x16, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 
3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +mov r9d, 4 +.loop +pixel_avg_W8 +dec r9d +jnz .loop +RET + +cglobal pixel_avg_8x32, 6,10,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] +mov r9d, 8 +.loop +pixel_avg_W8 +dec r9d +jnz .loop +RET +%endif + %macro pixel_avg_H4 0 movum0, [r2] movum1, [r4] ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] param: added x265_atof as internal encoder function, moved to namespace X265_NS
# HG changeset patch # User Praveen Tiwari prav...@multicorewareinc.com # Date 1434720481 -19800 # Fri Jun 19 18:58:01 2015 +0530 # Node ID f53934a895e1ffc04edeae11183ad3556c09467b # Parent 44b6b2df7016f0129e66d91e9aab03261d02758a param: added x265_atof as internal encoder function, moved to namespace X265_NS diff -r 44b6b2df7016 -r f53934a895e1 source/common/param.cpp --- a/source/common/param.cpp Fri Jun 19 16:43:29 2015 +0530 +++ b/source/common/param.cpp Fri Jun 19 18:58:01 2015 +0530 @@ -471,16 +471,6 @@ return 0; } -static double x265_atof(const char* str, bool bError) -{ -char *end; -double v = strtod(str, end); - -if (end == str || *end != '\0') -bError = true; -return v; -} - static int parseName(const char* arg, const char* const* names, bool bError) { for (int i = 0; names[i]; i++) @@ -890,6 +880,16 @@ return v; } +double x265_atof(const char* str, bool bError) +{ +char *end; +double v = strtod(str, end); + +if (end == str || *end != '\0') +bError = true; +return v; +} + /* cpu name can be: * auto || true - x265::cpu_detect() * false || no - disabled diff -r 44b6b2df7016 -r f53934a895e1 source/common/param.h --- a/source/common/param.h Fri Jun 19 16:43:29 2015 +0530 +++ b/source/common/param.h Fri Jun 19 18:58:01 2015 +0530 @@ -2,6 +2,7 @@ * Copyright (C) 2013 x265 project * * Authors: Deepthi Nandakumar deep...@multicorewareinc.com + * Praveen Kumar Tiwari prav...@multicorewareinc.com * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -33,6 +34,7 @@ void x265_param_apply_fastfirstpass(x265_param *p); char* x265_param2string(x265_param *param); int x265_atoi(const char *str, bool bError); +double x265_atof(const char *str, bool bError); int parseCpuName(const char *value, bool bError); void setParamAspectRatio(x265_param *p, int width, int height); void getParamAspectRatio(x265_param *p, int width, int height); # HG changeset patch # User Praveen Tiwari 
prav...@multicorewareinc.com # Date 1434720481 -19800 # Fri Jun 19 18:58:01 2015 +0530 # Node ID f53934a895e1ffc04edeae11183ad3556c09467b # Parent 44b6b2df7016f0129e66d91e9aab03261d02758a param: added x265_atof as internal encoder function, moved to namespace X265_NS diff -r 44b6b2df7016 -r f53934a895e1 source/common/param.cpp --- a/source/common/param.cpp Fri Jun 19 16:43:29 2015 +0530 +++ b/source/common/param.cpp Fri Jun 19 18:58:01 2015 +0530 @@ -471,16 +471,6 @@ return 0; } -static double x265_atof(const char* str, bool bError) -{ -char *end; -double v = strtod(str, end); - -if (end == str || *end != '\0') -bError = true; -return v; -} - static int parseName(const char* arg, const char* const* names, bool bError) { for (int i = 0; names[i]; i++) @@ -890,6 +880,16 @@ return v; } +double x265_atof(const char* str, bool bError) +{ +char *end; +double v = strtod(str, end); + +if (end == str || *end != '\0') +bError = true; +return v; +} + /* cpu name can be: * auto || true - x265::cpu_detect() * false || no - disabled diff -r 44b6b2df7016 -r f53934a895e1 source/common/param.h --- a/source/common/param.h Fri Jun 19 16:43:29 2015 +0530 +++ b/source/common/param.h Fri Jun 19 18:58:01 2015 +0530 @@ -2,6 +2,7 @@ * Copyright (C) 2013 x265 project * * Authors: Deepthi Nandakumar deep...@multicorewareinc.com + * Praveen Kumar Tiwari prav...@multicorewareinc.com * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -33,6 +34,7 @@ void x265_param_apply_fastfirstpass(x265_param *p); char* x265_param2string(x265_param *param); int x265_atoi(const char *str, bool bError); +double x265_atof(const char *str, bool bError); int parseCpuName(const char *value, bool bError); void setParamAspectRatio(x265_param *p, int width, int height); void getParamAspectRatio(x265_param *p, int width, int height); ___ x265-devel mailing list x265-devel@videolan.org 
https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: intra_pred_ang32_18 improved by ~45% over SSE4
# HG changeset patch # User Praveen Tiwari prav...@multicorewareinc.com # Date 1428992352 -19800 # Tue Apr 14 11:49:12 2015 +0530 # Node ID 8c31f8daf9a2bbb3408178685eee97d84ca045ff # Parent 9a0818c97dc72b7974889fd34de073cdb4fde771 asm: intra_pred_ang32_18 improved by ~45% over SSE4 AVX2: intra_ang_32x32[18] 33.10x 354.58 11737.10 SSE4: intra_ang_32x32[18] 17.51x 650.80 11396.64 diff -r 9a0818c97dc7 -r 8c31f8daf9a2 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Apr 14 13:41:40 2015 +0800 +++ b/source/common/x86/asm-primitives.cpp Tue Apr 14 11:49:12 2015 +0530 @@ -1821,6 +1821,7 @@ p.cu[BLOCK_32x32].intra_pred[23] = x265_intra_pred_ang32_23_avx2; p.cu[BLOCK_32x32].intra_pred[22] = x265_intra_pred_ang32_22_avx2; p.cu[BLOCK_32x32].intra_pred[21] = x265_intra_pred_ang32_21_avx2; +p.cu[BLOCK_32x32].intra_pred[18] = x265_intra_pred_ang32_18_avx2; // copy_sp primitives p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2; diff -r 9a0818c97dc7 -r 8c31f8daf9a2 source/common/x86/intrapred.h --- a/source/common/x86/intrapred.h Tue Apr 14 13:41:40 2015 +0800 +++ b/source/common/x86/intrapred.h Tue Apr 14 11:49:12 2015 +0530 @@ -277,6 +277,7 @@ void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang32_18_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_all_angs_pred_4x4_sse2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); diff -r 9a0818c97dc7 -r 8c31f8daf9a2 source/common/x86/intrapred8.asm --- 
a/source/common/x86/intrapred8.asm Tue Apr 14 13:41:40 2015 +0800 +++ b/source/common/x86/intrapred8.asm Tue Apr 14 11:49:12 2015 +0530 @@ -28,6 +28,7 @@ SECTION_RODATA 32 intra_pred_shuff_0_8:times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +intra_pred_shuff_15_0: times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 pb_0_8times 8 db 0, 8 pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8 @@ -10366,6 +10367,99 @@ RET +INIT_YMM avx2 +cglobal intra_pred_ang32_18, 4, 4, 3 +movu m0, [r2] +movu xm1, [r2 + 1 + 64] +pshufb xm1, [intra_pred_shuff_15_0] +mova xm2, xm0 +vinserti128m1, m1, xm2, 1 + +lear3, [r1 * 3] + +movu [r0], m0 +palignrm2, m0, m1, 15 +movu [r0 + r1], m2 +palignrm2, m0, m1, 14 +movu [r0 + r1 * 2], m2 +palignrm2, m0, m1, 13 +movu [r0 + r3], m2 + +lear0, [r0 + r1 * 4] +palignrm2, m0, m1, 12 +movu [r0], m2 +palignrm2, m0, m1, 11 +movu [r0 + r1], m2 +palignrm2, m0, m1, 10 +movu [r0 + r1 * 2], m2 +palignrm2, m0, m1, 9 +movu [r0 + r3], m2 + +lear0, [r0 + r1 * 4] +palignrm2, m0, m1, 8 +movu [r0], m2 +palignrm2, m0, m1, 7 +movu [r0 + r1], m2 +palignrm2, m0, m1, 6 +movu [r0 + r1 * 2], m2 +palignrm2, m0, m1, 5 +movu [r0 + r3], m2 + +lear0, [r0 + r1 * 4] +palignrm2, m0, m1, 4 +movu [r0], m2 +palignrm2, m0, m1, 3 +movu [r0 + r1], m2 +palignrm2, m0, m1, 2 +movu [r0 + r1 * 2], m2 +palignrm2, m0, m1, 1 +movu [r0 + r3], m2 + +lear0, [r0 + r1 * 4] +movu [r0], m1 + +movu xm0, [r2 + 64 + 17] +pshufb xm0, [intra_pred_shuff_15_0] +vinserti128m0, m0, xm1, 1 + +palignrm2, m1, m0, 15 +movu [r0 + r1], m2 +palignrm2, m1, m0, 14 +movu [r0 + r1 * 2], m2 +palignrm2, m1, m0, 13 +movu [r0 + r3], m2 + +lear0, [r0 + r1 * 4] +palignrm2, m1, m0, 12 +movu [r0], m2 +palignrm2, m1, m0, 11 +movu [r0 + r1], m2 +palignrm2, m1, m0, 10 +movu [r0 + r1 * 2], m2 +palignrm2, m1, m0, 9 +movu [r0 + r3], m2 + +lear0, [r0 + r1 * 4] +palignrm2, m1, m0, 8 +movu [r0], m2 +palignrm2, m1, m0, 7 +movu [r0 + r1], m2 +palignrm2, m1, m0,6 +movu [r0
[x265] [PATCH] asm: intra_pred_ang32_18 improved by ~44% over SSE4
# HG changeset patch # User Praveen Tiwari prav...@multicorewareinc.com # Date 1428917176 -19800 # Mon Apr 13 14:56:16 2015 +0530 # Node ID f4310212b0745d51d0cc5ed8b2a3098e1bcea016 # Parent 4cccf22b00ee188a72c8dc3896d7dc1613d855ad asm: intra_pred_ang32_18 improved by ~44% over SSE4 AVX2: intra_ang_32x32[18] 31.25x 363.88 11371.31 SSE4: intra_ang_32x32[18] 18.11x 648.61 11743.52 diff -r 4cccf22b00ee -r f4310212b074 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Apr 10 18:15:38 2015 -0500 +++ b/source/common/x86/asm-primitives.cpp Mon Apr 13 14:56:16 2015 +0530 @@ -1819,6 +1819,7 @@ p.cu[BLOCK_32x32].intra_pred[23] = x265_intra_pred_ang32_23_avx2; p.cu[BLOCK_32x32].intra_pred[22] = x265_intra_pred_ang32_22_avx2; p.cu[BLOCK_32x32].intra_pred[21] = x265_intra_pred_ang32_21_avx2; +p.cu[BLOCK_32x32].intra_pred[18] = x265_intra_pred_ang32_18_avx2; // copy_sp primitives p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2; diff -r 4cccf22b00ee -r f4310212b074 source/common/x86/intrapred.h --- a/source/common/x86/intrapred.h Fri Apr 10 18:15:38 2015 -0500 +++ b/source/common/x86/intrapred.h Mon Apr 13 14:56:16 2015 +0530 @@ -277,6 +277,7 @@ void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang32_18_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); diff -r 4cccf22b00ee -r f4310212b074 source/common/x86/intrapred8.asm --- 
a/source/common/x86/intrapred8.asm Fri Apr 10 18:15:38 2015 -0500 +++ b/source/common/x86/intrapred8.asm Mon Apr 13 14:56:16 2015 +0530 @@ -28,6 +28,7 @@ SECTION_RODATA 32 intra_pred_shuff_0_8:times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +intra_pred_shuff_15_0: times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 pb_0_8times 8 db 0, 8 pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8 @@ -10366,6 +10367,101 @@ RET +INIT_YMM avx2 +cglobal intra_pred_ang32_18, 4, 6, 3 +movu m0, [r2] +movu xm1, [r2 + 1 + 64] +pshufb xm1, [intra_pred_shuff_15_0] +movu xm2, xm0 +vinserti128m1, m1, xm2, 1 + +lear4, [r1 * 2] +lear3, [r1 * 3] +lear5, [r1 * 4] + +movu [r0], m0 +palignrm2, m0, m1, 15 +movu [r0 + r1], m2 +palignrm2, m0, m1, 14 +movu [r0 + r4], m2 +palignrm2, m0, m1, 13 +movu [r0 + r3], m2 + +lear0, [r0 + r5] +palignrm2, m0, m1, 12 +movu [r0], m2 +palignrm2, m0, m1, 11 +movu [r0 + r1], m2 +palignrm2, m0, m1, 10 +movu [r0 + r4], m2 +palignrm2, m0, m1, 9 +movu [r0 + r3], m2 + +lear0, [r0 + r5] +palignrm2, m0, m1, 8 +movu [r0], m2 +palignrm2, m0, m1, 7 +movu [r0 + r1], m2 +palignrm2, m0, m1, 6 +movu [r0 + r4], m2 +palignrm2, m0, m1, 5 +movu [r0 + r3], m2 + +lear0, [r0 + r5] +palignrm2, m0, m1, 4 +movu [r0], m2 +palignrm2, m0, m1, 3 +movu [r0 + r1], m2 +palignrm2, m0, m1, 2 +movu [r0 + r4], m2 +palignrm2, m0, m1, 1 +movu [r0 + r3], m2 + +lear0, [r0 + r5] +movu [r0], m1 + +movu xm0, [r2 + 64 + 17] +pshufb xm0, [intra_pred_shuff_15_0] +vinserti128m0, m0, xm1, 1 + +palignrm2, m1, m0, 15 +movu [r0 + r1], m2 +palignrm2, m1, m0, 14 +movu [r0 + r4], m2 +palignrm2, m1, m0, 13 +movu [r0 + r3], m2 + +lear0, [r0 + r5] +palignrm2, m1, m0, 12 +movu [r0], m2 +palignrm2, m1, m0, 11 +movu [r0 + r1], m2 +palignrm2, m1, m0, 10 +movu [r0 + r4], m2 +palignrm2, m1, m0, 9 +movu [r0 + r3], m2 + +lear0, [r0 + r5] +palignrm2, m1, m0, 8 +movu [r0], m2 +palignrm2, m1, m0, 7 +movu [r0 + r1], m2 +palignrm2, m1, m0,6
[x265] [PATCH 1 of 9] asm: intra_pred_ang16_12 improved by ~20% over SSE4
# HG changeset patch # User Praveen Tiwari prav...@multicorewareinc.com # Date 1428557307 -19800 # Thu Apr 09 10:58:27 2015 +0530 # Node ID 561f063f3ef9c65397c3f43ca84bcd51185f6ad4 # Parent 7f2d92923de47e7e40f04ff27ed70074b0dca9d3 asm: intra_pred_ang16_12 improved by ~20% over SSE4 AVX2: intra_ang_16x16[12] 15.16x 777.51 11785.44 SSE4: intra_ang_16x16[12] 11.51x 976.41 11238.16 diff -r 7f2d92923de4 -r 561f063f3ef9 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Apr 08 14:51:00 2015 -0500 +++ b/source/common/x86/asm-primitives.cpp Thu Apr 09 10:58:27 2015 +0530 @@ -1771,6 +1771,7 @@ p.cu[BLOCK_16x16].intra_pred[7] = x265_intra_pred_ang16_7_avx2; p.cu[BLOCK_16x16].intra_pred[8] = x265_intra_pred_ang16_8_avx2; p.cu[BLOCK_16x16].intra_pred[9] = x265_intra_pred_ang16_9_avx2; +p.cu[BLOCK_16x16].intra_pred[12] = x265_intra_pred_ang16_12_avx2; p.cu[BLOCK_16x16].intra_pred[11] = x265_intra_pred_ang16_11_avx2; p.cu[BLOCK_16x16].intra_pred[25] = x265_intra_pred_ang16_25_avx2; p.cu[BLOCK_16x16].intra_pred[28] = x265_intra_pred_ang16_28_avx2; diff -r 7f2d92923de4 -r 561f063f3ef9 source/common/x86/intrapred.h --- a/source/common/x86/intrapred.h Wed Apr 08 14:51:00 2015 -0500 +++ b/source/common/x86/intrapred.h Thu Apr 09 10:58:27 2015 +0530 @@ -240,6 +240,7 @@ void x265_intra_pred_ang16_7_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang16_8_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang16_9_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang16_12_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang16_11_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang16_25_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void 
x265_intra_pred_ang16_28_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); diff -r 7f2d92923de4 -r 561f063f3ef9 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Wed Apr 08 14:51:00 2015 -0500 +++ b/source/common/x86/intrapred8.asm Thu Apr 09 10:58:27 2015 +0530 @@ -133,6 +133,17 @@ db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 + +ALIGN 32 +c_ang16_mode_12: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 + db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 + db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9 + db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 + db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 + db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 + db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21 + db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + ALIGN 32 c_ang16_mode_28: db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 @@ -12066,6 +12077,65 @@ packuswb %1, %2 %endmacro + +INIT_YMM avx2 +cglobal intra_pred_ang16_12, 3, 5, 13 +mova m11, 
[pw_1024] +lea r5, [intra_pred_shuff_0_8] + +movu xm9, [r2 + 32] +pinsrbxm9, [r2], 0 +pslldqxm7, xm9, 1 +pinsrbxm7, [r2 + 6], 0 +vinserti128 m9, m9, xm7, 1 +pshufbm9, [r5] + +movu xm12, [r2 + 6 + 32] + +psrldqxm10, xm12, 2 +psrldqxm8, xm12, 1 +vinserti128 m10, m10, xm8, 1 +pshufbm10, [r5] + +lea r3, [3 * r1] +lea r4, [c_ang16_mode_12] + +INTRA_PRED_ANG16_CAL_ROW m0, m1
[x265] [PATCH 6 of 9] asm: intra_pred_ang8_15 improved by ~5% over SSE4
# HG changeset patch # User Praveen Tiwari prav...@multicorewareinc.com # Date 1428579494 -19800 # Thu Apr 09 17:08:14 2015 +0530 # Node ID 31ce12d63d6560df4ce29bdb948525cf73f057f4 # Parent 48278b974eec1dfc8da1643355a701ea073fec36 asm: intra_pred_ang8_15 improved by ~5% over SSE4 AVX2: intra_ang_8x8[15] 9.57x342.52 3279.56 SSE4: intra_ang_8x8[15] 8.95x360.01 3223.45 diff -r 48278b974eec -r 31ce12d63d65 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Apr 09 16:30:54 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Apr 09 17:08:14 2015 +0530 @@ -1766,6 +1766,7 @@ p.cu[BLOCK_8x8].intra_pred[11] = x265_intra_pred_ang8_11_avx2; p.cu[BLOCK_8x8].intra_pred[13] = x265_intra_pred_ang8_13_avx2; p.cu[BLOCK_8x8].intra_pred[14] = x265_intra_pred_ang8_14_avx2; +p.cu[BLOCK_8x8].intra_pred[15] = x265_intra_pred_ang8_15_avx2; p.cu[BLOCK_16x16].intra_pred[3] = x265_intra_pred_ang16_3_avx2; p.cu[BLOCK_16x16].intra_pred[4] = x265_intra_pred_ang16_4_avx2; p.cu[BLOCK_16x16].intra_pred[5] = x265_intra_pred_ang16_5_avx2; diff -r 48278b974eec -r 31ce12d63d65 source/common/x86/intrapred.h --- a/source/common/x86/intrapred.h Thu Apr 09 16:30:54 2015 +0530 +++ b/source/common/x86/intrapred.h Thu Apr 09 17:08:14 2015 +0530 @@ -235,6 +235,7 @@ void x265_intra_pred_ang8_11_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang8_13_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang8_14_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_15_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang16_3_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang16_4_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void 
x265_intra_pred_ang16_5_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); diff -r 48278b974eec -r 31ce12d63d65 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Thu Apr 09 16:30:54 2015 +0530 +++ b/source/common/x86/intrapred8.asm Thu Apr 09 17:08:14 2015 +0530 @@ -684,6 +684,12 @@ db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 +ALIGN 32 +c_ang8_mode_15: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 + db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 + db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 + const ang_table %assign x 0 %rep 32 @@ -11876,6 +11882,68 @@ movhps[r0 + r3], xm2 RET +INIT_YMM avx2 +cglobal intra_pred_ang8_15, 3, 6, 6 +mova m3, [pw_1024] +movu xm5, [r2 + 16] +pinsrbxm5, [r2], 0 +lea r5, [intra_pred_shuff_0_8] +mova xm0, xm5 +pslldqxm5, 1 +pinsrbxm5, [r2 + 2], 0 +vinserti128 m0, m0, xm5, 1 +pshufbm0, [r5] + +lea r4, [c_ang8_mode_15] +pmaddubsw m1, m0, [r4] +pmulhrsw m1, m3 +mova xm0, xm5 +pslldqxm5, 1 +pinsrbxm5, [r2 + 4], 0 +vinserti128 m0, m0, xm5, 1 +pshufbm0, [r5] +pmaddubsw m2, m0, [r4 + mmsize] +pmulhrsw m2, m3 +mova xm0, xm5 +pslldqxm5, 1 +pinsrbxm5, [r2 + 6], 0 +vinserti128 m0, m0, xm5, 1 +pshufbm0, [r5] +pmaddubsw m4, m0, [r4 + 2 * mmsize] +pmulhrsw m4, m3 +mova xm0, xm5 +pslldqxm5, 1 +pinsrbxm5, [r2 + 8], 0 +vinserti128 m0, m0, xm5, 1 +pshufbm0, [r5] +pmaddubsw m0, [r4 + 3 * mmsize] +pmulhrsw m0, m3 +packuswb m1, m2 +packuswb m4, m0 + +vperm2i128m2, m1, m4, 0010b 
+vperm2i128m1, m1, m4, 00110001b +punpcklbw m4, m2, m1 +punpckhbw m2, m1 +punpcklwd m1, m4, m2 +punpckhwd m4, m2 +mova m0
[x265] [PATCH 9 of 9] asm: intra_pred_ang8_21 improved by ~5% over SSE4
# HG changeset patch # User Praveen Tiwari prav...@multicorewareinc.com # Date 1428583713 -19800 # Thu Apr 09 18:18:33 2015 +0530 # Node ID 759643fade74e82075a9a6491c41d9f3563df7e2 # Parent 72c75090a5dcbe002bd28d2190703b6d74ac7c81 asm: intra_pred_ang8_21 improved by ~5% over SSE4 AVX2: intra_ang_8x8[21] 8.55x239.75 2050.08 SSE4: intra_ang_8x8[21] 8.03x252.60 2027.91 diff -r 72c75090a5dc -r 759643fade74 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Apr 09 18:07:22 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Apr 09 18:18:33 2015 +0530 @@ -1765,6 +1765,7 @@ p.cu[BLOCK_8x8].intra_pred[24] = x265_intra_pred_ang8_24_avx2; p.cu[BLOCK_8x8].intra_pred[11] = x265_intra_pred_ang8_11_avx2; p.cu[BLOCK_8x8].intra_pred[13] = x265_intra_pred_ang8_13_avx2; +p.cu[BLOCK_8x8].intra_pred[21] = x265_intra_pred_ang8_21_avx2; p.cu[BLOCK_8x8].intra_pred[22] = x265_intra_pred_ang8_22_avx2; p.cu[BLOCK_8x8].intra_pred[23] = x265_intra_pred_ang8_23_avx2; p.cu[BLOCK_8x8].intra_pred[14] = x265_intra_pred_ang8_14_avx2; diff -r 72c75090a5dc -r 759643fade74 source/common/x86/intrapred.h --- a/source/common/x86/intrapred.h Thu Apr 09 18:07:22 2015 +0530 +++ b/source/common/x86/intrapred.h Thu Apr 09 18:18:33 2015 +0530 @@ -236,6 +236,7 @@ void x265_intra_pred_ang8_13_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang8_14_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang8_15_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang8_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang8_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void 
x265_intra_pred_ang16_3_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); diff -r 72c75090a5dc -r 759643fade74 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Thu Apr 09 18:07:22 2015 +0530 +++ b/source/common/x86/intrapred8.asm Thu Apr 09 18:18:33 2015 +0530 @@ -11945,6 +11945,57 @@ movhps[r0 + r3], xm2 RET +INIT_YMM avx2 +cglobal intra_pred_ang8_21, 3, 6, 6 +mova m3, [pw_1024] +movu xm5, [r2] +lea r5, [intra_pred_shuff_0_8] +mova xm0, xm5 +pslldqxm5, 1 +pinsrbxm5, [r2 + 2 + 16], 0 +vinserti128 m0, m0, xm5, 1 +pshufbm0, [r5] + +lea r4, [c_ang8_mode_15] +pmaddubsw m1, m0, [r4] +pmulhrsw m1, m3 +mova xm0, xm5 +pslldqxm5, 1 +pinsrbxm5, [r2 + 4 + 16], 0 +vinserti128 m0, m0, xm5, 1 +pshufbm0, [r5] +pmaddubsw m2, m0, [r4 + mmsize] +pmulhrsw m2, m3 +mova xm0, xm5 +pslldqxm5, 1 +pinsrbxm5, [r2 + 6 + 16], 0 +vinserti128 m0, m0, xm5, 1 +pshufbm0, [r5] +pmaddubsw m4, m0, [r4 + 2 * mmsize] +pmulhrsw m4, m3 +mova xm0, xm5 +pslldqxm5, 1 +pinsrbxm5, [r2 + 8 + 16], 0 +vinserti128 m0, m0, xm5, 1 +pshufbm0, [r5] +pmaddubsw m0, [r4 + 3 * mmsize] +pmulhrsw m0, m3 +packuswb m1, m2 +packuswb m4, m0 + +lea r3, [3 * r1] +movq [r0], xm1 +vextracti128 xm2, m1, 1 +movq [r0 + r1], xm2 +movhps[r0 + 2 * r1], xm1 +movhps[r0 + r3], xm2 +lea r0, [r0 + 4 * r1] +movq [r0], xm4 +vextracti128 xm2, m4, 1 +movq [r0 + r1], xm2 +movhps[r0 + 2 * r1], xm4 +movhps[r0 + r3], xm2 +RET INIT_YMM avx2 cglobal intra_pred_ang8_22, 3, 6, 6 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 4 of 9] asm: intra_pred_ang8_13 improved by ~16% over SSE4
# HG changeset patch # User Praveen Tiwari prav...@multicorewareinc.com # Date 1428575027 -19800 # Thu Apr 09 15:53:47 2015 +0530 # Node ID 79c8c583603a8dda7fe22973b55b18d9ff08cc64 # Parent 6f9c3e9aec5218f89389c6f1f363b86181fc20cf asm: intra_pred_ang8_13 improved by ~16% over SSE4 AVX2: intra_ang_8x8[13] 10.68x 297.95 3183.33 SSE4: intra_ang_8x8[13] 9.16x 352.32 3225.62 diff -r 6f9c3e9aec52 -r 79c8c583603a source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Apr 09 13:28:56 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Apr 09 15:53:47 2015 +0530 @@ -1764,6 +1764,7 @@ p.cu[BLOCK_8x8].intra_pred[12] = x265_intra_pred_ang8_12_avx2; p.cu[BLOCK_8x8].intra_pred[24] = x265_intra_pred_ang8_24_avx2; p.cu[BLOCK_8x8].intra_pred[11] = x265_intra_pred_ang8_11_avx2; +p.cu[BLOCK_8x8].intra_pred[13] = x265_intra_pred_ang8_13_avx2; p.cu[BLOCK_16x16].intra_pred[3] = x265_intra_pred_ang16_3_avx2; p.cu[BLOCK_16x16].intra_pred[4] = x265_intra_pred_ang16_4_avx2; p.cu[BLOCK_16x16].intra_pred[5] = x265_intra_pred_ang16_5_avx2; diff -r 6f9c3e9aec52 -r 79c8c583603a source/common/x86/intrapred.h --- a/source/common/x86/intrapred.h Thu Apr 09 13:28:56 2015 +0530 +++ b/source/common/x86/intrapred.h Thu Apr 09 15:53:47 2015 +0530 @@ -233,6 +233,7 @@ void x265_intra_pred_ang8_12_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang8_24_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang8_11_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_13_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang16_3_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang16_4_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void 
x265_intra_pred_ang16_5_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); diff -r 6f9c3e9aec52 -r 79c8c583603a source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Thu Apr 09 13:28:56 2015 +0530 +++ b/source/common/x86/intrapred8.asm Thu Apr 09 15:53:47 2015 +0530 @@ -672,6 +672,11 @@ pw_planar32_L: dw 31, 30, 29, 28, 27, 26, 25, 24 pw_planar32_H: dw 23, 22, 21, 20, 19, 18, 17, 16 +ALIGN 32 +c_ang8_mode_13: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 + db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 + db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 const ang_table %assign x 0 @@ -11866,6 +11871,61 @@ RET + +INIT_YMM avx2 +cglobal intra_pred_ang8_13, 3, 6, 6 +mova m3, [pw_1024] +movu xm5, [r2 + 16] +pinsrb xm5, [r2], 0 +lea r5, [intra_pred_shuff_0_8] +vinserti128 m0, m5, xm5, 1 +pshufb m0, [r5] + +lea r4, [c_ang8_mode_13] +pmaddubsw m1, m0, [r4] +pmulhrsw m1, m3 +pslldq xm5, 1 +pinsrb xm5, [r2 + 4], 0 +pshufb xm4, xm5, [r5] +vinserti128 m0, m0, xm4, 1 +pmaddubsw m2, m0, [r4 + mmsize] +pmulhrsw m2, m3 +vinserti128 m0, m0, xm4, 0 +pmaddubsw m4, m0, [r4 + 2 * mmsize] +pmulhrsw m4, m3 +pslldq xm5, 1 +pinsrb xm5, [r2 + 7], 0 +pshufb xm5, [r5] +vinserti128 m0, m0, xm5, 1 +pmaddubsw m0, [r4 + 3 * mmsize] +pmulhrsw m0, m3 +packuswb m1, m2 +packuswb m4, m0 + +vperm2i128 m2, m1, m4, 0010b +vperm2i128 m1, m1, m4, 00110001b +punpcklbw m4, m2, m1 +punpckhbw m2, m1 +punpcklwd m1, m4, m2 +punpckhwd m4, m2 +mova m0, [trans8_shuf] +vpermd m1, m0, m1 +vpermd m4, m0, m4 + +lea r3, [3 * r1] +movq [r0], xm1 +movhps [r0 + r1], xm1 +vextracti128 xm2, m1, 1 +movq [r0 + 2 * r1], xm2 +movhps [r0 + r3], xm2 +lea r0, [r0 
+ 4 * r1] +movq [r0], xm4 +movhps [r0 + r1], xm4 +vextracti128 xm2, m4, 1 +movq [r0 + 2 * r1], xm2 +movhps
[x265] [PATCH 7 of 9] asm: intra_pred_ang8_23 improved by ~18% over SSE4
# HG changeset patch # User Praveen Tiwari prav...@multicorewareinc.com # Date 1428581726 -19800 # Thu Apr 09 17:45:26 2015 +0530 # Node ID 5db8882a1ef6c5f27bc59869692791290945af2a # Parent 31ce12d63d6560df4ce29bdb948525cf73f057f4 asm: intra_pred_ang8_23 improved by ~18% over SSE4 AVX2: intra_ang_8x8[23] 9.75x 205.43 2002.05 SSE4: intra_ang_8x8[23] 8.12x 251.42 2041.61 diff -r 31ce12d63d65 -r 5db8882a1ef6 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Apr 09 17:08:14 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Apr 09 17:45:26 2015 +0530 @@ -1765,6 +1765,7 @@ p.cu[BLOCK_8x8].intra_pred[24] = x265_intra_pred_ang8_24_avx2; p.cu[BLOCK_8x8].intra_pred[11] = x265_intra_pred_ang8_11_avx2; p.cu[BLOCK_8x8].intra_pred[13] = x265_intra_pred_ang8_13_avx2; +p.cu[BLOCK_8x8].intra_pred[23] = x265_intra_pred_ang8_23_avx2; p.cu[BLOCK_8x8].intra_pred[14] = x265_intra_pred_ang8_14_avx2; p.cu[BLOCK_8x8].intra_pred[15] = x265_intra_pred_ang8_15_avx2; p.cu[BLOCK_16x16].intra_pred[3] = x265_intra_pred_ang16_3_avx2; diff -r 31ce12d63d65 -r 5db8882a1ef6 source/common/x86/intrapred.h --- a/source/common/x86/intrapred.h Thu Apr 09 17:08:14 2015 +0530 +++ b/source/common/x86/intrapred.h Thu Apr 09 17:45:26 2015 +0530 @@ -236,6 +236,7 @@ void x265_intra_pred_ang8_13_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang8_14_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang8_15_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang16_3_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_intra_pred_ang16_4_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void 
x265_intra_pred_ang16_5_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); diff -r 31ce12d63d65 -r 5db8882a1ef6 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Thu Apr 09 17:08:14 2015 +0530 +++ b/source/common/x86/intrapred8.asm Thu Apr 09 17:45:26 2015 +0530 @@ -12056,6 +12056,51 @@ movhps [r0 + r3], xm2 RET + +INIT_YMM avx2 +cglobal intra_pred_ang8_23, 3, 6, 6 +mova m3, [pw_1024] +movu xm5, [r2] +lea r5, [intra_pred_shuff_0_8] +vinserti128 m0, m5, xm5, 1 +pshufb m0, [r5] + +lea r4, [c_ang8_mode_13] +pmaddubsw m1, m0, [r4] +pmulhrsw m1, m3 +pslldq xm5, 1 +pinsrb xm5, [r2 + 4 + 16], 0 +pshufb xm4, xm5, [r5] +vinserti128 m0, m0, xm4, 1 +pmaddubsw m2, m0, [r4 + mmsize] +pmulhrsw m2, m3 +vinserti128 m0, m0, xm4, 0 +pmaddubsw m4, m0, [r4 + 2 * mmsize] +pmulhrsw m4, m3 +pslldq xm5, 1 +pinsrb xm5, [r2 + 7 + 16], 0 +pshufb xm5, [r5] +vinserti128 m0, m0, xm5, 1 +pmaddubsw m0, [r4 + 3 * mmsize] +pmulhrsw m0, m3 + +packuswb m1, m2 +packuswb m4, m0 + +lea r3, [3 * r1] +movq [r0], xm1 +vextracti128 xm2, m1, 1 +movq [r0 + r1], xm2 +movhps [r0 + 2 * r1], xm1 +movhps [r0 + r3], xm2 +lea r0, [r0 + 4 * r1] +movq [r0], xm4 +vextracti128 xm2, m4, 1 +movq [r0 + r1], xm2 +movhps [r0 + 2 * r1], xm4 +movhps [r0 + r3], xm2 +RET + INIT_YMM avx2 cglobal intra_pred_ang8_12, 3, 5, 5 mova m3, [pw_1024] ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel