# HG changeset patch
# User Jayashri Murugan <jayas...@multicorewareinc.com>
# Date 1507182370 -19800
# Thu Oct 05 11:16:10 2017 +0530
# Node ID d4ee703039c6cde39312a596cee019c346a8381b
# Parent 14c93ddbd598128b43a96ff21221e2dbb189d275
x86: Aligned routine encoder integration for blockfill_s primitive
diff -r 14c93ddbd598 -r d4ee703039c6 source/common/quant.cpp
--- a/source/common/quant.cpp Wed Oct 04 15:55:03 2017 +0530
+++ b/source/common/quant.cpp Thu Oct 05 11:16:10 2017 +0530
@@ -188,8 +188,9 @@
     m_nr = NULL;
 }
-bool Quant::init(double psyScale, const ScalingList& scalingList, Entropy& entropy)
+bool Quant::init(double psyScale, const ScalingList& scalingList, Entropy& entropy, int cpuid)
 {
+    m_cpuid = cpuid;
     m_entropyCoder = &entropy;
     m_psyRdoqScale = (int32_t)(psyScale * 256.0);
     X265_CHECK((psyScale * 256.0) < (double)MAX_INT, "psyScale value too large\n");
@@ -611,7 +612,10 @@
         const int add_2nd = 1 << (shift_2nd - 1);
         int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
-        primitives.cu[sizeIdx].blockfill_s(residual, resiStride, (int16_t)dc_val);
+        if ((resiStride % 64 == 0) && (m_cpuid & X265_CPU_AVX512))
+            primitives.cu[sizeIdx].blockfill_s_aligned(residual, resiStride, (int16_t)dc_val);
+        else
+            primitives.cu[sizeIdx].blockfill_s(residual, resiStride, (int16_t)dc_val);
         return;
     }
diff -r 14c93ddbd598 -r d4ee703039c6 source/common/quant.h
--- a/source/common/quant.h Wed Oct 04 15:55:03 2017 +0530
+++ b/source/common/quant.h Thu Oct 05 11:16:10 2017 +0530
@@ -93,6 +93,7 @@
 public:
+    int m_cpuid;
     NoiseReduction* m_nr;
     NoiseReduction* m_frameNr; // Array of NR structures, one for each frameEncoder
@@ -100,7 +101,7 @@
     ~Quant();
     /* one-time setup */
-    bool init(double psyScale, const ScalingList& scalingList, Entropy& entropy);
+    bool init(double psyScale, const ScalingList& scalingList, Entropy& entropy, int cpuid);
     bool allocNoiseReduction(const x265_param& param);
     /* CU setup */
diff -r 14c93ddbd598 -r d4ee703039c6 source/encoder/search.cpp
--- a/source/encoder/search.cpp Wed Oct 04 15:55:03 2017 +0530
+++ b/source/encoder/search.cpp Thu Oct 05 11:16:10 2017 +0530
@@ -81,7 +81,7 @@
     m_rdCost.setSsimRd(param.bSsimRd);
     m_me.init(param.internalCsp);
-    bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
+    bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder, param.cpuid);
     if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
         ok &= m_quant.allocNoiseReduction(param);
@@ -2914,7 +2914,10 @@
     }
     else
     {
-        primitives.cu[sizeIdx].blockfill_s(curResiY, strideResiY, 0);
+        if ((strideResiY % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+            primitives.cu[sizeIdx].blockfill_s_aligned(curResiY, strideResiY, 0);
+        else
+            primitives.cu[sizeIdx].blockfill_s(curResiY, strideResiY, 0);
         cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
     }
@@ -2947,7 +2950,10 @@
     }
     else
     {
-        primitives.cu[sizeIdxC].blockfill_s(curResiU, strideResiC, 0);
+        if ((strideResiC % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+            primitives.cu[sizeIdxC].blockfill_s_aligned(curResiU, strideResiC, 0);
+        else
+            primitives.cu[sizeIdxC].blockfill_s(curResiU, strideResiC, 0);
         cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
     }
@@ -2961,7 +2967,11 @@
     }
     else
     {
-        primitives.cu[sizeIdxC].blockfill_s(curResiV, strideResiC, 0);
+        if ((strideResiC % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+            primitives.cu[sizeIdxC].blockfill_s_aligned(curResiV, strideResiC, 0);
+        else
+            primitives.cu[sizeIdxC].blockfill_s(curResiV, strideResiC, 0);
+
         cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
     }
 }
@@ -3229,7 +3239,10 @@
 {
     cbfFlag[TEXT_LUMA][0] = 0;
     singleBits[TEXT_LUMA][0] = 0;
-    primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
+    if ((strideResiY % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+        primitives.cu[partSize].blockfill_s_aligned(curResiY, strideResiY, 0);
+    else
+        primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
 #if CHECKED_BUILD || _DEBUG
     uint32_t numCoeffY = 1 << (log2TrSize << 1);
     memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY);
@@ -3252,7 +3265,10 @@
 {
     if (checkTransformSkipY)
         minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
-    primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
+    if ((strideResiY % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+        primitives.cu[partSize].blockfill_s_aligned(curResiY, strideResiY, 0);
+    else
+        primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
     singleDist[TEXT_LUMA][0] = zeroDistY;
     singleBits[TEXT_LUMA][0] = 0;
     singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
@@ -3341,7 +3357,10 @@
 {
     cbfFlag[chromaId][tuIterator.section] = 0;
     singleBits[chromaId][tuIterator.section] = 0;
-    primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
+    if ((strideResiC % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+        primitives.cu[partSizeC].blockfill_s_aligned(curResiC, strideResiC, 0);
+    else
+        primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
 #if CHECKED_BUILD || _DEBUG
     uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
     memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
@@ -3364,7 +3383,10 @@
 {
     if (checkTransformSkipC)
         minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
-    primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
+    if ((strideResiC % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+        primitives.cu[partSizeC].blockfill_s_aligned(curResiC, strideResiC, 0);
+    else
+        primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
     singleBits[chromaId][tuIterator.section] = 0;
     singleDist[chromaId][tuIterator.section] = zeroDistC;
     singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel