# HG changeset patch
# User Jayashri Murugan <jayas...@multicorewareinc.com>
# Date 1507182997 -19800
# Thu Oct 05 11:26:37 2017 +0530
# Node ID 1748c9a5c9b16c380f926cd5d07a69c4f13a6fab
# Parent c497cbf5c2d53ea9c47f3929eaacbb36e703bdfa
x86: Aligned routine encoder integration for calcresidual primitive
diff -r c497cbf5c2d5 -r 1748c9a5c9b1 source/encoder/search.cpp --- a/source/encoder/search.cpp Wed Oct 04 16:33:33 2017 +0530 +++ b/source/encoder/search.cpp Thu Oct 05 11:26:37 2017 +0530 @@ -354,8 +354,10 @@ // store original entropy coding status if (bEnableRDOQ) m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); - - primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride); + if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512)) + primitives.cu[sizeIdx].calcresidual_aligned(fenc, pred, residual, stride); + else + primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride); uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false); if (numSig) @@ -561,7 +563,10 @@ pixel* tmpRecon = (useTSkip ? m_tsRecon : reconQt); uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride); - primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride); + if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512)) + primitives.cu[sizeIdx].calcresidual_aligned(fenc, pred, residual, stride); + else + primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride); uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip); if (numSig) @@ -714,7 +719,10 @@ coeff_t* coeffY = cu.m_trCoeff[0] + coeffOffsetY; uint32_t sizeIdx = log2TrSize - 2; - primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride); + if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512)) + primitives.cu[sizeIdx].calcresidual_aligned(fenc, pred, residual, stride); + else + primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride); PicYuv* reconPic = m_frame->m_reconPic; pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); @@ -893,7 +901,11 @@ predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC); cu.setTransformSkipPartRange(0, ttype, absPartIdxC, 
tuIterator.absPartIdxStep); - primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride); + if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512)) + primitives.cu[sizeIdxC].calcresidual_aligned(fenc, pred, residual, stride); + else + primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride); + uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false); if (numSig) { @@ -992,7 +1004,10 @@ pixel* recon = (useTSkip ? m_tsRecon : reconQt); uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride); - primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride); + if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512)) + primitives.cu[sizeIdxC].calcresidual_aligned(fenc, pred, residual, stride); + else + primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride); uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip); if (numSig) @@ -1183,7 +1198,11 @@ X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n"); - primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride); + if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512)) + primitives.cu[sizeIdxC].calcresidual_aligned(fenc, pred, residual, stride); + else + primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride); + uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false); if (numSig) { _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel