Please ignore this patch; I messed up an update. I will resend it soon. Thanks.
On Mon, Nov 27, 2017 at 5:11 PM, <prav...@multicorewareinc.com> wrote: > # HG changeset patch > # User Praveen Tiwari <prav...@multicorewareinc.com> > # Date 1511167656 -19800 > # Mon Nov 20 14:17:36 2017 +0530 > # Node ID dffb056e5ad0e2298b0dd65d048f4f16d8508566 > # Parent b24454f3ff6de650aab6835e291837fc4e2a4466 > quant.cpp: 'rdoQuant_c' primitive for SIMD optimization > > This particular section of code appears to be bottleneck in many profiles, > as it > involves 64-bit multiplication operations. For SIMD optimization we need > to convert > few buffer/variables to double. > > diff -r b24454f3ff6d -r dffb056e5ad0 source/common/dct.cpp > --- a/source/common/dct.cpp Wed Nov 22 22:00:48 2017 +0530 > +++ b/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530 > @@ -984,6 +984,32 @@ > return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28); > } > > +void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* > costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t > psyScale, uint32_t blkPos, uint32_t log2TrSize) > +{ > + const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - > log2TrSize; /* Represents scaling through forward transform */ > + const int scaleBits = SCALE_BITS - 2 * transformShift; > + const uint32_t trSize = 1 << log2TrSize; > + int max = X265_MAX(0, (2 * transformShift + 1)); > + > + for (int y = 0; y < MLS_CG_SIZE; y++) > + { > + for (int x = 0; x < MLS_CG_SIZE; x++) > + { > + int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* > pre-quantization DCT coeff */ > + int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - > signCoef; /* predicted DCT = source DCT - residual DCT*/ > + > + costUncoded[blkPos + x] = static_cast<double>((signCoef * > signCoef) << scaleBits); > + > + /* when no residual coefficient is coded, predicted coef == > recon coef */ > + costUncoded[blkPos + x] -= static_cast<double>((psyScale * > (predictedCoef)) >> max); > + > + *totalUncodedCost += costUncoded[blkPos + x]; > + *totalRdCost += 
costUncoded[blkPos + x]; > + } > + blkPos += trSize; > + } > +} > + > namespace X265_NS { > // x265 private namespace > > @@ -993,6 +1019,7 @@ > p.dequant_normal = dequant_normal_c; > p.quant = quant_c; > p.nquant = nquant_c; > + p.rdoQuant = rdoQuant_c; > p.dst4x4 = dst4_c; > p.cu[BLOCK_4x4].dct = dct4_c; > p.cu[BLOCK_8x8].dct = dct8_c; > diff -r b24454f3ff6d -r dffb056e5ad0 source/common/primitives.h > --- a/source/common/primitives.h Wed Nov 22 22:00:48 2017 +0530 > +++ b/source/common/primitives.h Mon Nov 20 14:17:36 2017 +0530 > @@ -216,6 +216,7 @@ > > typedef void (*integralv_t)(uint32_t *sum, intptr_t stride); > typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride); > +typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* > m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* > totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize); > > /* Function pointers to optimized encoder primitives. Each pointer can > reference > * either an assembly routine, a SIMD intrinsic primitive, or a C > function */ > @@ -304,6 +305,7 @@ > > quant_t quant; > nquant_t nquant; > + rdoQuant_t rdoQuant; > dequant_scaling_t dequant_scaling; > dequant_normal_t dequant_normal; > denoiseDct_t denoiseDct; > diff -r b24454f3ff6d -r dffb056e5ad0 source/common/quant.cpp > --- a/source/common/quant.cpp Wed Nov 22 22:00:48 2017 +0530 > +++ b/source/common/quant.cpp Mon Nov 20 14:17:36 2017 +0530 > @@ -663,7 +663,7 @@ > #define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * > transformShift + 1))) > > int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */ > - int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0 */ > + double costUncoded[trSize * trSize]; /* d*d + lambda * 0 */ > int64_t costSig[trSize * trSize]; /* lambda * bits */ > > int rateIncUp[trSize * trSize]; /* signal overhead of increasing > level */ > @@ -677,12 +677,12 @@ > bool bIsLuma = ttype == TEXT_LUMA; > > /* total rate distortion cost of transform 
block, as CBF=0 */ > - int64_t totalUncodedCost = 0; > + double totalUncodedCost = 0; > > /* Total rate distortion cost of this transform block, counting te > distortion of uncoded blocks, > * the distortion and signal cost of coded blocks, and the coding > cost of significant > * coefficient and coefficient group bitmaps */ > - int64_t totalRdCost = 0; > + double totalRdCost = 0; > > TUEntropyCodingParameters codeParams; > cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, > bIsLuma); > @@ -729,24 +729,9 @@ > uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); > uint32_t blkPos = codeParams.scan[scanPosBase]; > > - // TODO: we can't SIMD optimize because PSYVALUE need 64-bits > multiplication, convert to Double can work faster by FMA > - for (int y = 0; y < MLS_CG_SIZE; y++) > - { > - for (int x = 0; x < MLS_CG_SIZE; x++) > - { > - int signCoef = m_resiDctCoeff[blkPos + x]; > /* pre-quantization DCT coeff */ > - int predictedCoef = m_fencDctCoeff[blkPos + x] - > signCoef; /* predicted DCT = source DCT - residual DCT*/ > + // PSYVALUE need 64-bits multiplication, we have converted > few buffers/variables to double, expected to work faster by SIMD > + primitives.rdoQuant(m_resiDctCoeff, m_fencDctCoeff, > costUncoded, &totalUncodedCost, &totalRdCost, psyScale, blkPos, log2TrSize); > > - costUncoded[blkPos + x] = ((int64_t)signCoef * > signCoef) << scaleBits; > - > - /* when no residual coefficient is coded, predicted > coef == recon coef */ > - costUncoded[blkPos + x] -= PSYVALUE(predictedCoef); > - > - totalUncodedCost += costUncoded[blkPos + x]; > - totalRdCost += costUncoded[blkPos + x]; > - } > - blkPos += trSize; > - } > } > } > else > @@ -764,7 +749,7 @@ > for (int x = 0; x < MLS_CG_SIZE; x++) > { > int signCoef = m_resiDctCoeff[blkPos + x]; > /* pre-quantization DCT coeff */ > - costUncoded[blkPos + x] = ((int64_t)signCoef * > signCoef) << scaleBits; > + costUncoded[blkPos + x] = > static_cast<double>(((int64_t)signCoef * signCoef) << 
scaleBits); > > totalUncodedCost += costUncoded[blkPos + x]; > totalRdCost += costUncoded[blkPos + x]; > @@ -844,7 +829,7 @@ > int signCoef = m_resiDctCoeff[blkPos + > x]; /* pre-quantization DCT coeff */ > int predictedCoef = m_fencDctCoeff[blkPos + x] > - signCoef; /* predicted DCT = source DCT - residual DCT*/ > > - costUncoded[blkPos + x] = ((int64_t)signCoef * > signCoef) << scaleBits; > + costUncoded[blkPos + x] = > static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits); > > /* when no residual coefficient is coded, > predicted coef == recon coef */ > costUncoded[blkPos + x] -= > PSYVALUE(predictedCoef); > @@ -858,7 +843,7 @@ > X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, > log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, > codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); > > costSig[scanPosBase + scanPosOffset] = > SIGCOST(estBitsSbac.significantBits[0][ctxSig]); > - costCoeff[scanPosBase + scanPosOffset] = > costUncoded[blkPos + x]; > + costCoeff[scanPosBase + scanPosOffset] = > static_cast<int64_t>(costUncoded[blkPos + x]); > sigRateDelta[blkPos + x] = > estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0] > [ctxSig]; > } > blkPos += trSize; > @@ -872,7 +857,7 @@ > for (int x = 0; x < MLS_CG_SIZE; x++) > { > int signCoef = m_resiDctCoeff[blkPos + x]; > /* pre-quantization DCT coeff */ > - costUncoded[blkPos + x] = ((int64_t)signCoef * > signCoef) << scaleBits; > + costUncoded[blkPos + x] = > static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits); > > totalUncodedCost += costUncoded[blkPos + x]; > totalRdCost += costUncoded[blkPos + x]; > @@ -883,7 +868,7 @@ > X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, > log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, > codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); > > costSig[scanPosBase + scanPosOffset] = > SIGCOST(estBitsSbac.significantBits[0][ctxSig]); > - costCoeff[scanPosBase 
+ scanPosOffset] = > costUncoded[blkPos + x]; > + costCoeff[scanPosBase + scanPosOffset] = > static_cast<int64_t>(costUncoded[blkPos + x]); > sigRateDelta[blkPos + x] = > estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0] > [ctxSig]; > } > blkPos += trSize; > @@ -922,7 +907,7 @@ > * FIX15 nature of the CABAC cost tables minus the forward > transform scale */ > > /* cost of not coding this coefficient (all distortion, no > signal bits) */ > - costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << > scaleBits; > + costUncoded[blkPos] = static_cast<double>(((int64_t)signCoef > * signCoef) << scaleBits); > X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 > && scanPos!=0)\n"); > if (usePsyMask & scanPos) > /* when no residual coefficient is coded, predicted coef > == recon coef */ > @@ -956,7 +941,7 @@ > // fast zero coeff path > /* set default costs to uncoded costs */ > costSig[scanPos] = SIGCOST(estBitsSbac. > significantBits[0][ctxSig]); > - costCoeff[scanPos] = costUncoded[blkPos] + > costSig[scanPos]; > + costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos] > + costSig[scanPos]); > sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] > - estBitsSbac.significantBits[0][ctxSig]; > totalRdCost += costCoeff[scanPos]; > rateIncUp[blkPos] = greaterOneBits[0]; > @@ -991,7 +976,7 @@ > { > /* set default costs to uncoded costs */ > costSig[scanPos] = SIGCOST(estBitsSbac. 
> significantBits[0][ctxSig]); > - costCoeff[scanPos] = costUncoded[blkPos] + > costSig[scanPos]; > + costCoeff[scanPos] = > static_cast<int64_t>(costUncoded[blkPos] > + costSig[scanPos]); > } > sigRateDelta[blkPos] = > estBitsSbac.significantBits[1][ctxSig] > - estBitsSbac.significantBits[0][ctxSig]; > sigCoefBits = estBitsSbac.significantBits[1][ctxSig]; > @@ -1138,7 +1123,7 @@ > { > sigCoeffGroupFlag64 |= cgBlkPosMask; > cgRdStats.codedLevelAndDist += costCoeff[scanPos] - > costSig[scanPos]; > - cgRdStats.uncodedDist += costUncoded[blkPos]; > + cgRdStats.uncodedDist += static_cast<int64_t>( > costUncoded[blkPos]); > cgRdStats.nnzBeforePos0 += scanPosinCG; > } > } > @@ -1174,7 +1159,7 @@ > > uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, > cgPosX, cgPosY, cgBlkPos, cgStride); > > - int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac. > significantCoeffGroupBits[sigCtx][0]); > + int64_t costZeroCG = static_cast<int64_t>(totalRdCost + > SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0])); > costZeroCG += cgRdStats.uncodedDist; /* add distortion > for resetting non-zero levels to zero levels */ > costZeroCG -= cgRdStats.codedLevelAndDist; /* remove > distortion and level cost of coded coefficients */ > costZeroCG -= cgRdStats.sigCost; /* remove > signaling cost of significant coeff bitmap */ > @@ -1185,7 +1170,7 @@ > if (costZeroCG < totalRdCost && m_rdoqLevel > 1) > { > sigCoeffGroupFlag64 &= ~cgBlkPosMask; > - totalRdCost = costZeroCG; > + totalRdCost = static_cast<double>(costZeroCG); > costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac. > significantCoeffGroupBits[sigCtx][0]); > > /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */ > @@ -1212,14 +1197,14 @@ > int64_t bestCost; > if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx]) > { > - bestCost = totalUncodedCost + SIGCOST(estBitsSbac. 
> blockRootCbpBits[0]); > - totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]); > + bestCost = static_cast<int64_t>(totalUncodedCost + > SIGCOST(estBitsSbac.blockRootCbpBits[0])); > + totalRdCost += static_cast<double>((SIGCOST( > estBitsSbac.blockRootCbpBits[1]))); > } > else > { > int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]]; > - bestCost = totalUncodedCost + SIGCOST(estBitsSbac. > blockCbpBits[ctx][0]); > - totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]); > + bestCost = static_cast<int64_t>(totalUncodedCost + > SIGCOST(estBitsSbac.blockCbpBits[ctx][0])); > + totalRdCost += static_cast<double>(SIGCOST( > estBitsSbac.blockCbpBits[ctx][1])); > } > > /* This loop starts with the last non-zero found in the first loop > and then refines this last > @@ -1277,7 +1262,7 @@ > bitsLastNZ += IEP_RATE * suffixLen; > } > > - int64_t costAsLast = totalRdCost - costSig[scanPos] + > SIGCOST(bitsLastNZ); > + int64_t costAsLast = static_cast<int64_t>(totalRdCost - > costSig[scanPos] + SIGCOST(bitsLastNZ)); > > if (costAsLast < bestCost) > { >
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel