# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1511167656 -19800 # Mon Nov 20 14:17:36 2017 +0530 # Node ID dffb056e5ad0e2298b0dd65d048f4f16d8508566 # Parent b24454f3ff6de650aab6835e291837fc4e2a4466 quant.cpp: 'rdoQuant_c' primitive for SIMD optimization
This particular section of code appears to be a bottleneck in many profiles, as it involves 64-bit multiplication operations. For SIMD optimization we need to convert a few buffers/variables to double. diff -r b24454f3ff6d -r dffb056e5ad0 source/common/dct.cpp --- a/source/common/dct.cpp Wed Nov 22 22:00:48 2017 +0530 +++ b/source/common/dct.cpp Mon Nov 20 14:17:36 2017 +0530 @@ -984,6 +984,32 @@ return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28); } +void rdoQuant_c(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize) +{ + const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ + const int scaleBits = SCALE_BITS - 2 * transformShift; + const uint32_t trSize = 1 << log2TrSize; + int max = X265_MAX(0, (2 * transformShift + 1)); + + for (int y = 0; y < MLS_CG_SIZE; y++) + { + for (int x = 0; x < MLS_CG_SIZE; x++) + { + int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ + int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ + + costUncoded[blkPos + x] = static_cast<double>((signCoef * signCoef) << scaleBits); + + /* when no residual coefficient is coded, predicted coef == recon coef */ + costUncoded[blkPos + x] -= static_cast<double>((psyScale * (predictedCoef)) >> max); + + *totalUncodedCost += costUncoded[blkPos + x]; + *totalRdCost += costUncoded[blkPos + x]; + } + blkPos += trSize; + } +} + namespace X265_NS { // x265 private namespace @@ -993,6 +1019,7 @@ p.dequant_normal = dequant_normal_c; p.quant = quant_c; p.nquant = nquant_c; + p.rdoQuant = rdoQuant_c; p.dst4x4 = dst4_c; p.cu[BLOCK_4x4].dct = dct4_c; p.cu[BLOCK_8x8].dct = dct8_c; diff -r b24454f3ff6d -r dffb056e5ad0 source/common/primitives.h --- a/source/common/primitives.h Wed Nov 22 22:00:48 2017 +0530 +++ 
b/source/common/primitives.h Mon Nov 20 14:17:36 2017 +0530 @@ -216,6 +216,7 @@ typedef void (*integralv_t)(uint32_t *sum, intptr_t stride); typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride); +typedef void (*rdoQuant_t)(int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, double* costUncoded, double* totalUncodedCost, double* totalRdCost, int64_t psyScale, uint32_t blkPos, uint32_t log2TrSize); /* Function pointers to optimized encoder primitives. Each pointer can reference * either an assembly routine, a SIMD intrinsic primitive, or a C function */ @@ -304,6 +305,7 @@ quant_t quant; nquant_t nquant; + rdoQuant_t rdoQuant; dequant_scaling_t dequant_scaling; dequant_normal_t dequant_normal; denoiseDct_t denoiseDct; diff -r b24454f3ff6d -r dffb056e5ad0 source/common/quant.cpp --- a/source/common/quant.cpp Wed Nov 22 22:00:48 2017 +0530 +++ b/source/common/quant.cpp Mon Nov 20 14:17:36 2017 +0530 @@ -663,7 +663,7 @@ #define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1))) int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */ - int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0 */ + double costUncoded[trSize * trSize]; /* d*d + lambda * 0 */ int64_t costSig[trSize * trSize]; /* lambda * bits */ int rateIncUp[trSize * trSize]; /* signal overhead of increasing level */ @@ -677,12 +677,12 @@ bool bIsLuma = ttype == TEXT_LUMA; /* total rate distortion cost of transform block, as CBF=0 */ - int64_t totalUncodedCost = 0; + double totalUncodedCost = 0; /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks, * the distortion and signal cost of coded blocks, and the coding cost of significant * coefficient and coefficient group bitmaps */ - int64_t totalRdCost = 0; + double totalRdCost = 0; TUEntropyCodingParameters codeParams; cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma); @@ -729,24 +729,9 @@ uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); 
uint32_t blkPos = codeParams.scan[scanPosBase]; - // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA - for (int y = 0; y < MLS_CG_SIZE; y++) - { - for (int x = 0; x < MLS_CG_SIZE; x++) - { - int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ - int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ + // PSYVALUE need 64-bits multiplication, we have converted few buffers/variables to double, expected to work faster by SIMD + primitives.rdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, psyScale, blkPos, log2TrSize); - costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits; - - /* when no residual coefficient is coded, predicted coef == recon coef */ - costUncoded[blkPos + x] -= PSYVALUE(predictedCoef); - - totalUncodedCost += costUncoded[blkPos + x]; - totalRdCost += costUncoded[blkPos + x]; - } - blkPos += trSize; - } } } else @@ -764,7 +749,7 @@ for (int x = 0; x < MLS_CG_SIZE; x++) { int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ - costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits; + costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits); totalUncodedCost += costUncoded[blkPos + x]; totalRdCost += costUncoded[blkPos + x]; @@ -844,7 +829,7 @@ int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ - costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits; + costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits); /* when no residual coefficient is coded, predicted coef == recon coef */ costUncoded[blkPos + x] -= PSYVALUE(predictedCoef); @@ -858,7 +843,7 @@ X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, 
trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); - costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; + costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(costUncoded[blkPos + x]); sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; } blkPos += trSize; @@ -872,7 +857,7 @@ for (int x = 0; x < MLS_CG_SIZE; x++) { int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ - costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits; + costUncoded[blkPos + x] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits); totalUncodedCost += costUncoded[blkPos + x]; totalRdCost += costUncoded[blkPos + x]; @@ -883,7 +868,7 @@ X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); - costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; + costCoeff[scanPosBase + scanPosOffset] = static_cast<int64_t>(costUncoded[blkPos + x]); sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; } blkPos += trSize; @@ -922,7 +907,7 @@ * FIX15 nature of the CABAC cost tables minus the forward transform scale */ /* cost of not coding this coefficient (all distortion, no signal bits) */ - costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits; + costUncoded[blkPos] = static_cast<double>(((int64_t)signCoef * signCoef) << scaleBits); X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n"); if (usePsyMask & scanPos) /* when no residual coefficient is coded, predicted coef == recon coef */ @@ 
-956,7 +941,7 @@ // fast zero coeff path /* set default costs to uncoded costs */ costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); - costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; + costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos] + costSig[scanPos]); sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; totalRdCost += costCoeff[scanPos]; rateIncUp[blkPos] = greaterOneBits[0]; @@ -991,7 +976,7 @@ { /* set default costs to uncoded costs */ costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); - costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; + costCoeff[scanPos] = static_cast<int64_t>(costUncoded[blkPos] + costSig[scanPos]); } sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; sigCoefBits = estBitsSbac.significantBits[1][ctxSig]; @@ -1138,7 +1123,7 @@ { sigCoeffGroupFlag64 |= cgBlkPosMask; cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos]; - cgRdStats.uncodedDist += costUncoded[blkPos]; + cgRdStats.uncodedDist += static_cast<int64_t>(costUncoded[blkPos]); cgRdStats.nnzBeforePos0 += scanPosinCG; } } @@ -1174,7 +1159,7 @@ uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); - int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); + int64_t costZeroCG = static_cast<int64_t>(totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0])); costZeroCG += cgRdStats.uncodedDist; /* add distortion for resetting non-zero levels to zero levels */ costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */ costZeroCG -= cgRdStats.sigCost; /* remove signaling cost of significant coeff bitmap */ @@ -1185,7 +1170,7 @@ if (costZeroCG < totalRdCost && m_rdoqLevel > 1) { sigCoeffGroupFlag64 &= ~cgBlkPosMask; - totalRdCost = costZeroCG; + totalRdCost = 
static_cast<double>(costZeroCG); costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */ @@ -1212,14 +1197,14 @@ int64_t bestCost; if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx]) { - bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]); - totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]); + bestCost = static_cast<int64_t>(totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0])); + totalRdCost += static_cast<double>((SIGCOST(estBitsSbac.blockRootCbpBits[1]))); } else { int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]]; - bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]); - totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]); + bestCost = static_cast<int64_t>(totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0])); + totalRdCost += static_cast<double>(SIGCOST(estBitsSbac.blockCbpBits[ctx][1])); } /* This loop starts with the last non-zero found in the first loop and then refines this last @@ -1277,7 +1262,7 @@ bitsLastNZ += IEP_RATE * suffixLen; } - int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ); + int64_t costAsLast = static_cast<int64_t>(totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ)); if (costAsLast < bestCost) { _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel