# HG changeset patch
# User Satoshi Nakagawa <nakagawa...@oki.com>
# Date 1404286661 -32400
#      Wed Jul 02 16:37:41 2014 +0900
# Node ID 3f25ca9b5addda057040a5e1a544b9ede9afc509
# Parent  a18972fd05b1d6242a881bef979b9e1ff17543d9
add primitives.nquant for RDOQ
diff -r a18972fd05b1 -r 3f25ca9b5add source/Lib/TLibCommon/TComTrQuant.cpp --- a/source/Lib/TLibCommon/TComTrQuant.cpp Tue Jul 01 14:58:35 2014 -0500 +++ b/source/Lib/TLibCommon/TComTrQuant.cpp Wed Jul 02 16:37:41 2014 +0900 @@ -508,23 +508,30 @@ uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t trSize, TextType ttype, uint32_t absPartIdx, int32_t *lastPos) { - x265_emms(); - selectLambda(ttype); - const uint32_t log2TrSize = g_convertToBit[trSize] + 2; - uint32_t absSum = 0; int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform - uint32_t goRiceParam = 0; - double blockUncodedCost = 0; int scalingListType = (cu->isIntra(absPartIdx) ? 0 : 3) + ttype; X265_CHECK(scalingListType < 6, "scaling list type out of range\n"); int qbits = QUANT_SHIFT + m_qpParam.m_per + transformShift; // Right shift of non-RDOQ quantizer; level = (coeff*Q + offset)>>q_bits int add = (1 << (qbits - 1)); - double *errScale = getErrScaleCoeff(scalingListType, log2TrSize - 2, m_qpParam.m_rem); int32_t *qCoef = getQuantCoeff(scalingListType, m_qpParam.m_rem, log2TrSize - 2); + int numCoeff = 1 << log2TrSize * 2; + int scaledCoeff[32 * 32]; + uint32_t numSig = primitives.nquant(srcCoeff, qCoef, scaledCoeff, dstCoeff, qbits, add, numCoeff); + + X265_CHECK(numSig == primitives.count_nonzero(dstCoeff, numCoeff), "numSig differ\n"); + if (numSig == 0) + return 0; + + x265_emms(); + selectLambda(ttype); + + double *errScale = getErrScaleCoeff(scalingListType, log2TrSize - 2, m_qpParam.m_rem); + + double blockUncodedCost = 0; double costCoeff[32 * 32]; double costSig[32 * 32]; double costCoeff0[32 * 32]; @@ -544,6 +551,7 @@ int c2 = 0; double baseCost = 0; int lastScanPos = -1; + uint32_t goRiceParam = 0; uint32_t c1Idx = 0; uint32_t c2Idx = 0; int cgLastScanPos = -1; @@ -567,16 +575,13 @@ //===== quantization ===== uint32_t blkPos = codingParameters.scan[scanPos]; // set coeff - int Q = 
qCoef[blkPos]; double scaleFactor = errScale[blkPos]; - int levelDouble = srcCoeff[blkPos]; - levelDouble = (int)std::min<int64_t>((int64_t)abs((int)levelDouble) * Q, MAX_INT - add); - uint32_t maxAbsLevel = (levelDouble + add) >> qbits; + int levelDouble = scaledCoeff[blkPos]; + uint32_t maxAbsLevel = abs(dstCoeff[blkPos]); costCoeff0[scanPos] = ((uint64_t)levelDouble * levelDouble) * scaleFactor; blockUncodedCost += costCoeff0[scanPos]; - dstCoeff[blkPos] = maxAbsLevel; if (maxAbsLevel > 0 && lastScanPos < 0) { @@ -776,7 +781,7 @@ //===== estimate last position ===== if (lastScanPos < 0) { - return absSum; + return 0; } double bestCost = 0; @@ -840,6 +845,7 @@ } // end if (sigCoeffGroupFlag[ cgBlkPos ]) } // end for + uint32_t absSum = 0; for (int pos = 0; pos < bestLastIdxp1; pos++) { int blkPos = codingParameters.scan[pos]; diff -r a18972fd05b1 -r 3f25ca9b5add source/common/dct.cpp --- a/source/common/dct.cpp Tue Jul 01 14:58:35 2014 -0500 +++ b/source/common/dct.cpp Wed Jul 02 16:37:41 2014 +0900 @@ -780,10 +780,8 @@ for (int blockpos = 0; blockpos < numCoeff; blockpos++) { - int level; - int sign; - level = coef[blockpos]; - sign = (level < 0 ? -1 : 1); + int level = coef[blockpos]; + int sign = (level < 0 ? -1 : 1); int tmplevel = abs(level) * quantCoeff[blockpos]; level = ((tmplevel + add) >> qBits); @@ -798,6 +796,27 @@ return acSum; } +uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int32_t* scaledCoeff, int32_t* qCoef, int qBits, int add, int numCoeff) +{ + uint32_t numSig = 0; + + for (int blockpos = 0; blockpos < numCoeff; blockpos++) + { + int level = coef[blockpos]; + int sign = (level < 0 ? 
-1 : 1); + + int tmplevel = abs(level) * quantCoeff[blockpos]; + scaledCoeff[blockpos] = tmplevel; + level = ((tmplevel + add) >> qBits); + if (level) + ++numSig; + level *= sign; + qCoef[blockpos] = Clip3(-32768, 32767, level); + } + + return numSig; +} + int count_nonzero_c(const int32_t *quantCoeff, int numCoeff) { X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n"); @@ -822,6 +841,7 @@ p.dequant_scaling = dequant_scaling_c; p.dequant_normal = dequant_normal_c; p.quant = quant_c; + p.nquant = nquant_c; p.dct[DST_4x4] = dst4_c; p.dct[DCT_4x4] = dct4_c; p.dct[DCT_8x8] = dct8_c; diff -r a18972fd05b1 -r 3f25ca9b5add source/common/primitives.h --- a/source/common/primitives.h Tue Jul 01 14:58:35 2014 -0500 +++ b/source/common/primitives.h Wed Jul 02 16:37:41 2014 +0900 @@ -147,6 +147,7 @@ typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride); typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos); +typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff); typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift); typedef void (*dequant_normal_t)(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift); typedef int (*count_nonzero_t)(const int32_t *quantCoeff, int numCoeff); @@ -242,6 +243,7 @@ dct_t dct[NUM_DCTS]; idct_t idct[NUM_IDCTS]; quant_t quant; + nquant_t nquant; dequant_scaling_t dequant_scaling; dequant_normal_t dequant_normal; count_nonzero_t count_nonzero; diff -r a18972fd05b1 -r 3f25ca9b5add source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 01 14:58:35 2014 -0500 +++ 
b/source/common/x86/asm-primitives.cpp Wed Jul 02 16:37:41 2014 +0900 @@ -1061,6 +1061,7 @@ p.dct[DCT_8x8] = x265_dct8_sse4; p.quant = x265_quant_sse4; + p.nquant = x265_nquant_sse4; p.dequant_normal = x265_dequant_normal_sse4; p.cvt16to32_shl = x265_cvt16to32_shl_sse4; p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4; @@ -1257,6 +1258,7 @@ p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4; p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4; p.quant = x265_quant_sse4; + p.nquant = x265_nquant_sse4; p.dequant_normal = x265_dequant_normal_sse4; p.weight_pp = x265_weight_pp_sse4; p.weight_sp = x265_weight_sp_sse4; diff -r a18972fd05b1 -r 3f25ca9b5add source/common/x86/pixel-util.h --- a/source/common/x86/pixel-util.h Tue Jul 01 14:58:35 2014 -0500 +++ b/source/common/x86/pixel-util.h Wed Jul 02 16:37:41 2014 +0900 @@ -45,6 +45,7 @@ void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride); uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos); +uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff); void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift); int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff); diff -r a18972fd05b1 -r 3f25ca9b5add source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Tue Jul 01 14:58:35 2014 -0500 +++ b/source/common/x86/pixel-util8.asm Wed Jul 02 16:37:41 2014 +0900 @@ -879,7 +879,7 @@ %define qbits8 [rsp + 2 * mmsize] %endif - ; fill qbits-8 + ; fill qbits movd m0, r4d mova qbits, m0 @@ -979,6 +979,81 @@ ;----------------------------------------------------------------------------- +; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff); 
+;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal nquant, 5,6,8 + + ; fill qbits + movd m5, r4d ; m5 = qbits + + ; fill offset + movd m6, r5m + pshufd m6, m6, 0 ; m6 = add + + mov r4d, r6m + shr r4d, 3 + pxor m7, m7 ; m7 = numZero +.loop: + ; 4 coeff + movu m0, [r0] ; m0 = level + pxor m1, m1 + pcmpgtd m1, m0 ; m1 = sign + movu m2, [r1] ; m2 = qcoeff + pabsd m0, m0 + pmulld m0, m2 ; m0 = tmpLevel1 + movu [r2], m0 ; m0 = scaledCoeff + paddd m2, m0, m6 + psrad m2, m5 ; m2 = level1 + pxor m4, m4 + pcmpeqd m4, m2 ; m4 = mask4 + + pxor m2, m1 + psubd m2, m1 + packssdw m2, m2 + pmovsxwd m2, m2 + movu [r3], m2 + ; 4 coeff + movu m0, [r0 + 16] ; m0 = level + pxor m1, m1 + pcmpgtd m1, m0 ; m1 = sign + movu m2, [r1 + 16] ; m2 = qcoeff + pabsd m0, m0 + pmulld m0, m2 ; m0 = tmpLevel1 + movu [r2 + 16], m0 ; m0 = scaledCoeff + paddd m2, m0, m6 + psrad m2, m5 ; m2 = level1 + pxor m0, m0 + pcmpeqd m0, m2 ; m0 = mask4 + + pxor m2, m1 + psubd m2, m1 + packssdw m2, m2 + pmovsxwd m2, m2 + movu [r3 + 16], m2 + + packssdw m4, m0 ; m4 = mask8 + psubw m7, m4 ; m7 = numZero + + add r0, 32 + add r1, 32 + add r2, 32 + add r3, 32 + + dec r4d + jnz .loop + + packuswb m7, m7 + pxor m0, m0 + psadbw m0, m7 + mov eax, r6m + movd r4d, m0 + sub eax, r4d ; numSig + + RET + + +;----------------------------------------------------------------------------- ; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift) ;----------------------------------------------------------------------------- INIT_XMM sse4 diff -r a18972fd05b1 -r 3f25ca9b5add source/test/mbdstharness.cpp --- a/source/test/mbdstharness.cpp Tue Jul 01 14:58:35 2014 -0500 +++ b/source/test/mbdstharness.cpp Wed Jul 02 16:37:41 2014 +0900 @@ -327,6 +327,50 @@ return true; } +bool MBDstHarness::check_nquant_primitive(nquant_t ref, nquant_t opt) +{ + int j = 0; + + for (int i = 0; i <= ITERS; i++) + { + int width = (rand() % 4 + 1) * 4; + + if (width == 12) 
+ { + width = 32; + } + int height = width; + + uint32_t optReturnValue = 0; + uint32_t refReturnValue = 0; + + int bits = rand() % 32; + int valueToAdd = rand() % (32 * 1024); + int cmp_size = sizeof(int) * height * width; + int numCoeff = height * width; + + int index1 = rand() % TEST_CASES; + int index2 = rand() % TEST_CASES; + + refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff); + optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff); + + if (memcmp(mintbuf3, mintbuf5, cmp_size)) + return false; + + if (memcmp(mintbuf4, mintbuf6, cmp_size)) + return false; + + if (optReturnValue != refReturnValue) + return false; + + reportfail(); + j += 16; + } + + return true; +} + bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt) { ALIGN_VAR_32(int32_t, qcoeff[32 * 32]); @@ -409,6 +453,15 @@ } } + if (opt.nquant) + { + if (!check_nquant_primitive(ref.nquant, opt.nquant)) + { + printf("nquant: Failed!\n"); + return false; + } + } + if (opt.count_nonzero) { if (!check_count_nonzero_primitive(ref.count_nonzero, opt.count_nonzero)) @@ -460,6 +513,12 @@ REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32, &dummy); } + if (opt.nquant) + { + printf("nquant\t\t"); + REPORT_SPEEDUP(opt.nquant, ref.nquant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32); + } + if (opt.count_nonzero) { for (int i = 4; i <= 32; i <<= 1) diff -r a18972fd05b1 -r 3f25ca9b5add source/test/mbdstharness.h --- a/source/test/mbdstharness.h Tue Jul 01 14:58:35 2014 -0500 +++ b/source/test/mbdstharness.h Wed Jul 02 16:37:41 2014 +0900 @@ -44,6 +44,7 @@ bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt); bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt); bool check_quant_primitive(quant_t ref, quant_t opt); + bool 
check_nquant_primitive(nquant_t ref, nquant_t opt); bool check_dct_primitive(dct_t ref, dct_t opt, intptr_t width); bool check_idct_primitive(idct_t ref, idct_t opt, intptr_t width); bool check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt); _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel