# HG changeset patch # User Vignesh Vijayakumar <vign...@multicorewareinc.com> # Date 1513752346 -19800 # Wed Dec 20 12:15:46 2017 +0530 # Node ID 9a2c5411769847c4283594b99c1b07a99e92ea4a # Parent a2224f4d257cf5f5cd391f455aae3117b7fe65ab x86: nonPsyRdoQuant primitive
This patch also adds AVX512 assembly code for this primitive C code : 182.62c AVX512 : 586.00c diff -r a2224f4d257c -r 9a2c54117698 source/common/dct.cpp --- a/source/common/dct.cpp Fri Dec 22 13:52:16 2017 +0530 +++ b/source/common/dct.cpp Wed Dec 20 12:15:46 2017 +0530 @@ -980,10 +980,27 @@ sum += sbacGetEntropyBits(mstate, firstC2Flag); } } - return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28); } +template<int log2TrSize> +static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos) +{ + const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ + const int scaleBits = SCALE_BITS - 2 * transformShift; + const uint32_t trSize = 1 << log2TrSize; + for (int y = 0; y < MLS_CG_SIZE; y++) + { + for (int x = 0; x < MLS_CG_SIZE; x++) + { + int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ + costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits)); + *totalUncodedCost += costUncoded[blkPos + x]; + *totalRdCost += costUncoded[blkPos + x]; + } + blkPos += trSize; + } +} namespace X265_NS { // x265 private namespace @@ -993,6 +1010,10 @@ p.dequant_normal = dequant_normal_c; p.quant = quant_c; p.nquant = nquant_c; + p.cu[BLOCK_4x4].nonPsyRdoQuant = nonPsyRdoQuant_c<2>; + p.cu[BLOCK_8x8].nonPsyRdoQuant = nonPsyRdoQuant_c<3>; + p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>; + p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>; p.dst4x4 = dst4_c; p.cu[BLOCK_4x4].dct = dct4_c; p.cu[BLOCK_8x8].dct = dct8_c; diff -r a2224f4d257c -r 9a2c54117698 source/common/primitives.h --- a/source/common/primitives.h Fri Dec 22 13:52:16 2017 +0530 +++ b/source/common/primitives.h Wed Dec 20 12:15:46 2017 +0530 @@ -223,7 +223,7 @@ typedef void (*integralv_t)(uint32_t *sum, intptr_t stride); typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride); - 
+typedef void(*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos); /* Function pointers to optimized encoder primitives. Each pointer can reference * either an assembly routine, a SIMD intrinsic primitive, or a C function */ struct EncoderPrimitives @@ -299,9 +299,9 @@ intra_allangs_t intra_pred_allangs; intra_filter_t intra_filter; intra_pred_t intra_pred[NUM_INTRA_MODE]; + nonPsyRdoQuant_t nonPsyRdoQuant; } cu[NUM_CU_SIZES]; - /* These remaining primitives work on either fixed block sizes or take * block dimensions as arguments and thus do not belong in either the PU or * the CU arrays */ diff -r a2224f4d257c -r 9a2c54117698 source/common/quant.cpp --- a/source/common/quant.cpp Fri Dec 22 13:52:16 2017 +0530 +++ b/source/common/quant.cpp Wed Dec 20 12:15:46 2017 +0530 @@ -734,12 +734,9 @@ { int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ - - costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits; - + costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits; /* widen to 64 bits before squaring/shifting: signCoef is int, so 32-bit math can overflow */ /* when no residual coefficient is coded, predicted coef == recon coef */ costUncoded[blkPos + x] -= PSYVALUE(predictedCoef); - totalUncodedCost += costUncoded[blkPos + x]; totalRdCost += costUncoded[blkPos + x]; } @@ -753,25 +750,11 @@ for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) { X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); - uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); uint32_t blkPos = codeParams.scan[scanPosBase]; - - for (int y = 0; y < MLS_CG_SIZE; y++) - { - for (int x = 0; x < MLS_CG_SIZE; x++) - { - int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ - costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits; - - totalUncodedCost += 
costUncoded[blkPos + x]; - totalRdCost += costUncoded[blkPos + x]; - } - blkPos += trSize; - } + primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); } } - static const uint8_t table_cnt[5][SCAN_SET_SIZE] = { // patternSigCtx = 0 @@ -841,12 +824,9 @@ { int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ - - costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits; - + costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits; /* widen to 64 bits before squaring/shifting: signCoef is int, so 32-bit math can overflow */ /* when no residual coefficient is coded, predicted coef == recon coef */ costUncoded[blkPos + x] -= PSYVALUE(predictedCoef); - totalUncodedCost += costUncoded[blkPos + x]; totalRdCost += costUncoded[blkPos + x]; @@ -865,16 +845,12 @@ else { // non-psy path + primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); + blkPos = codeParams.scan[scanPosBase]; for (int y = 0; y < MLS_CG_SIZE; y++) { for (int x = 0; x < MLS_CG_SIZE; x++) { - int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ - costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits; - - totalUncodedCost += costUncoded[blkPos + x]; - totalRdCost += costUncoded[blkPos + x]; - const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; X265_CHECK(trSize > 4, "trSize check failure\n"); diff -r a2224f4d257c -r 9a2c54117698 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Dec 22 13:52:16 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 20 12:15:46 2017 +0530 @@ -3083,6 +3083,12 @@ p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx512); p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx512); 
p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx512); + + p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx512); + p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512); + p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512); + p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512); + } #endif } @@ -5265,7 +5271,10 @@ p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x32>; p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>; p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>; - + p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx512); + p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512); + p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512); + p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512); } #endif } diff -r a2224f4d257c -r 9a2c54117698 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asm Fri Dec 22 13:52:16 2017 +0530 +++ b/source/common/x86/dct8.asm Wed Dec 20 12:15:46 2017 +0530 @@ -510,8 +510,22 @@ tab_idct8_2: times 1 dw 89, 75, 50, 18, 75, -18, -89, -50 times 1 dw 50, -89, 18, 75, 18, -50, 75, -89 - pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 +;Transform shift and scale bits table for rdoQuant +tab_nonpsyRdo8 : dq 5, 5 + dq 4, 7 + dq 3, 9 + dq 2, 11 + +tab_nonpsyRdo10: dq 3, 9 + dq 2, 11 + dq 1, 13 + dq 0, 15 + +tab_nonpsyRdo12: dq 1, 13 + dq 0, 15 + dq -1, 17 + dq -2, 19 SECTION .text cextern pd_1 @@ -6399,4 +6413,319 @@ movhps [r1 + 2 * r2], xm0 movhps [r1 + r3], xm1 RET + +;static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos) +;{ +; const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ +; const int scaleBits = SCALE_BITS - 2 * transformShift; +; const uint32_t trSize = 1 << log2TrSize; + +; for (int y = 0; y < MLS_CG_SIZE; y++) +; { +; for (int x = 
0; x < MLS_CG_SIZE; x++) +; { +; int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ +; costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits)); +; *totalUncodedCost += costUncoded[blkPos + x]; +; *totalRdCost += costUncoded[blkPos + x]; +; } +; blkPos += trSize; +; } +;} + +;--------------------------------------------------------------------------------------------------------------------------------------------------------- +; void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos) +;--------------------------------------------------------------------------------------------------------------------------------------------------------- +INIT_ZMM avx512 +cglobal nonPsyRdoQuant4, 5, 8, 8 + + mov r4d, r4m + lea r0, [r0 + 2 * r4] + lea r7, [4 * r4] + lea r1, [r1 + 2 * r7] + +%if BIT_DEPTH == 12 + mov r5q, [tab_nonpsyRdo12] ; transformShift + mov r6q, [tab_nonpsyRdo12 + 8] ; scaleBits +%elif BIT_DEPTH == 10 + mov r5q, [tab_nonpsyRdo10] + mov r6q, [tab_nonpsyRdo10 + 8] +%elif BIT_DEPTH == 8 + mov r5q, [tab_nonpsyRdo8] + mov r6q, [tab_nonpsyRdo8 + 8] +%else + %error Unsupported BIT_DEPTH! 
+ %endif + + movq xm3, r6 + movq xm6, [r2] + movq xm7, [r3] + vpxor m4, m4 + vpxor m5, m5 + +;Row 1, 2 + movq xm0, [r0] + pinsrq xm0, [r0 + 8], 1 + vpmovsxwq m1, xm0 + vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements + + vfmadd132pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements + vfmadd213pd m2, m2, m5 + vfmadd231pd m2, m2, m5 + + vcvtpd2qq m1, m2 + vpsllq m1, xm3 ; costUncoded + paddq m4, m1 + movu [r1], ym1 + vextracti32x8 [r1 + 32], m1 , 1 + + ;Row 3, 4 + movq xm0, [r0 + 16] + pinsrq xm0, [r0 + 24], 1 + vpmovsxwq m1, xm0 + vcvtqq2pd m2, m1 + + vfmadd132pd m2, m2, m5 + vfmadd213pd m2, m2, m5 + vfmadd231pd m2, m2, m5 + + vcvtpd2qq m1, m2 + vpsllq m1, xm3 ; costUncoded + paddq m4, m1 + movu [r1 + 64], ym1 + vextracti32x8 [r1 + 96], m1 , 1 + + vextracti32x8 ym2, m4, 1 + paddq ym4, ym2 + vextracti32x4 xm2, m4, 1 + paddq xm4, xm2 + punpckhqdq xm2, xm4, xm5 + paddq xm4, xm2 + + paddq xm6, xm4 + paddq xm7, xm4 + + movq [r2], xm6 + movq [r3], xm7 + RET + +INIT_ZMM avx512 +cglobal nonPsyRdoQuant8, 5, 8, 8 + + mov r4d, r4m + lea r0, [r0 + 2 * r4] + lea r7, [4 * r4] + lea r1, [r1 + 2 * r7] + +%if BIT_DEPTH == 12 + mov r5q, [tab_nonpsyRdo12 + 16] ; transformShift + mov r6q, [tab_nonpsyRdo12 + 24] ; scaleBits +%elif BIT_DEPTH == 10 + mov r5q, [tab_nonpsyRdo10 + 16] + mov r6q, [tab_nonpsyRdo10 + 24] +%elif BIT_DEPTH == 8 + mov r5q, [tab_nonpsyRdo8 + 16] + mov r6q, [tab_nonpsyRdo8 + 24] +%else + %error Unsupported BIT_DEPTH! 
+ %endif + + movq xm3, r6 + movq xm6, [r2] + movq xm7, [r3] + vpxor m4, m4 + vpxor m5, m5 + +;Row 1, 2 + movq xm0, [r0] + pinsrq xm0, [r0 + mmsize/4], 1 + vpmovsxwq m1, xm0 + vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements + + vfmadd132pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements + vfmadd213pd m2, m2, m5 + vfmadd231pd m2, m2, m5 + + vcvtpd2qq m1, m2 + vpsllq m1, xm3 ; costUncoded + paddq m4, m1 + movu [r1], ym1 + vextracti32x8 [r1 + mmsize], m1 , 1 + + ;Row 3, 4 + movq xm0, [r0 + mmsize/2] + pinsrq xm0, [r0 + 3 * mmsize/4], 1 + vpmovsxwq m1, xm0 + vcvtqq2pd m2, m1 + + vfmadd132pd m2, m2, m5 + vfmadd213pd m2, m2, m5 + vfmadd231pd m2, m2, m5 + + vcvtpd2qq m1, m2 + vpsllq m1, xm3 ; costUncoded + paddq m4, m1 + movu [r1 + 2 * mmsize], ym1 + vextracti32x8 [r1 + 3 * mmsize], m1 , 1 + + vextracti32x8 ym2, m4, 1 + paddq ym4, ym2 + vextracti32x4 xm2, m4, 1 + paddq xm4, xm2 + punpckhqdq xm2, xm4, xm5 + paddq xm4, xm2 + + paddq xm6, xm4 + paddq xm7, xm4 + + movq [r2], xm6 + movq [r3], xm7 + RET + +INIT_ZMM avx512 +cglobal nonPsyRdoQuant16, 5, 8, 8 + + mov r4d, r4m + lea r0, [r0 + 2 * r4] + lea r7, [4 * r4] + lea r1, [r1 + 2 * r7] + +%if BIT_DEPTH == 12 + mov r5q, [tab_nonpsyRdo12 + 32] ; transformShift + mov r6q, [tab_nonpsyRdo12 + 40] ; scaleBits +%elif BIT_DEPTH == 10 + mov r5q, [tab_nonpsyRdo10 + 32] + mov r6q, [tab_nonpsyRdo10 + 40] +%elif BIT_DEPTH == 8 + mov r5q, [tab_nonpsyRdo8 + 32] + mov r6q, [tab_nonpsyRdo8 + 40] +%else + %error Unsupported BIT_DEPTH! 
+ %endif + + movq xm3, r6 + movq xm6, [r2] + movq xm7, [r3] + vpxor m4, m4 + vpxor m5, m5 + +;Row 1, 2 + movq xm0, [r0] + pinsrq xm0, [r0 + mmsize/2], 1 + vpmovsxwq m1, xm0 + vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements + + vfmadd132pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements + vfmadd213pd m2, m2, m5 + vfmadd231pd m2, m2, m5 + + vcvtpd2qq m1, m2 + vpsllq m1, xm3 ; costUncoded + paddq m4, m1 + movu [r1], ym1 + vextracti32x8 [r1 + 2 * mmsize], m1, 1 + + ;Row 3, 4 + movq xm0, [r0 + mmsize] + pinsrq xm0, [r0 + 3 * mmsize/2], 1 + vpmovsxwq m1, xm0 + vcvtqq2pd m2, m1 + + vfmadd132pd m2, m2, m5 + vfmadd213pd m2, m2, m5 + vfmadd231pd m2, m2, m5 + + vcvtpd2qq m1, m2 + vpsllq m1, xm3 ; costUncoded + paddq m4, m1 + movu [r1 + 4 * mmsize], ym1 + vextracti32x8 [r1 + 6 * mmsize], m1 , 1 + + vextracti32x8 ym2, m4, 1 + paddq ym4, ym2 + vextracti32x4 xm2, m4, 1 + paddq xm4, xm2 + punpckhqdq xm2, xm4, xm5 + paddq xm4, xm2 + + paddq xm6, xm4 + paddq xm7, xm4 + + movq [r2], xm6 + movq [r3], xm7 + RET + +INIT_ZMM avx512 +cglobal nonPsyRdoQuant32, 5, 8, 8 + + mov r4d, r4m + lea r0, [r0 + 2 * r4] + lea r7, [4 * r4] + lea r1, [r1 + 2 * r7] + +%if BIT_DEPTH == 12 + mov r5q, [tab_nonpsyRdo12 + 48] ; transformShift + mov r6q, [tab_nonpsyRdo12 + 56] ; scaleBits +%elif BIT_DEPTH == 10 + mov r5q, [tab_nonpsyRdo10 + 48] + mov r6q, [tab_nonpsyRdo10 + 56] +%elif BIT_DEPTH == 8 + mov r5q, [tab_nonpsyRdo8 + 48] + mov r6q, [tab_nonpsyRdo8 + 56] +%else + %error Unsupported BIT_DEPTH! 
+ %endif + + movq xm3, r6 + movq xm6, [r2] + movq xm7, [r3] + vpxor m4, m4 + vpxor m5, m5 + +;Row 1, 2 + movq xm0, [r0] + pinsrq xm0, [r0 + mmsize], 1 + vpmovsxwq m1, xm0 + vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements + + vfmadd132pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements + vfmadd213pd m2, m2, m5 + vfmadd231pd m2, m2, m5 + + vcvtpd2qq m1, m2 + vpsllq m1, xm3 ; costUncoded + paddq m4, m1 + movu [r1], ym1 + vextracti32x8 [r1 + 4 * mmsize], m1, 1 + + ;Row 3, 4 + movq xm0, [r0 + 2 * mmsize] + pinsrq xm0, [r0 + 3 * mmsize], 1 + vpmovsxwq m1, xm0 + vcvtqq2pd m2, m1 + + vfmadd132pd m2, m2, m5 + vfmadd213pd m2, m2, m5 + vfmadd231pd m2, m2, m5 + + vcvtpd2qq m1, m2 + vpsllq m1, xm3 ; costUncoded + paddq m4, m1 + movu [r1 + 8 * mmsize], ym1 + vextracti32x8 [r1 + 12 * mmsize], m1 , 1 + + vextracti32x8 ym2, m4, 1 + paddq ym4, ym2 + vextracti32x4 xm2, m4, 1 + paddq xm4, xm2 + punpckhqdq xm2, xm4, xm5 + paddq xm4, xm2 + + paddq xm6, xm4 + paddq xm7, xm4 + + movq [r2], xm6 + movq [r3], xm7 + RET + %endif diff -r a2224f4d257c -r 9a2c54117698 source/common/x86/dct8.h --- a/source/common/x86/dct8.h Fri Dec 22 13:52:16 2017 +0530 +++ b/source/common/x86/dct8.h Wed Dec 20 12:15:46 2017 +0530 @@ -34,7 +34,7 @@ FUNCDEF_TU_S2(void, idct, ssse3, const int16_t* src, int16_t* dst, intptr_t dstStride); FUNCDEF_TU_S2(void, idct, sse4, const int16_t* src, int16_t* dst, intptr_t dstStride); FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride); - +FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx512, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos); void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride); void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride); void PFX(idst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride); diff -r a2224f4d257c -r 9a2c54117698 
source/test/mbdstharness.cpp --- a/source/test/mbdstharness.cpp Fri Dec 22 13:52:16 2017 +0530 +++ b/source/test/mbdstharness.cpp Wed Dec 20 12:15:46 2017 +0530 @@ -279,9 +279,52 @@ reportfail(); j += INCR; } + return true; +} + +bool MBDstHarness::check_nonPsyRdoQuant_primitive(nonPsyRdoQuant_t ref, nonPsyRdoQuant_t opt) +{ + int j = 0; + int trSize[4] = { 16, 64, 256, 1024 }; + + ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]); + ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]); + + for (int i = 0; i < ITERS; i++) + { + int64_t totalRdCostRef = rand(); + int64_t totalUncodedCostRef = rand(); + int64_t totalRdCostOpt = totalRdCostRef; + int64_t totalUncodedCostOpt = totalUncodedCostRef; + + int index = rand() % 4; + uint32_t blkPos = trSize[index]; + int cmp_size = (int)sizeof(ref_dest); /* byte size of the whole buffer, so output at every blkPos is compared */ + + memset(ref_dest, 0, sizeof(ref_dest)); /* clear the full buffer, not just its first quarter */ + memset(opt_dest, 0, sizeof(opt_dest)); + + int index1 = rand() % TEST_CASES; + + ref(short_test_buff[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, blkPos); + checked(opt, short_test_buff[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, blkPos); + + if (memcmp(ref_dest, opt_dest, cmp_size)) + return false; + + if (totalUncodedCostRef != totalUncodedCostOpt) + return false; + + if (totalRdCostRef != totalRdCostOpt) + return false; + + reportfail(); + j += INCR; + } return true; } + bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt) { int j = 0; @@ -418,6 +461,19 @@ return false; } } + + for (int i = 0; i < NUM_TR_SIZE; i++) + { + if (opt.cu[i].nonPsyRdoQuant) + { + if (!check_nonPsyRdoQuant_primitive(ref.cu[i].nonPsyRdoQuant, opt.cu[i].nonPsyRdoQuant)) + { + printf("nonPsyRdoQuant[%dx%d]: Failed!\n", 4 << i, 4 << i); + return false; + } + } + } + for (int i = 0; i < NUM_TR_SIZE; i++) { if (opt.cu[i].count_nonzero) @@ -505,6 +561,19 @@ printf("nquant\t\t"); REPORT_SPEEDUP(opt.nquant, ref.nquant, short_test_buff[0], int_test_buff[1], mshortbuf2, 23, 
23785, 32 * 32); } + + for (int value = 0; value < NUM_TR_SIZE; value++) + { + if (opt.cu[value].nonPsyRdoQuant) + { + ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]); + int64_t totalRdCost = 0; + int64_t totalUncodedCost = 0; + printf("nonPsyRdoQuant[%dx%d]", 4 << value, 4 << value); + REPORT_SPEEDUP(opt.cu[value].nonPsyRdoQuant, ref.cu[value].nonPsyRdoQuant, short_test_buff[0], opt_dest, &totalUncodedCost, &totalRdCost, 0); + } + } + for (int value = 0; value < NUM_TR_SIZE; value++) { if (opt.cu[value].count_nonzero) diff -r a2224f4d257c -r 9a2c54117698 source/test/mbdstharness.h --- a/source/test/mbdstharness.h Fri Dec 22 13:52:16 2017 +0530 +++ b/source/test/mbdstharness.h Wed Dec 20 12:15:46 2017 +0530 @@ -62,9 +62,9 @@ int16_t short_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE]; int16_t short_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE]; - bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt); bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt); + bool check_nonPsyRdoQuant_primitive(nonPsyRdoQuant_t ref, nonPsyRdoQuant_t opt); bool check_quant_primitive(quant_t ref, quant_t opt); bool check_nquant_primitive(nquant_t ref, nquant_t opt); bool check_dct_primitive(dct_t ref, dct_t opt, intptr_t width); _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel