# HG changeset patch # User Vignesh Vijayakumar<vign...@multicorewareinc.com> # Date 1514521347 -19800 # Fri Dec 29 09:52:27 2017 +0530 # Node ID 4e9f2efdfd097910aa5bf704a4bbf38b0a28f2a5 # Parent 80775bda5ec16735e7b1de97dedeb7f7ed391c8f x86: psyRdoQuant primitive
This patch also adds AVX512 assembly code for this primitive AVX512 :231.20c C code :1060.74c diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/dct.cpp --- a/source/common/dct.cpp Tue Jan 02 15:21:08 2018 +0530 +++ b/source/common/dct.cpp Fri Dec 29 09:52:27 2017 +0530 @@ -1001,9 +1001,34 @@ blkPos += trSize; } } +template<int log2TrSize> +static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos) +{ + const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ + const int scaleBits = SCALE_BITS - 2 * transformShift; + const uint32_t trSize = 1 << log2TrSize; + int max = X265_MAX(0, (2 * transformShift + 1)); + + for (int y = 0; y < MLS_CG_SIZE; y++) + { + for (int x = 0; x < MLS_CG_SIZE; x++) + { + int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ + int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ + + costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits)); + + /* when no residual coefficient is coded, predicted coef == recon coef */ + costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max)); + + *totalUncodedCost += costUncoded[blkPos + x]; + *totalRdCost += costUncoded[blkPos + x]; + } + blkPos += trSize; + } +} namespace X265_NS { // x265 private namespace - void setupDCTPrimitives_c(EncoderPrimitives& p) { p.dequant_scaling = dequant_scaling_c; @@ -1014,6 +1039,10 @@ p.cu[BLOCK_8x8].nonPsyRdoQuant = nonPsyRdoQuant_c<3>; p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>; p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>; + p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_c<2>; + p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_c<3>; + p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_c<4>; + 
p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_c<5>; p.dst4x4 = dst4_c; p.cu[BLOCK_4x4].dct = dct4_c; p.cu[BLOCK_8x8].dct = dct8_c; diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/primitives.h --- a/source/common/primitives.h Tue Jan 02 15:21:08 2018 +0530 +++ b/source/common/primitives.h Fri Dec 29 09:52:27 2017 +0530 @@ -224,6 +224,8 @@ typedef void (*integralv_t)(uint32_t *sum, intptr_t stride); typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride); typedef void(*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos); +typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos); + /* Function pointers to optimized encoder primitives. Each pointer can reference * either an assembly routine, a SIMD intrinsic primitive, or a C function */ struct EncoderPrimitives @@ -297,6 +299,7 @@ intra_filter_t intra_filter; intra_pred_t intra_pred[NUM_INTRA_MODE]; nonPsyRdoQuant_t nonPsyRdoQuant; + psyRdoQuant_t psyRdoQuant; } cu[NUM_CU_SIZES]; /* These remaining primitives work on either fixed block sizes or take diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/quant.cpp --- a/source/common/quant.cpp Tue Jan 02 15:21:08 2018 +0530 +++ b/source/common/quant.cpp Fri Dec 29 09:52:27 2017 +0530 @@ -642,11 +642,9 @@ X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n"); if (!numSig) return 0; - const uint32_t trSize = 1 << log2TrSize; int64_t lambda2 = m_qpParam[ttype].lambda2; - const int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda); - + int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda); /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4) * scale applied that must be removed during unquant. 
Note that in real dequant there is clipping * at several stages. We skip the clipping for simplicity when measuring RD cost */ @@ -723,25 +721,9 @@ for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) { X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); - uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); uint32_t blkPos = codeParams.scan[scanPosBase]; - - // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA - for (int y = 0; y < MLS_CG_SIZE; y++) - { - for (int x = 0; x < MLS_CG_SIZE; x++) - { - int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ - int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ - costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits)); - /* when no residual coefficient is coded, predicted coef == recon coef */ - costUncoded[blkPos + x] -= PSYVALUE(predictedCoef); - totalUncodedCost += costUncoded[blkPos + x]; - totalRdCost += costUncoded[blkPos + x]; - } - blkPos += trSize; - } + primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); } } else @@ -814,22 +796,14 @@ // TODO: does we need zero-coeff cost? 
const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); uint32_t blkPos = codeParams.scan[scanPosBase]; - if (usePsyMask) { - // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA + primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); + blkPos = codeParams.scan[scanPosBase]; for (int y = 0; y < MLS_CG_SIZE; y++) { for (int x = 0; x < MLS_CG_SIZE; x++) { - int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ - int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ - costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits)); - /* when no residual coefficient is coded, predicted coef == recon coef */ - costUncoded[blkPos + x] -= PSYVALUE(predictedCoef); - totalUncodedCost += costUncoded[blkPos + x]; - totalRdCost += costUncoded[blkPos + x]; - const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; X265_CHECK(trSize > 4, "trSize check failure\n"); diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jan 02 15:21:08 2018 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Dec 29 09:52:27 2017 +0530 @@ -3120,7 +3120,10 @@ p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512); p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512); p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512); - + p.cu[BLOCK_4x4].psyRdoQuant = PFX(psyRdoQuant4_avx512); + p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512); + p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512); + p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512); } #endif } @@ -5302,10 +5305,16 @@ p.pu[LUMA_16x32].luma_hvpp = 
interp_8tap_hv_pp_cpu<LUMA_16x32>; p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>; p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>; + p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx512); p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512); p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512); p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512); + p.cu[BLOCK_4x4].psyRdoQuant = PFX(psyRdoQuant4_avx512); + p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512); + p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512); + p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512); + } #endif } diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asm Tue Jan 02 15:21:08 2018 +0530 +++ b/source/common/x86/dct8.asm Fri Dec 29 09:52:27 2017 +0530 @@ -516,6 +516,7 @@ tab_nonpsyRdo8 : dq 5, 7, 9, 11 tab_nonpsyRdo10: dq 9, 11, 13, 15 tab_nonpsyRdo12: dq 13, 15, 17, 19 + SECTION .text cextern pd_1 cextern pd_2 @@ -542,6 +543,10 @@ %define DST4_ROUND 16 %define DCT8_SHIFT1 6 %define DCT8_ROUND1 32 + %define RDO_MAX_4 3 + %define RDO_MAX_8 1 + %define RDO_MAX_16 0 + %define RDO_MAX_32 0 %elif BIT_DEPTH == 10 %define DCT4_SHIFT 3 %define DCT4_ROUND 4 @@ -551,6 +556,10 @@ %define DST4_ROUND 4 %define DCT8_SHIFT1 4 %define DCT8_ROUND1 8 + %define RDO_MAX_4 7 + %define RDO_MAX_8 5 + %define RDO_MAX_16 3 + %define RDO_MAX_32 1 %elif BIT_DEPTH == 8 %define DCT4_SHIFT 1 %define DCT4_ROUND 1 @@ -560,6 +569,10 @@ %define DST4_ROUND 1 %define DCT8_SHIFT1 2 %define DCT8_ROUND1 2 + %define RDO_MAX_4 11 + %define RDO_MAX_8 9 + %define RDO_MAX_16 7 + %define RDO_MAX_32 5 %else %error Unsupported BIT_DEPTH! 
%endif @@ -6650,5 +6663,391 @@ movq [r2], xm6 movq [r3], xm7 RET - +;static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos) +;{ +; const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ +; const int scaleBits = SCALE_BITS - 2 * transformShift; +; const uint32_t trSize = 1 << log2TrSize; +; int max = X265_MAX(0, (2 * transformShift + 1)); +; +; for (int y = 0; y < MLS_CG_SIZE; y++) +; { +; for (int x = 0; x < MLS_CG_SIZE; x++) +; { +; int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ +; int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ +; +; costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits)); +; +; /* when no residual coefficient is coded, predicted coef == recon coef */ +; costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max)); +; +; *totalUncodedCost += costUncoded[blkPos + x]; +; *totalRdCost += costUncoded[blkPos + x]; +; } +; blkPos += trSize; +; } +;} + +;--------------------------------------------------------------------------------------------------------------------------------------------------------- +; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos) +;--------------------------------------------------------------------------------------------------------------------------------------------------------- +INIT_ZMM avx512 +cglobal psyRdoQuant4, 5, 9, 13 + mov r5, r5m + mov r6d, r6m + vpbroadcastq m12, [r5] ; psyScale + lea r0, [r0 + 2 * r6] + lea r1, [r1 + 2 * r6] + lea r6, [4 * r6] + lea r2, [r2 + 2 * r6] + movq xm0, [r3] + movq xm1, [r4] + +%if BIT_DEPTH == 12 + mov r5, 
[tab_nonpsyRdo12] ; scaleBits +%elif BIT_DEPTH == 10 + mov r5, [tab_nonpsyRdo10] +%elif BIT_DEPTH == 8 + mov r5, [tab_nonpsyRdo8] +%else + %error Unsupported BIT_DEPTH! %endif + + movq xm2, r5 + vpxor m4, m4 + vpxor m3, m3 + +;Row 1, 2 + vpmovsxwq m6, [r0] + vpmovsxwq m7, [r1] + psubq m7, m6 ; predictedCoef + + vcvtqq2pd m9, m6 + vfmadd213pd m9, m9, m3 + vcvtpd2qq m8, m9 + vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits + + vcvtqq2pd m10, m7 + vcvtqq2pd m11, m12 + vfmadd213pd m10, m11, m3 + vcvtpd2qq m9, m10 + vpsraq m9, RDO_MAX_4 ;(psyScale * predictedCoef) >> max + + psubq m8, m9 + paddq m4, m8 + movu [r2], m8 + + ;Row 3, 4 + vpmovsxwq m6, [r0 + 16] + vpmovsxwq m7, [r1 + 16] + psubq m7, m6 ; predictedCoef + + vcvtqq2pd m9, m6 + vfmadd213pd m9, m9, m3 + vcvtpd2qq m8, m9 + vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits + + vcvtqq2pd m10, m7 + vcvtqq2pd m11, m12 + vfmadd213pd m10, m11, m3 + vcvtpd2qq m9, m10 + vpsraq m9, RDO_MAX_4 ;(psyScale * predictedCoef) >> max + + psubq m8, m9 + paddq m4, m8 + movu [r2 + 64], m8 + + vextracti32x8 ym2, m4, 1 + paddq ym4, ym2 + vextracti32x4 xm2, m4, 1 + paddq xm4, xm2 + punpckhqdq xm2, xm4, xm3 + paddq xm4, xm2 + + paddq xm0, xm4 + paddq xm1, xm4 + + movq [r3], xm0 + movq [r4], xm1 + RET + +;--------------------------------------------------------------------------------------------------------------------------------------------------------- +; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos) +;--------------------------------------------------------------------------------------------------------------------------------------------------------- +INIT_ZMM avx512 +cglobal psyRdoQuant8, 5, 9, 15 + mov r5, r5m + mov r6d, r6m + vpbroadcastq m12, [r5] ; psyScale + lea r0, [r0 + 2 * r6] + lea r1, [r1 + 2 * r6] + lea r6, [4 * r6] + lea r2, [r2 + 2 * r6] + movq xm0, [r3] + movq xm1, [r4] + +%if BIT_DEPTH 
== 12 + mov r5, [tab_nonpsyRdo12 + 8] ; scaleBits +%elif BIT_DEPTH == 10 + mov r5, [tab_nonpsyRdo10 + 8] +%elif BIT_DEPTH == 8 + mov r5, [tab_nonpsyRdo8 + 8] +%else + %error Unsupported BIT_DEPTH! +%endif + + movq xm2, r5 + vpxor m4, m4 + vpxor m3, m3 + +;Row 1, 2 + movq xm13, [r0] + movq xm14, [r1] + pinsrq xm13, [r0 + mmsize/4], 1 + pinsrq xm14, [r1 + mmsize/4], 1 + vpmovsxwq m6, xm13 + vpmovsxwq m7, xm14 + psubq m7, m6 ; predictedCoef + + vcvtqq2pd m9, m6 + vfmadd213pd m9, m9, m3 + vcvtpd2qq m8, m9 + vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits + + vcvtqq2pd m10, m7 + vcvtqq2pd m11, m12 + vfmadd213pd m10, m11, m3 + vcvtpd2qq m9, m10 + vpsraq m9, RDO_MAX_8 ;(psyScale * predictedCoef) >> max + + psubq m8, m9 + paddq m4, m8 + movu [r2], ym8 + vextracti32x8 [r2 + mmsize], m8 , 1 + + ;Row 3, 4 + movq xm13, [r0 + mmsize/2] + movq xm14, [r1 + mmsize/2] + pinsrq xm13, [r0 + 3 * mmsize/4], 1 + pinsrq xm14, [r1 + 3 * mmsize/4], 1 + vpmovsxwq m6, xm13 + vpmovsxwq m7, xm14 + psubq m7, m6 ; predictedCoef + + vcvtqq2pd m9, m6 + vfmadd213pd m9, m9, m3 + vcvtpd2qq m8, m9 + vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits + + vcvtqq2pd m10, m7 + vcvtqq2pd m11, m12 + vfmadd213pd m10, m11, m3 + vcvtpd2qq m9, m10 + vpsraq m9, RDO_MAX_8 ;(psyScale * predictedCoef) >> max + + psubq m8, m9 + paddq m4, m8 + movu [r2 + 2 * mmsize], ym8 + vextracti32x8 [r2 + 3 * mmsize], m8 , 1 + + vextracti32x8 ym2, m4, 1 + paddq ym4, ym2 + vextracti32x4 xm2, m4, 1 + paddq xm4, xm2 + punpckhqdq xm2, xm4, xm3 + paddq xm4, xm2 + + paddq xm0, xm4 + paddq xm1, xm4 + + movq [r3], xm0 + movq [r4], xm1 + RET + +;--------------------------------------------------------------------------------------------------------------------------------------------------------- +; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos) 
+;--------------------------------------------------------------------------------------------------------------------------------------------------------- +INIT_ZMM avx512 +cglobal psyRdoQuant16, 5, 9, 15 + mov r5, r5m + mov r6d, r6m + vpbroadcastq m12, [r5] ; psyScale + lea r0, [r0 + 2 * r6] + lea r1, [r1 + 2 * r6] + lea r6, [4 * r6] + lea r2, [r2 + 2 * r6] + movq xm0, [r3] + movq xm1, [r4] + +%if BIT_DEPTH == 12 + mov r5, [tab_nonpsyRdo12 + 16] ; scaleBits +%elif BIT_DEPTH == 10 + mov r5, [tab_nonpsyRdo10 + 16] +%elif BIT_DEPTH == 8 + mov r5, [tab_nonpsyRdo8 + 16] +%else + %error Unsupported BIT_DEPTH! +%endif + + movq xm2, r5 + vpxor m4, m4 + vpxor m3, m3 + +;Row 1, 2 + movq xm13, [r0] + movq xm14, [r1] + pinsrq xm13, [r0 + mmsize/2], 1 + pinsrq xm14, [r1 + mmsize/2], 1 + vpmovsxwq m6, xm13 + vpmovsxwq m7, xm14 + psubq m7, m6 ; predictedCoef + + vcvtqq2pd m9, m6 + vfmadd213pd m9, m9, m3 + vcvtpd2qq m8, m9 + vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits + + vcvtqq2pd m10, m7 + vcvtqq2pd m11, m12 + vfmadd213pd m10, m11, m3 + vcvtpd2qq m9, m10 + vpsraq m9, RDO_MAX_16 ;(psyScale * predictedCoef) >> max + + psubq m8, m9 + paddq m4, m8 + movu [r2], ym8 + vextracti32x8 [r2 + 2 * mmsize], m8 , 1 + + ;Row 3, 4 + movq xm13, [r0 + mmsize] + movq xm14, [r1 + mmsize] + pinsrq xm13, [r0 + 3 * mmsize/2], 1 + pinsrq xm14, [r1 + 3 * mmsize/2], 1 + vpmovsxwq m6, xm13 + vpmovsxwq m7, xm14 + psubq m7, m6 ; predictedCoef + + vcvtqq2pd m9, m6 + vfmadd213pd m9, m9, m3 + vcvtpd2qq m8, m9 + vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits + + vcvtqq2pd m10, m7 + vcvtqq2pd m11, m12 + vfmadd213pd m10, m11, m3 + vcvtpd2qq m9, m10 + vpsraq m9, RDO_MAX_16 ;(psyScale * predictedCoef) >> max + + psubq m8, m9 + paddq m4, m8 + movu [r2 + 4 * mmsize], ym8 + vextracti32x8 [r2 + 6 * mmsize], m8 , 1 + + vextracti32x8 ym2, m4, 1 + paddq ym4, ym2 + vextracti32x4 xm2, m4, 1 + paddq xm4, xm2 + punpckhqdq xm2, xm4, xm3 + paddq xm4, xm2 + + paddq xm0, xm4 + paddq xm1, xm4 + + movq [r3], xm0 + movq 
[r4], xm1 + RET + +;--------------------------------------------------------------------------------------------------------------------------------------------------------- +; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos) +;--------------------------------------------------------------------------------------------------------------------------------------------------------- +INIT_ZMM avx512 +cglobal psyRdoQuant32, 5, 9, 15 + mov r5, r5m + mov r6d, r6m + vpbroadcastq m12, [r5] ; psyScale + lea r0, [r0 + 2 * r6] + lea r1, [r1 + 2 * r6] + lea r6, [4 * r6] + lea r2, [r2 + 2 * r6] + movq xm0, [r3] + movq xm1, [r4] + +%if BIT_DEPTH == 12 + mov r5, [tab_nonpsyRdo12 + 24] ; scaleBits +%elif BIT_DEPTH == 10 + mov r5, [tab_nonpsyRdo10 + 24] +%elif BIT_DEPTH == 8 + mov r5, [tab_nonpsyRdo8 + 24] +%else + %error Unsupported BIT_DEPTH! +%endif + + movq xm2, r5 + vpxor m4, m4 + vpxor m3, m3 + +;Row 1, 2 + movq xm13, [r0] + movq xm14, [r1] + pinsrq xm13, [r0 + mmsize], 1 + pinsrq xm14, [r1 + mmsize], 1 + vpmovsxwq m6, xm13 + vpmovsxwq m7, xm14 + psubq m7, m6 ; predictedCoef + + vcvtqq2pd m9, m6 + vfmadd213pd m9, m9, m3 + vcvtpd2qq m8, m9 + vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits + + vcvtqq2pd m10, m7 + vcvtqq2pd m11, m12 + vfmadd213pd m10, m11, m3 + vcvtpd2qq m9, m10 + vpsraq m9, RDO_MAX_32 ;(psyScale * predictedCoef) >> max + + psubq m8, m9 + paddq m4, m8 + movu [r2], ym8 + vextracti32x8 [r2 + 4 * mmsize], m8 , 1 + + ;Row 3, 4 + movq xm13, [r0 + 2 * mmsize] + movq xm14, [r1 + 2 * mmsize] + pinsrq xm13, [r0 + 3 * mmsize], 1 + pinsrq xm14, [r1 + 3 * mmsize], 1 + vpmovsxwq m6, xm13 + vpmovsxwq m7, xm14 + psubq m7, m6 ; predictedCoef + + vcvtqq2pd m9, m6 + vfmadd213pd m9, m9, m3 + vcvtpd2qq m8, m9 + vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits + + vcvtqq2pd m10, m7 + vcvtqq2pd m11, m12 + vfmadd213pd m10, m11, m3 + vcvtpd2qq m9, m10 + 
vpsraq m9, RDO_MAX_32 ;(psyScale * predictedCoef) >> max + + psubq m8, m9 + paddq m4, m8 + movu [r2 + 8 * mmsize], ym8 + vextracti32x8 [r2 + 12 * mmsize], m8 , 1 + + vextracti32x8 ym2, m4, 1 + paddq ym4, ym2 + vextracti32x4 xm2, m4, 1 + paddq xm4, xm2 + punpckhqdq xm2, xm4, xm3 + paddq xm4, xm2 + + paddq xm0, xm4 + paddq xm1, xm4 + + movq [r3], xm0 + movq [r4], xm1 + RET +%endif diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/x86/dct8.h --- a/source/common/x86/dct8.h Tue Jan 02 15:21:08 2018 +0530 +++ b/source/common/x86/dct8.h Fri Dec 29 09:52:27 2017 +0530 @@ -35,6 +35,8 @@ FUNCDEF_TU_S2(void, idct, sse4, const int16_t* src, int16_t* dst, intptr_t dstStride); FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride); FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx512, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos); +FUNCDEF_TU_S2(void, psyRdoQuant, avx512, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos); + void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride); void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride); void PFX(idst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride); diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/test/mbdstharness.cpp --- a/source/test/mbdstharness.cpp Tue Jan 02 15:21:08 2018 +0530 +++ b/source/test/mbdstharness.cpp Fri Dec 29 09:52:27 2017 +0530 @@ -61,16 +61,17 @@ for (int i = 0; i < TEST_BUF_SIZE; i++) { short_test_buff[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX); + short_test_buff1[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX); int_test_buff[0][i] = rand() % PIXEL_MAX; int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX; short_denoise_test_buff1[0][i] = short_denoise_test_buff2[0][i] = (rand() & SHORT_MAX) - (rand() & SHORT_MAX); - short_test_buff[1][i] 
= -PIXEL_MAX; + short_test_buff1[1][i] = -PIXEL_MAX; int_test_buff[1][i] = -PIXEL_MAX; int_idct_test_buff[1][i] = SHORT_MIN; short_denoise_test_buff1[1][i] = short_denoise_test_buff2[1][i] = -SHORT_MAX; - short_test_buff[2][i] = PIXEL_MAX; + short_test_buff1[2][i] = PIXEL_MAX; int_test_buff[2][i] = PIXEL_MAX; int_idct_test_buff[2][i] = SHORT_MAX; short_denoise_test_buff1[2][i] = short_denoise_test_buff2[2][i] = SHORT_MAX; @@ -324,6 +325,51 @@ return true; } +bool MBDstHarness::check_psyRdoQuant_primitive(psyRdoQuant_t ref, psyRdoQuant_t opt) +{ + int j = 0; + int trSize[4] = { 16, 64, 256, 1024 }; + + ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]); + ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]); + + for (int i = 0; i < ITERS; i++) + { + int64_t totalRdCostRef = rand(); + int64_t totalUncodedCostRef = rand(); + int64_t totalRdCostOpt = totalRdCostRef; + int64_t totalUncodedCostOpt = totalUncodedCostRef; + int64_t *psyScale = X265_MALLOC(int64_t, 1); + *psyScale = rand(); + + int index = rand() % 4; + uint32_t blkPos = trSize[index]; + int cmp_size = 4 * MAX_TU_SIZE; + + memset(ref_dest, 0, MAX_TU_SIZE * sizeof(int64_t)); + memset(opt_dest, 0, MAX_TU_SIZE * sizeof(int64_t)); + + int index1 = rand() % TEST_CASES; + + ref(short_test_buff[index1] + j, short_test_buff1[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, psyScale, blkPos); + checked(opt, short_test_buff[index1] + j, short_test_buff1[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, psyScale, blkPos); + + X265_FREE(psyScale); + if (memcmp(ref_dest, opt_dest, cmp_size)) + return false; + + if (totalUncodedCostRef != totalUncodedCostOpt) + return false; + + if (totalRdCostRef != totalRdCostOpt) + return false; + + reportfail(); + j += INCR; + } + + return true; +} bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt) { @@ -473,6 +519,17 @@ } } } + for (int i = 0; i < NUM_TR_SIZE; i++) + { + if (opt.cu[i].psyRdoQuant) + { + if 
(!check_psyRdoQuant_primitive(ref.cu[i].psyRdoQuant, opt.cu[i].psyRdoQuant)) + { + printf("psyRdoQuant[%dx%d]: Failed!\n", 4 << i, 4 << i); + return false; + } + } + } for (int i = 0; i < NUM_TR_SIZE; i++) { @@ -573,6 +630,19 @@ REPORT_SPEEDUP(opt.cu[value].nonPsyRdoQuant, ref.cu[value].nonPsyRdoQuant, short_test_buff[0], opt_dest, &totalUncodedCost, &totalRdCost, 0); } } + for (int value = 0; value < NUM_TR_SIZE; value++) + { + if (opt.cu[value].psyRdoQuant) + { + ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]); + int64_t totalRdCost = 0; + int64_t totalUncodedCost = 0; + int64_t *psyScale = X265_MALLOC(int64_t, 1); + *psyScale = 0; + printf("psyRdoQuant[%dx%d]", 4 << value, 4 << value); + REPORT_SPEEDUP(opt.cu[value].psyRdoQuant, ref.cu[value].psyRdoQuant, short_test_buff[0], short_test_buff1[0], opt_dest, &totalUncodedCost, &totalRdCost, psyScale, 0); + } + } for (int value = 0; value < NUM_TR_SIZE; value++) { diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/test/mbdstharness.h --- a/source/test/mbdstharness.h Tue Jan 02 15:21:08 2018 +0530 +++ b/source/test/mbdstharness.h Fri Dec 29 09:52:27 2017 +0530 @@ -51,11 +51,10 @@ int mintbuf2[MAX_TU_SIZE]; int mintbuf3[MAX_TU_SIZE]; int mintbuf4[MAX_TU_SIZE]; - int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE]; + int16_t short_test_buff1[TEST_CASES][TEST_BUF_SIZE]; int int_test_buff[TEST_CASES][TEST_BUF_SIZE]; int int_idct_test_buff[TEST_CASES][TEST_BUF_SIZE]; - uint32_t mubuf1[MAX_TU_SIZE]; uint32_t mubuf2[MAX_TU_SIZE]; uint16_t mushortbuf1[MAX_TU_SIZE]; @@ -65,6 +64,7 @@ bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt); bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt); bool check_nonPsyRdoQuant_primitive(nonPsyRdoQuant_t ref, nonPsyRdoQuant_t opt); + bool check_psyRdoQuant_primitive(psyRdoQuant_t ref, psyRdoQuant_t opt); bool check_quant_primitive(quant_t ref, quant_t opt); bool check_nquant_primitive(nquant_t ref, nquant_t opt); bool check_dct_primitive(dct_t ref, dct_t 
opt, intptr_t width); _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel