# HG changeset patch
# User Jayashree <jayashre...@multicorewareinc.com>
# Date 1520395477 -19800
#      Wed Mar 07 09:34:37 2018 +0530
# Node ID c9f622347ce51cf90b593e8500ee5a40888c6f29
# Parent  f377b028f4a91715372a6241fc80e78a672dbd06
x86: AVX2 nonPsyRdoQuant primitive for all sizes
diff -r f377b028f4a9 -r c9f622347ce5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Jul 05 17:58:06 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Mar 07 09:34:37 2018 +0530
@@ -2310,6 +2310,10 @@
         p.integral_inith[INTEGRAL_8] = PFX(integral8h_avx2);
         p.integral_inith[INTEGRAL_12] = PFX(integral12h_avx2);
         p.integral_inith[INTEGRAL_16] = PFX(integral16h_avx2);
+        p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx2);
+        p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx2);
+        p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx2);
+        p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx2);
         /* TODO: This kernel needs to be modified to work with HIGH_BIT_DEPTH only
         p.planeClipAndMax = PFX(planeClipAndMax_avx2); */
 
@@ -4689,6 +4693,10 @@
         p.integral_inith[INTEGRAL_16] = PFX(integral16h_avx2);
         p.integral_inith[INTEGRAL_24] = PFX(integral24h_avx2);
         p.integral_inith[INTEGRAL_32] = PFX(integral32h_avx2);
+        p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx2);
+        p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx2);
+        p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx2);
+        p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx2);
     }
 
     if (cpuMask & X265_CPU_AVX512)
diff -r f377b028f4a9 -r c9f622347ce5 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Thu Jul 05 17:58:06 2018 +0530
+++ b/source/common/x86/dct8.asm	Wed Mar 07 09:34:37 2018 +0530
@@ -7058,4 +7058,299 @@
     movq            [r3], xm0
     movq            [r4], xm1
     RET
+
+INIT_YMM avx2
+cglobal nonPsyRdoQuant4, 5, 9, 16
+    mov             r4d, r4m
+    lea             r0, [r0 + 2 * r4]
+    lea             r4, [4 * r4]
+    lea             r1, [r1 + 2 * r4]
+    movq            xm0, [r2]
+    movq            xm1, [r3]
+
+%if BIT_DEPTH == 12
+    mov             r5, [tab_nonpsyRdo12]        ; scaleBits
+%elif BIT_DEPTH == 10
+    mov             r5, [tab_nonpsyRdo10]
+%elif BIT_DEPTH == 8
+    mov             r5, [tab_nonpsyRdo8]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+    movq            xm2, r5
+    vpxor           m4, m4
+    vpxor           m3, m3
+    vpxor           m13, m13
+
+    vpmovsxwd       m6, [r0]
+    vcvtdq2pd       m9, xm6                      ; packed 32-bit int to double
+    vfmadd213pd     m9, m9, m3                   ; square the coefficients (FMA with a zero addend)
+    vcvtpd2dq       xm8, m9
+    vpmovsxdq       m13, xm8                     ; 32-bit int to 64-bit int
+    vpsllq          m13, xm2                     ; (signCoef * signCoef) << scaleBits
+    paddq           m4, m13
+    movu            [r1], m13
+
+    vpmovsxwd       m6, [r0 + 8]
+    vcvtdq2pd       m9, xm6
+    vfmadd213pd     m9, m9, m3
+    vcvtpd2dq       xm8, m9
+    vpmovsxdq       m13, xm8                     ; 32-bit int to 64-bit int
+    vpsllq          m13, xm2                     ; (signCoef * signCoef) << scaleBits
+    paddq           m4, m13
+    movu            [r1 + 32], m13
+
+    vpmovsxwd       m6, [r0 + 16]
+    vcvtdq2pd       m9, xm6
+    vfmadd213pd     m9, m9, m3
+    vcvtpd2dq       xm8, m9
+    vpmovsxdq       m13, xm8                     ; 32-bit int to 64-bit int
+    vpsllq          m13, xm2                     ; (signCoef * signCoef) << scaleBits
+    paddq           m4, m13
+    movu            [r1 + 64], m13
+
+    vpmovsxwd       m6, [r0 + 24]
+    vcvtdq2pd       m9, xm6
+    vfmadd213pd     m9, m9, m3
+    vcvtpd2dq       xm8, m9
+    vpmovsxdq       m13, xm8                     ; 32-bit int to 64-bit int
+    vpsllq          m13, xm2                     ; (signCoef * signCoef) << scaleBits
+    paddq           m4, m13
+    movu            [r1 + 96], m13
+
+    vextracti128    xm2, m4, 1
+    paddq           xm4, xm2
+    punpckhqdq      xm2, xm4, xm3
+    paddq           xm4, xm2
+
+    paddq           xm0, xm4
+    paddq           xm1, xm4
+
+    movq            [r2], xm0
+    movq            [r3], xm1
+    RET
+
+INIT_YMM avx2
+cglobal nonPsyRdoQuant8, 5, 5, 8
+    mov             r4d, r4m
+    lea             r0, [r0 + 2 * r4]
+    lea             r4, [4 * r4]
+    lea             r1, [r1 + 2 * r4]
+%if BIT_DEPTH == 12
+    mov             r4, [tab_nonpsyRdo12 + 8]
+%elif BIT_DEPTH == 10
+    mov             r4, [tab_nonpsyRdo10 + 8]
+%elif BIT_DEPTH == 8
+    mov             r4, [tab_nonpsyRdo8 + 8]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+    movq            xm3, r4
+    movq            xm6, [r2]
+    movq            xm7, [r3]
+    vpxor           m4, m4
+    vpxor           m5, m5
+
+    movq            xm0, [r0]
+    vpmovsxwd       m1, xm0
+    vcvtdq2pd       m2, xm1                      ; packed 32-bit int to double
+    vfmadd213pd     m2, m2, m5                   ; square the coefficients (FMA with a zero addend)
+    vcvtpd2dq       xm1, m2
+    vpmovsxdq       m0, xm1
+    vpsllq          m0, xm3                      ; costUncoded
+    paddq           m4, m0
+    movu            [r1], ym0
+    vpxor           m0, m0
+
+    movq            xm0, [r0 + mmsize/2]
+    vpmovsxwd       m1, xm0
+    vcvtdq2pd       m2, xm1
+    vfmadd213pd     m2, m2, m5
+    vcvtpd2dq       xm1, m2
+    vpmovsxdq       m0, xm1
+    vpsllq          m0, xm3                      ; costUncoded
+    paddq           m4, m0
+    movu            [r1 + 2*mmsize], m0
+    vpxor           m0, m0
+
+    movq            xm0, [r0 + mmsize]
+    vpmovsxwd       m1, xm0
+    vcvtdq2pd       m2, xm1
+    vfmadd213pd     m2, m2, m5
+    vcvtpd2dq       xm1, m2
+    vpmovsxdq       m0, xm1
+    vpsllq          m0, xm3                      ; costUncoded
+    paddq           m4, m0
+    movu            [r1 + 4*mmsize], m0
+    vpxor           m0, m0
+
+    movq            xm0, [r0 + 3*mmsize/2]
+    vpmovsxwd       m1, xm0
+    vcvtdq2pd       m2, xm1
+    vfmadd213pd     m2, m2, m5
+    vcvtpd2dq       xm1, m2
+    vpmovsxdq       m0, xm1
+    vpsllq          m0, xm3                      ; costUncoded
+    paddq           m4, m0
+    movu            [r1 + 6*mmsize], m0
+
+    vextracti128    xm2, m4, 1
+    paddq           xm4, xm2
+    punpckhqdq      xm2, xm4, xm5
+    paddq           xm4, xm2
+
+    paddq           xm6, xm4
+    paddq           xm7, xm4
+
+    movq            [r2], xm6
+    movq            [r3], xm7
+    RET
+
+INIT_YMM avx2
+cglobal nonPsyRdoQuant16, 5, 5, 8
+    mov             r4d, r4m
+    lea             r0, [r0 + 2 * r4]
+    lea             r4, [4 * r4]
+    lea             r1, [r1 + 2 * r4]
+%if BIT_DEPTH == 12
+    mov             r4, [tab_nonpsyRdo12 + 16]
+%elif BIT_DEPTH == 10
+    mov             r4, [tab_nonpsyRdo10 + 16]
+%elif BIT_DEPTH == 8
+    mov             r4, [tab_nonpsyRdo8 + 16]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+    movq            xm3, r4
+    movq            xm6, [r2]
+    movq            xm7, [r3]
+    vpxor           m4, m4
+    vpxor           m5, m5
+
+; one 4x4 coefficient group: four loads, trSize apart
+    movq            xm0, [r0]
+    vpmovsxwd       m1, xm0
+    vcvtdq2pd       m2, xm1                      ; packed 32-bit int to double
+    vfmadd213pd     m2, m2, m5                   ; square the coefficients (FMA with a zero addend)
+    vcvtpd2dq       xm1, m2
+    vpmovsxdq       m0, xm1
+    vpsllq          m0, xm3                      ; costUncoded
+    paddq           m4, m0
+    movu            [r1], ym0
+
+    movq            xm0, [r0 + mmsize]
+    vpmovsxwd       m1, xm0
+    vcvtdq2pd       m2, xm1
+    vfmadd213pd     m2, m2, m5
+    vcvtpd2dq       xm1, m2
+    vpmovsxdq       m0, xm1
+    vpsllq          m0, xm3                      ; costUncoded
+    paddq           m4, m0
+    movu            [r1 + 4*mmsize], ym0
+
+    movq            xm0, [r0 + 2*mmsize]
+    vpmovsxwd       m1, xm0
+    vcvtdq2pd       m2, xm1
+    vfmadd213pd     m2, m2, m5
+    vcvtpd2dq       xm1, m2
+    vpmovsxdq       m0, xm1
+    vpsllq          m0, xm3                      ; costUncoded
+    paddq           m4, m0
+    movu            [r1 + 8*mmsize], ym0
+
+    movq            xm0, [r0 + 3*mmsize]
+    vpmovsxwd       m1, xm0
+    vcvtdq2pd       m2, xm1
+    vfmadd213pd     m2, m2, m5
+    vcvtpd2dq       xm1, m2
+    vpmovsxdq       m0, xm1
+    vpsllq          m0, xm3                      ; costUncoded
+    paddq           m4, m0
+    movu            [r1 + 12*mmsize], ym0
+
+    vextracti128    xm2, m4, 1
+    paddq           xm4, xm2
+    punpckhqdq      xm2, xm4, xm5
+    paddq           xm4, xm2
+
+    paddq           xm6, xm4
+    paddq           xm7, xm4
+
+    movq            [r2], xm6
+    movq            [r3], xm7
+    RET
+
+INIT_YMM avx2
+cglobal nonPsyRdoQuant32, 5, 5, 8
+    mov             r4d, r4m
+    lea             r0, [r0 + 2 * r4]
+    lea             r4, [4 * r4]
+    lea             r1, [r1 + 2 * r4]
+%if BIT_DEPTH == 12
+    mov             r4, [tab_nonpsyRdo12 + 24]
+%elif BIT_DEPTH == 10
+    mov             r4, [tab_nonpsyRdo10 + 24]
+%elif BIT_DEPTH == 8
+    mov             r4, [tab_nonpsyRdo8 + 24]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+    movq            xm3, r4
+    movq            xm6, [r2]
+    movq            xm7, [r3]
+    vpxor           m4, m4
+    vpxor           m5, m5
+
+    movq            xm0, [r0]
+    vpmovsxwd       m1, xm0
+    vcvtdq2pd       m2, xm1                      ; packed 32-bit int to double
+    vfmadd213pd     m2, m2, m5                   ; square the coefficients (FMA with a zero addend)
+    vcvtpd2dq       xm1, m2
+    vpmovsxdq       m0, xm1
+    vpsllq          m0, xm3                      ; costUncoded
+    paddq           m4, m0
+    movu            [r1], m0
+    vpxor           m0, m0
+
+    movq            xm0, [r0 + 2*mmsize]
+    vpmovsxwd       m1, xm0
+    vcvtdq2pd       m2, xm1
+    vfmadd213pd     m2, m2, m5
+    vcvtpd2dq       xm1, m2
+    vpmovsxdq       m0, xm1
+    vpsllq          m0, xm3                      ; costUncoded
+    paddq           m4, m0
+    movu            [r1 + 8*mmsize], m0
+    vpxor           m0, m0
+
+    movq            xm0, [r0 + 4*mmsize]
+    vpmovsxwd       m1, xm0
+    vcvtdq2pd       m2, xm1
+    vfmadd213pd     m2, m2, m5
+    vcvtpd2dq       xm1, m2
+    vpmovsxdq       m0, xm1
+    vpsllq          m0, xm3                      ; costUncoded
+    paddq           m4, m0
+    movu            [r1 + 16*mmsize], m0
+    vpxor           m0, m0
+
+    movq            xm0, [r0 + 6*mmsize]
+    vpmovsxwd       m1, xm0
+    vcvtdq2pd       m2, xm1
+    vfmadd213pd     m2, m2, m5
+    vcvtpd2dq       xm1, m2
+    vpmovsxdq       m0, xm1
+    vpsllq          m0, xm3                      ; costUncoded
+    paddq           m4, m0
+    movu            [r1 + 24*mmsize], m0
+
+    vextracti128    xm2, m4, 1
+    paddq           xm4, xm2
+    punpckhqdq      xm2, xm4, xm5
+    paddq           xm4, xm2
+
+    paddq           xm6, xm4
+    paddq           xm7, xm4
+
+    movq            [r2], xm6
+    movq            [r3], xm7
+    RET
 %endif
diff -r f377b028f4a9 -r c9f622347ce5 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Thu Jul 05 17:58:06 2018 +0530
+++ b/source/common/x86/dct8.h	Wed Mar 07 09:34:37 2018 +0530
@@ -36,6 +36,7 @@
 FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride);
 FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx512, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
 FUNCDEF_TU_S2(void, psyRdoQuant, avx512, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
+FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx2, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
 
 void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
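For reviewers cross-checking the kernels above, here is a minimal C++ sketch (not part of the patch) of the scalar behaviour they vectorize, inferred from the primitive signature in dct8.h and the comments in the assembly: each call squares the 16 coefficients of one 4x4 coefficient group, shifts by scaleBits, writes the per-coefficient costUncoded values, and adds the group sum to both *totalUncodedCost and *totalRdCost. The helper name and the explicit trSize/scaleBits parameters are assumptions for illustration; the assembly does the squaring with vfmadd213pd on doubles (zero addend) before converting back to 64-bit integers for the shift and accumulation.

#include <cstdint>

// Hedged reference sketch, not the x265 C primitive itself: the function name
// and the trSize/scaleBits parameters are assumptions for illustration.
static void nonPsyRdoQuantRef(const int16_t* resiDctCoeff, int64_t* costUncoded,
                              int64_t* totalUncodedCost, int64_t* totalRdCost,
                              uint32_t blkPos, uint32_t trSize, int scaleBits)
{
    for (uint32_t y = 0; y < 4; y++)                  // one 4x4 coefficient group
    {
        for (uint32_t x = 0; x < 4; x++)
        {
            int64_t signCoef = resiDctCoeff[blkPos + x];
            int64_t cost = (signCoef * signCoef) << scaleBits;   // costUncoded
            costUncoded[blkPos + x] = cost;
            *totalUncodedCost += cost;                // same sum feeds both totals
            *totalRdCost += cost;
        }
        blkPos += trSize;                             // rows of the group are trSize apart
    }
}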
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel