# HG changeset patch
# User Jayashri Murugan <jayas...@multicorewareinc.com>
# Date 1500627732 -19800
#      Fri Jul 21 14:32:12 2017 +0530
# Node ID 49123506b563fd44378e856e6833c77812d0349e
# Parent  ef8989f43083cd5195ff3ba360959fe3900399e5
x86: AVX512 getResidual32
BIT_DEPTH = 8
AVX2 performance over C code : 2.99x
AVX512 performance over C code : 5.46x

HIGH_BIT_DEPTH
AVX2 performance over C code : 3.10x
AVX512 performance over C code : 5.60x

The kernel is registered only under the X265_CPU_AVX512 check; it uses ZMM
registers and must not be installed for CPUs that lack AVX-512.

diff -r ef8989f43083 -r 49123506b563 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 04 18:02:59 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Jul 21 14:32:12 2017 +0530
@@ -3859,6 +3859,8 @@
         p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512);
         p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
 
+        p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512);
+
     }
 #endif
 }
diff -r ef8989f43083 -r 49123506b563 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Jul 04 18:02:59 2017 +0530
+++ b/source/common/x86/pixel-util8.asm	Fri Jul 21 14:32:12 2017 +0530
@@ -554,6 +554,150 @@
 %endrep
     RET
 %endif
+
+;-----------------------------------------------------------------------------
+; getResidual32 (AVX-512): [r2] = [r0] - [r1], stored as 16-bit words.
+;   r0, r1 - source rows (minuend, subtrahend)
+;   r2     - destination rows
+;   r3     - row stride; the entry points set r4 = 3 * r3
+; Each PROCESS_* macro handles four rows, one ZMM register (32 words) per row,
+; so the eight invocations in each entry point cover 32 rows. The *_END
+; variants omit the pointer advance because no further rows follow.
+;-----------------------------------------------------------------------------
+
+; HIGH_BIT_DEPTH: samples are loaded as full 64-byte rows of words.
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512 0
+    movu        m0, [r0]
+    movu        m1, [r0 + r3]
+    movu        m2, [r0 + r3 * 2]
+    movu        m3, [r0 + r4]
+    lea         r0, [r0 + r3 * 4]
+
+    movu        m4, [r1]
+    movu        m5, [r1 + r3]
+    movu        m6, [r1 + r3 * 2]
+    movu        m7, [r1 + r4]
+    lea         r1, [r1 + r3 * 4]
+
+    psubw       m0, m4
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
+
+    movu        [r2], m0
+    movu        [r2 + r3], m1
+    movu        [r2 + r3 * 2], m2
+    movu        [r2 + r4], m3
+    lea         r2, [r2 + r3 * 4]
+%endmacro
+
+; Last four HIGH_BIT_DEPTH rows; pointers are left untouched.
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END 0
+    movu        m0, [r0]
+    movu        m1, [r0 + r3]
+    movu        m2, [r0 + r3 * 2]
+    movu        m3, [r0 + r4]
+
+    movu        m4, [r1]
+    movu        m5, [r1 + r3]
+    movu        m6, [r1 + r3 * 2]
+    movu        m7, [r1 + r4]
+
+    psubw       m0, m4
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
+
+    movu        [r2], m0
+    movu        [r2 + r3], m1
+    movu        [r2 + r3 * 2], m2
+    movu        [r2 + r4], m3
+%endmacro
+
+; 8-bit: each load zero-extends 32 bytes to 32 words, so destination rows
+; are r3 * 2 bytes apart while source rows are r3 bytes apart.
+%macro PROCESS_GETRESIDUAL32_W4_AVX512 0
+    pmovzxbw    m0, [r0]
+    pmovzxbw    m1, [r0 + r3]
+    pmovzxbw    m2, [r0 + r3 * 2]
+    pmovzxbw    m3, [r0 + r4]
+    lea         r0, [r0 + r3 * 4]
+
+    pmovzxbw    m4, [r1]
+    pmovzxbw    m5, [r1 + r3]
+    pmovzxbw    m6, [r1 + r3 * 2]
+    pmovzxbw    m7, [r1 + r4]
+    lea         r1, [r1 + r3 * 4]
+
+    psubw       m0, m4
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
+
+    movu        [r2], m0
+    movu        [r2 + r3 * 2], m1
+    lea         r2, [r2 + r3 * 4]
+    movu        [r2], m2
+    movu        [r2 + r3 * 2], m3
+    lea         r2, [r2 + r3 * 4]
+%endmacro
+
+; Last four 8-bit rows; the final pointer advance is omitted.
+%macro PROCESS_GETRESIDUAL32_W4_AVX512_END 0
+    pmovzxbw    m0, [r0]
+    pmovzxbw    m1, [r0 + r3]
+    pmovzxbw    m2, [r0 + r3 * 2]
+    pmovzxbw    m3, [r0 + r4]
+
+    pmovzxbw    m4, [r1]
+    pmovzxbw    m5, [r1 + r3]
+    pmovzxbw    m6, [r1 + r3 * 2]
+    pmovzxbw    m7, [r1 + r4]
+
+    psubw       m0, m4
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
+
+    movu        [r2], m0
+    movu        [r2 + r3 * 2], m1
+    lea         r2, [r2 + r3 * 4]
+    movu        [r2], m2
+    movu        [r2 + r3 * 2], m3
+%endmacro
+
+
+%if HIGH_BIT_DEPTH
+INIT_ZMM avx512
+cglobal getResidual32, 4,5,8
+    add         r3, r3              ; double r3 (presumably sample stride -> byte stride; confirm against prototype)
+    lea         r4, [r3 * 3]        ; offset of the fourth row in each group
+
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END
+    RET
+%else
+INIT_ZMM avx512
+cglobal getResidual32, 4,5,8
+    lea         r4, [r3 * 3]        ; offset of the fourth source row in each group
+
+    PROCESS_GETRESIDUAL32_W4_AVX512
+    PROCESS_GETRESIDUAL32_W4_AVX512
+    PROCESS_GETRESIDUAL32_W4_AVX512
+    PROCESS_GETRESIDUAL32_W4_AVX512
+    PROCESS_GETRESIDUAL32_W4_AVX512
+    PROCESS_GETRESIDUAL32_W4_AVX512
+    PROCESS_GETRESIDUAL32_W4_AVX512
+    PROCESS_GETRESIDUAL32_W4_AVX512_END
+    RET
+%endif
+
 ;-----------------------------------------------------------------------------
 ; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
 ;-----------------------------------------------------------------------------
_______________________________________________
x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel