# HG changeset patch # User Gopi Satykrishna Akisetty <gopi.satykris...@multicorewareinc.com> # Date 1512721859 -19800 # Fri Dec 08 14:00:59 2017 +0530 # Node ID ab5b1becd807647d5264381c1fb74750c20fdfae # Parent 42fe321e5cdf9ad260e4e5c7a64137a8b7601915 [x265-avx512] x86: AVX512 psyCost_pp for main10 and main12
Size    | AVX2 performance | AVX512 performance
----------------------------------------------
16x16   |      13.86x      |      18.45x
32x32   |      13.48x      |      19.86x
64x64   |      13.51x      |      18.33x

diff -r 42fe321e5cdf -r ab5b1becd807 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 12 16:48:04 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Dec 08 14:00:59 2017 +0530
@@ -3080,6 +3080,9 @@
         p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>;
         p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>;
 
+        p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx512);
+        p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx512);
+        p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx512);
     }
 #endif
 }
diff -r 42fe321e5cdf -r ab5b1becd807 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Dec 12 16:48:04 2017 +0530
+++ b/source/common/x86/pixel-a.asm Fri Dec 08 14:00:59 2017 +0530
@@ -45,6 +45,8 @@
                times 2 dw 1, -1
                times 4 dw 1
                times 2 dw 1, -1
+psy_pp_shuff1:   dq 0, 1, 8, 9, 4, 5, 12, 13      ; vpermi2q indices: 128-bit lanes 0 and 2 of each of the two tables
+psy_pp_shuff2:   dq 2, 3, 10, 11, 6, 7, 14, 15    ; vpermi2q indices: 128-bit lanes 1 and 3 of each of the two tables
 
 ALIGN 32
 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
@@ -10403,6 +10405,369 @@
     pabsd          m11, m11
 %endmacro
 
+%macro PSY_COST_PP_8x8_AVX512_MAIN12 0 ; psy cost of one 8x8 block. In: r0/r1 = source ptr/stride, r2/r3 = recon ptr/stride, m13/m14 = psy_pp_shuff1/2. Out: dword 0 of xm11. Clobbers r4, r5, m0-m12, m15-m23, m26
+    ; load source and recon pixels (8 rows of 8 words each, zero-extended to dwords)
+    lea             r4, [r1 * 3]                   ; r4 = 3 * source stride
+    pmovzxwd        ym0, [r0]
+    pmovzxwd        ym1, [r0 + r1]
+    pmovzxwd        ym2, [r0 + r1 * 2]
+    pmovzxwd        ym3, [r0 + r4]
+    lea             r5, [r0 + r1 * 4]              ; r5 = source row 4
+    pmovzxwd        ym4, [r5]
+    pmovzxwd        ym5, [r5 + r1]
+    pmovzxwd        ym6, [r5 + r1 * 2]
+    pmovzxwd        ym7, [r5 + r4]
+
+    lea             r4, [r3 * 3]                   ; r4 = 3 * recon stride
+    pmovzxwd        ym16, [r2]
+    pmovzxwd        ym17, [r2 + r3]
+    pmovzxwd        ym18, [r2 + r3 * 2]
+    pmovzxwd        ym19, [r2 + r4]
+    lea             r5, [r2 + r3 * 4]              ; r5 = recon row 4
+    pmovzxwd        ym20, [r5]
+    pmovzxwd        ym21, [r5 + r3]
+    pmovzxwd        ym22, [r5 + r3 * 2]
+    pmovzxwd        ym23, [r5 + r4]
+
+    vinserti64x4    m0, m0, ym16, 1                ; each of m0-m7: low 256 bits = source row, high 256 bits = recon row
+    vinserti64x4    m1, m1, ym17, 1
+    vinserti64x4    m2, m2, ym18, 1
+    vinserti64x4    m3, m3, ym19, 1
+    vinserti64x4    m4, m4, ym20, 1
+    vinserti64x4    m5, m5, ym21, 1
+    vinserti64x4    m6, m6, ym22, 1
+    vinserti64x4    m7, m7, ym23, 1
+
+    ; source + recon SAD (computed here as the plain sum of the 64 pixels of each block)
+    paddd           m8, m0, m1                     ; m8 = column-wise sum of the 8 rows
+    paddd           m8, m2
+    paddd           m8, m3
+    paddd           m8, m4
+    paddd           m8, m5
+    paddd           m8, m6
+    paddd           m8, m7
+
+    vextracti64x4   ym15, m8, 1                    ; ym15 = recon half of the column sums, ym8 keeps the source half
+
+    vextracti128    xm9, ym8, 1                    ; horizontal reduction of the source sums down to dword 0
+    paddd           ym8, ym9                       ; sad_8x8
+    movhlps         xm9, xm8
+    paddd           xm8, xm9
+    pshuflw         xm9, xm8, 0Eh                  ; bring dword 1 down to dword 0
+    paddd           xm8, xm9
+    psrld           ym8, 2                         ; dword 0 = source pixel sum >> 2
+
+    vextracti128    xm9, ym15, 1                   ; same reduction for the recon sums
+    paddd           ym15, ym9                      ; sad_8x8
+    movhlps         xm9, xm15
+    paddd           xm15, xm9
+    pshuflw         xm9, xm15, 0Eh
+    paddd           xm15, xm9
+    psrld           ym15, 2                        ; dword 0 = recon pixel sum >> 2
+
+    ; source and recon SA8D (both halves are transformed at once; order and register reuse below are significant)
+    psubd           m9, m1, m0                     ; sum/difference butterflies on row pairs
+    paddd           m0, m1
+    psubd           m1, m3, m2
+    paddd           m2, m3
+    punpckhdq       m3, m0, m9                     ; interleave dwords between butterfly stages
+    punpckldq       m0, m9
+    psubd           m9, m3, m0
+    paddd           m0, m3
+    punpckhdq       m3, m2, m1
+    punpckldq       m2, m1
+    psubd           m10, m3, m2
+    paddd           m2, m3
+    psubd           m3, m5, m4
+    paddd           m4, m5
+    psubd           m5, m7, m6
+    paddd           m6, m7
+    punpckhdq       m1, m4, m3
+    punpckldq       m4, m3
+    psubd           m7, m1, m4
+    paddd           m4, m1
+    punpckhdq       m3, m6, m5
+    punpckldq       m6, m5
+    psubd           m1, m3, m6
+    paddd           m6, m3
+    psubd           m3, m2, m0
+    paddd           m0, m2
+    psubd           m2, m10, m9
+    paddd           m9, m10
+    punpckhqdq      m5, m0, m3                     ; interleave qwords between butterfly stages
+    punpcklqdq      m0, m3
+    psubd           m10, m5, m0
+    paddd           m0, m5
+    punpckhqdq      m3, m9, m2
+    punpcklqdq      m9, m2
+    psubd           m5, m3, m9
+    paddd           m9, m3
+    psubd           m3, m6, m4
+    paddd           m4, m6
+    psubd           m6, m1, m7
+    paddd           m7, m1
+    punpckhqdq      m2, m4, m3
+    punpcklqdq      m4, m3
+    psubd           m1, m2, m4
+    paddd           m4, m2
+    punpckhqdq      m3, m7, m6
+    punpcklqdq      m7, m6
+
+    psubd           m2, m3, m7
+    paddd           m7, m3
+    psubd           m3, m4, m0
+    paddd           m0, m4
+    psubd           m4, m1, m10
+    paddd           m10, m1
+
+    mova            m16, m13                       ; vpermi2q overwrites its index operand, so work on copies of the tables
+    mova            m17, m14
+    vpermi2q        m16, m0, m3                    ; m16 = 128-bit lanes 0 and 2 of m0 and m3 (source stays in the low 256 bits)
+    vpermi2q        m17, m0, m3                    ; m17 = 128-bit lanes 1 and 3 of m0 and m3
+
+    pabsd           m17, m17
+    pabsd           m16, m16
+    pmaxsd          m17, m16                       ; keep the larger magnitude of each pair
+
+    mova            m18, m13
+    mova            m19, m14
+    vpermi2q        m18, m10, m4
+    vpermi2q        m19, m10, m4
+
+    pabsd           m19, m19
+    pabsd           m18, m18
+    pmaxsd          m19, m18
+    psubd           m18, m7, m9
+    paddd           m9, m7
+    psubd           m7, m2, m5
+    paddd           m5, m2
+
+    mova            m20, m13
+    mova            m21, m14
+    vpermi2q        m20, m9, m18
+    vpermi2q        m21, m9, m18
+
+    pabsd           m21, m21
+    pabsd           m20, m20
+    pmaxsd          m21, m20
+
+    mova            m22, m13
+    mova            m23, m14
+    vpermi2q        m22, m5, m7
+    vpermi2q        m23, m5, m7
+
+    pabsd           m23, m23
+    pabsd           m22, m22
+    pmaxsd          m23, m22
+    paddd           m17, m21                       ; accumulate the four partial results into m17
+    paddd           m17, m19
+    paddd           m17, m23
+
+    vextracti64x4   ym26, m17, 1                   ; ym26 = recon half, ym17 keeps the source half
+
+    vextracti128    xm9, m17, 1                    ; horizontal reduction of the source SA8D terms down to dword 0
+    paddd           ym17, ym9                      ; partial sa8d sum (source)
+    movhlps         xm9, xm17
+    paddd           xm17, xm9
+    pshuflw         xm9, xm17, 0Eh
+    paddd           xm17, xm9
+    paddd           ym17, [pd_1]                   ; add pd_1 before the shift (rounding)
+    psrld           ym17, 1 ; sa8d_8x8
+
+    vextracti128    xm9, ym26, 1                   ; same reduction for the recon SA8D terms
+    paddd           ym26, ym9                      ; partial sa8d sum (recon)
+    movhlps         xm9, xm26
+    paddd           xm26, xm9
+    pshuflw         xm9, xm26, 0Eh
+    paddd           xm26, xm9
+    paddd           ym26, [pd_1]                   ; add pd_1 before the shift (rounding)
+    psrld           ym26, 1 ; sa8d_8x8
+
+
+
+    psubd           ym11, ym17, ym8 ; sa8d_8x8 - sad_8x8 (source)
+    psubd           ym12, ym26, ym15 ; sa8d_8x8 - sad_8x8 (recon)
+
+    psubd           ym11, ym12                     ; source value - recon value
+    pabsd           ym11, ym11                     ; result in dword 0; the callers only read dword 0
+%endmacro
+
+%macro PSY_PP_INPUT_AVX512_MAIN10 0 ; load a 16x8 block into m0-m7 (one row each). In: r0/r1 = source ptr/stride, r2/r3 = recon ptr/stride. Side effects: r0 and r2 advance by 16 bytes; r4, r5 clobbered
+    lea             r4, [r1 * 3]                   ; r4 = 3 * source stride
+    movu            xm0, [r0]                      ; lane 0 = source, first 16 bytes of each row
+    movu            xm1, [r0 + r1]
+    movu            xm2, [r0 + r1 * 2]
+    movu            xm3, [r0 + r4]
+    lea             r5, [r0 + r1 * 4]
+    movu            xm4, [r5]
+    movu            xm5, [r5 + r1]
+    movu            xm6, [r5 + r1 * 2]
+    movu            xm7, [r5 + r4]
+
+    lea             r4, [r3 * 3]                   ; r4 = 3 * recon stride
+    vinserti128     ym0, ym0, [r2], 1              ; lane 1 = recon, first 16 bytes of each row
+    vinserti128     ym1, ym1, [r2 + r3], 1
+    vinserti128     ym2, ym2, [r2 + r3 * 2], 1
+    vinserti128     ym3, ym3, [r2 + r4], 1
+    lea             r5, [r2 + r3 * 4]
+    vinserti128     ym4, ym4, [r5], 1
+    vinserti128     ym5, ym5, [r5 + r3], 1
+    vinserti128     ym6, ym6, [r5 + r3 * 2], 1
+    vinserti128     ym7, ym7, [r5 + r4], 1
+
+    add             r0, 16                         ; step both pointers to the next 16 bytes of the rows
+    add             r2, 16
+
+    lea             r4, [r1 * 3]
+    vinserti32x4    m0, m0, [r0], 2                ; lane 2 = source, next 16 bytes of each row
+    vinserti32x4    m1, m1, [r0 + r1], 2
+    vinserti32x4    m2, m2, [r0 + r1 * 2], 2
+    vinserti32x4    m3, m3, [r0 + r4], 2
+    lea             r5, [r0 + r1 * 4]
+    vinserti32x4    m4, m4, [r5], 2
+    vinserti32x4    m5, m5, [r5 + r1], 2
+    vinserti32x4    m6, m6, [r5 + r1 * 2], 2
+    vinserti32x4    m7, m7, [r5 + r4], 2
+
+    lea             r4, [r3 * 3]
+    vinserti32x4    m0, m0, [r2], 3                ; lane 3 = recon, next 16 bytes of each row
+    vinserti32x4    m1, m1, [r2 + r3], 3
+    vinserti32x4    m2, m2, [r2 + r3 * 2], 3
+    vinserti32x4    m3, m3, [r2 + r4], 3
+    lea             r5, [r2 + r3 * 4]
+    vinserti32x4    m4, m4, [r5], 3
+    vinserti32x4    m5, m5, [r5 + r3], 3
+    vinserti32x4    m6, m6, [r5 + r3 * 2], 3
+    vinserti32x4    m7, m7, [r5 + r4], 3
+%endmacro
+
+
+%macro PSY_PP_16x8_AVX512_MAIN10 0 ; psy cost of the two 8x8 blocks held in m0-m7 (lane layout of PSY_PP_INPUT_AVX512_MAIN10). Reads m14/m15, which the callers load from pw_1/pd_1. Out: dword 0 of xm1. Clobbers m0-m10
+    paddw           m8, m0, m1                     ; m8 = column-wise word sum of the 8 rows
+    paddw           m8, m2
+    paddw           m8, m3
+    paddw           m8, m4
+    paddw           m8, m5
+    paddw           m8, m6
+    paddw           m8, m7
+    pmaddwd         m8, m14                        ; multiply words by m14 and add adjacent pairs into dwords
+
+    psrldq          m9, m8, 8                      ; byte shifts act per 128-bit lane: reduce each lane to its dword 0
+    paddd           m8, m9
+    psrldq          m9, m8, 4
+    paddd           m8, m9
+    psrld           m8, 2                          ; dword 0 of each lane = lane sum >> 2
+
+    psubw           m9, m1, m0                     ; word-sized sum/difference butterflies; order and register reuse are significant
+    paddw           m0, m1
+    psubw           m1, m3, m2
+    paddw           m2, m3
+    punpckhwd       m3, m0, m9                     ; interleave words between butterfly stages
+    punpcklwd       m0, m9
+    psubw           m9, m3, m0
+    paddw           m0, m3
+    punpckhwd       m3, m2, m1
+    punpcklwd       m2, m1
+    psubw           m10, m3, m2
+    paddw           m2, m3
+
+    psubw           m3, m5, m4
+    paddw           m4, m5
+    psubw           m5, m7, m6
+    paddw           m6, m7
+    punpckhwd       m1, m4, m3
+    punpcklwd       m4, m3
+    psubw           m7, m1, m4
+    paddw           m4, m1
+    punpckhwd       m3, m6, m5
+    punpcklwd       m6, m5
+    psubw           m1, m3, m6
+    paddw           m6, m3
+
+    psubw           m3, m2, m0
+    paddw           m0, m2
+    psubw           m2, m10, m9
+    paddw           m9, m10
+    punpckhdq       m5, m0, m3                     ; interleave dwords between butterfly stages
+    punpckldq       m0, m3
+    psubw           m10, m5, m0
+    paddw           m0, m5
+    punpckhdq       m3, m9, m2
+    punpckldq       m9, m2
+    psubw           m5, m3, m9
+    paddw           m9, m3
+
+    psubw           m3, m6, m4
+    paddw           m4, m6
+    psubw           m6, m1, m7
+    paddw           m7, m1
+    punpckhdq       m2, m4, m3
+    punpckldq       m4, m3
+    psubw           m1, m2, m4
+    paddw           m4, m2
+    punpckhdq       m3, m7, m6
+    punpckldq       m7, m6
+    psubw           m2, m3, m7
+    paddw           m7, m3
+
+    psubw           m3, m4, m0
+    paddw           m0, m4
+    psubw           m4, m1, m10
+    paddw           m10, m1
+    punpckhqdq      m6, m0, m3
+    punpcklqdq      m0, m3
+    pabsw           m0, m0
+    pabsw           m6, m6
+    pmaxsw          m0, m6                         ; keep the larger magnitude of each pair
+    punpckhqdq      m3, m10, m4
+    punpcklqdq      m10, m4
+    pabsw           m10, m10
+    pabsw           m3, m3
+    pmaxsw          m10, m3
+
+    psubw           m3, m7, m9
+    paddw           m9, m7
+    psubw           m7, m2, m5
+    paddw           m5, m2
+    punpckhqdq      m4, m9, m3
+    punpcklqdq      m9, m3
+    pabsw           m9, m9
+    pabsw           m4, m4
+    pmaxsw          m9, m4
+    punpckhqdq      m3, m5, m7
+    punpcklqdq      m5, m7
+    pabsw           m5, m5
+    pabsw           m3, m3
+    pmaxsw          m5, m3
+
+    paddd           m0, m9                         ; accumulate the four partial results (words added through dword adds)
+    paddd           m0, m10
+    paddd           m0, m5
+    psrld           m9, m0, 16                     ; m9 = high word of each dword
+    pslld           m0, 16
+    psrld           m0, 16                         ; m0 = low word of each dword
+    paddd           m0, m9                         ; widen: dword = high word + low word
+    psrldq          m9, m0, 8                      ; reduce each 128-bit lane to its dword 0
+    paddd           m0, m9
+    psrldq          m9, m0, 4
+    paddd           m0, m9
+    paddd           m0, m15                        ; add m15 before the shift (rounding)
+    psrld           m0, 1
+    psubd           m0, m8                         ; per lane, dword 0: transform sum minus (pixel sum >> 2)
+
+    vextracti64x4   ym2, m0, 1                     ; ym2 = lanes 2 and 3 (second 8x8 block: source, recon)
+
+    vextracti128    xm3, ym2, 1                    ; xm3 = lane 3 (recon)
+    psubd           xm3, xm2                       ; recon - source for the second block
+    pabsd           xm3, xm3
+
+    vextracti128    xm1, ym0, 1                    ; xm1 = lane 1 (recon of the first block)
+    psubd           xm1, xm0                       ; recon - source for the first block
+    pabsd           xm1, xm1
+    paddd           xm1, xm3                       ; dword 0 = cost of both 8x8 blocks
+%endmacro
+
+
 %if ARCH_X86_64
 INIT_YMM avx2
 %if HIGH_BIT_DEPTH && BIT_DEPTH == 12
@@ -10672,6 +11037,173 @@
     RET
 %endif
 %endif
+%if ARCH_X86_64
+INIT_ZMM avx512
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
+cglobal psyCost_pp_16x16, 4, 10, 27
+    add             r1d, r1d                       ; double both strides (the loads below read 2 bytes per pixel)
+    add             r3d, r3d
+    pxor            m24, m24                       ; m24 = running total
+    movu            m13, [psy_pp_shuff1]           ; vpermi2q index tables used by the 8x8 macro
+    movu            m14, [psy_pp_shuff2]
+
+    mov             r8d, 2                         ; 2 rows of 8x8 blocks
+.loopH:
+    mov             r9d, 2                         ; 2 blocks per row
+.loopW:
+    PSY_COST_PP_8x8_AVX512_MAIN12
+
+    paddd           xm24, xm11                     ; add this block's cost
+    add             r0, 16                         ; next 8x8 block to the right (16 bytes)
+    add             r2, 16
+    dec             r9d
+    jnz             .loopW
+    lea             r0, [r0 + r1 * 8 - 32]         ; down 8 rows, back by the 2 * 16 bytes walked in .loopW
+    lea             r2, [r2 + r3 * 8 - 32]
+    dec             r8d
+    jnz             .loopH
+    movd            eax, xm24                      ; return dword 0 of the total
+    RET
+%endif
+
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
+cglobal psyCost_pp_16x16, 4, 10, 16
+    add             r1d, r1d                       ; double both strides (the loads below read 2 bytes per pixel)
+    add             r3d, r3d
+    pxor            m11, m11                       ; m11 = running total
+    vbroadcasti32x8 m14, [pw_1]                    ; constants read by PSY_PP_16x8_AVX512_MAIN10
+    vbroadcasti32x8 m15, [pd_1]
+
+    mov             r8d, 2                         ; 2 strips of 16x8
+.loopH:
+    PSY_PP_INPUT_AVX512_MAIN10
+    PSY_PP_16x8_AVX512_MAIN10
+
+    paddd           xm11, xm1                      ; add this strip's cost
+    lea             r0, [r0 + r1 * 8 - 16]         ; down 8 rows, undo the 16 bytes added by the input macro
+    lea             r2, [r2 + r3 * 8 - 16]
+    dec             r8d
+    jnz             .loopH
+    movd            eax, xm11                      ; return dword 0 of the total
+    RET
+%endif
+%endif
+
+%if ARCH_X86_64
+INIT_ZMM avx512
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
+cglobal psyCost_pp_32x32, 4, 10, 27
+    add             r1d, r1d                       ; double both strides (the loads below read 2 bytes per pixel)
+    add             r3d, r3d
+    pxor            m24, m24                       ; m24 = running total
+    movu            m13, [psy_pp_shuff1]           ; vpermi2q index tables used by the 8x8 macro
+    movu            m14, [psy_pp_shuff2]
+
+    mov             r8d, 4                         ; 4 rows of 8x8 blocks
+.loopH:
+    mov             r9d, 4                         ; 4 blocks per row
+.loopW:
+    PSY_COST_PP_8x8_AVX512_MAIN12
+
+    paddd           xm24, xm11                     ; add this block's cost
+    add             r0, 16                         ; next 8x8 block to the right (16 bytes)
+    add             r2, 16
+    dec             r9d
+    jnz             .loopW
+    lea             r0, [r0 + r1 * 8 - 64]         ; down 8 rows, back by the 4 * 16 bytes walked in .loopW
+    lea             r2, [r2 + r3 * 8 - 64]
+    dec             r8d
+    jnz             .loopH
+    movd            eax, xm24                      ; return dword 0 of the total
+    RET
+%endif
+
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
+cglobal psyCost_pp_32x32, 4, 10, 16
+    add             r1d, r1d                       ; double both strides (the loads below read 2 bytes per pixel)
+    add             r3d, r3d
+    pxor            m11, m11                       ; m11 = running total
+    vbroadcasti32x8 m14, [pw_1]                    ; constants read by PSY_PP_16x8_AVX512_MAIN10
+    vbroadcasti32x8 m15, [pd_1]
+
+    mov             r8d, 4                         ; 4 rows of 16x8 strips
+.loopH:
+    mov             r9d, 2                         ; 2 strips per row
+.loopW:
+    PSY_PP_INPUT_AVX512_MAIN10
+    PSY_PP_16x8_AVX512_MAIN10
+
+    paddd           xm11, xm1                      ; add this strip's cost
+    add             r0, 16                         ; with the 16 bytes added by the input macro: 32 bytes per strip
+    add             r2, 16
+    dec             r9d
+    jnz             .loopW
+    lea             r0, [r0 + r1 * 8 - 64]         ; down 8 rows, back by the 2 * 32 bytes walked in .loopW
+    lea             r2, [r2 + r3 * 8 - 64]
+    dec             r8d
+    jnz             .loopH
+    movd            eax, xm11                      ; return dword 0 of the total
+    RET
+%endif
+%endif
+
+%if ARCH_X86_64
+INIT_ZMM avx512
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
+cglobal psyCost_pp_64x64, 4, 10, 27
+    add             r1d, r1d                       ; double both strides (the loads below read 2 bytes per pixel)
+    add             r3d, r3d
+    pxor            m24, m24                       ; m24 = running total
+    movu            m13, [psy_pp_shuff1]           ; vpermi2q index tables used by the 8x8 macro
+    movu            m14, [psy_pp_shuff2]
+
+    mov             r8d, 8                         ; 8 rows of 8x8 blocks
+.loopH:
+    mov             r9d, 8                         ; 8 blocks per row
+.loopW:
+    PSY_COST_PP_8x8_AVX512_MAIN12
+
+    paddd           xm24, xm11                     ; add this block's cost
+    add             r0, 16                         ; next 8x8 block to the right (16 bytes)
+    add             r2, 16
+    dec             r9d
+    jnz             .loopW
+    lea             r0, [r0 + r1 * 8 - 128]        ; down 8 rows, back by the 8 * 16 bytes walked in .loopW
+    lea             r2, [r2 + r3 * 8 - 128]
+    dec             r8d
+    jnz             .loopH
+    movd            eax, xm24                      ; return dword 0 of the total
+    RET
+%endif
+
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
+cglobal psyCost_pp_64x64, 4, 10, 16
+    add             r1d, r1d                       ; double both strides (the loads below read 2 bytes per pixel)
+    add             r3d, r3d
+    pxor            m11, m11                       ; m11 = running total
+    vbroadcasti32x8 m14, [pw_1]                    ; constants read by PSY_PP_16x8_AVX512_MAIN10
+    vbroadcasti32x8 m15, [pd_1]
+
+    mov             r8d, 8                         ; 8 rows of 16x8 strips
+.loopH:
+    mov             r9d, 4                         ; 4 strips per row
+.loopW:
+    PSY_PP_INPUT_AVX512_MAIN10
+    PSY_PP_16x8_AVX512_MAIN10
+
+    paddd           xm11, xm1                      ; add this strip's cost
+    add             r0, 16                         ; with the 16 bytes added by the input macro: 32 bytes per strip
+    add             r2, 16
+    dec             r9d
+    jnz             .loopW
+    lea             r0, [r0 + r1 * 8 - 128]        ; down 8 rows, back by the 4 * 32 bytes walked in .loopW
+    lea             r2, [r2 + r3 * 8 - 128]
+    dec             r8d
+    jnz             .loopH
+    movd            eax, xm11                      ; return dword 0 of the total
+    RET
+%endif
+%endif
 
 ;---------------------------------------------------------------------------------------------------------------------
 ;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel