# HG changeset patch
# User Vignesh Vijayakumar<vign...@multicorewareinc.com>
# Date 1515745345 -19800
#      Fri Jan 12 13:52:25 2018 +0530
# Node ID 1c2875198a213a5f8d84bff57fcec15727f94a4f
# Parent  d43237051962eab3cd761cf24f3971de09c07aa5
x86: AVX512 pixel_satd_32xN
Size  | AVX2 performance | AVX512 performance
-----------------------------------------------
32x8  | 10.34x           | 12.26x
32x16 | 10.21x           | 12.40x
32x24 | 10.47x           | 13.23x
32x32 | 10.55x           | 12.46x
32x48 | 10.60x           | 12.59x
32x64 | 10.56x           | 12.65x

diff -r d43237051962 -r 1c2875198a21 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jan 11 14:31:13 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Jan 12 13:52:25 2018 +0530
@@ -5342,6 +5342,20 @@
         p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512);
         p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512);
         p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512);
+        p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512); // luma 32xN SATD entries now point at the AVX512 kernels added in pixel-a.asm
+        p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512);
+        p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512);
+        p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx512);
+        p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx512);
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx512); // chroma entries are assigned the same 32xN kernels as luma
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx512); // only entry in this hunk that uses the 32x48 kernel
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
 
     }
 #endif
diff -r d43237051962 -r 1c2875198a21 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Jan 11 14:31:13 2018 +0530
+++ b/source/common/x86/pixel-a.asm Fri Jan 12 13:52:25 2018 +0530
@@ -14068,8 +14068,102 @@
     paddd xm0, xm1
     movd eax, xm0
     RET
+
+%macro PROCESS_SATD_32x4_AVX512 0 ; function to compute satd cost for 32 columns, 4 rows; adds into m6, uses r0-r5, overwrites m0-m5 and m7
+    ; rows 0-3
+    pmovzxbw m0, [r0] ; row 0: zero-extend bytes at [r0] to words
+    pmovzxbw m4, [r2] ; row 0 of the second block, widened the same way
+    psubw m0, m4 ; m0 = row 0 difference
+    pmovzxbw m1, [r0 + r1] ; row 1 (r1 / r3 are used as the per-row byte offsets)
+    pmovzxbw m5, [r2 + r3]
+    psubw m1, m5 ; m1 = row 1 difference
+    pmovzxbw m2, [r0 + r1 * 2] ; row 2
+    pmovzxbw m4, [r2 + r3 * 2]
+    psubw m2, m4 ; m2 = row 2 difference
+    pmovzxbw m3, [r0 + r4] ; row 3 (the caller macro sets r4 = 3 * r1, r5 = 3 * r3)
+    pmovzxbw m5, [r2 + r5]
+    psubw m3, m5 ; m3 = row 3 difference
+    paddw m4, m0, m1 ; butterfly on rows 0/1: sum ...
+    psubw m1, m0 ; ... and difference
+    paddw m0, m2, m3 ; butterfly on rows 2/3: sum ...
+    psubw m3, m2 ; ... and difference
+    punpckhwd m2, m4, m1 ; interleave words of the row 0/1 sum and difference (high halves)
+    punpcklwd m4, m1 ; same, low halves
+    punpckhwd m1, m0, m3 ; interleave words of the row 2/3 sum and difference (high halves)
+    punpcklwd m0, m3 ; same, low halves
+    paddw m3, m4, m0 ; next butterfly stage on the interleaved low halves
+    psubw m0, m4
+    paddw m4, m2, m1 ; next butterfly stage on the interleaved high halves
+    psubw m1, m2
+    punpckhdq m2, m3, m0 ; interleave at dword granularity
+    punpckldq m3, m0
+    paddw m0, m3, m2 ; butterfly stage on the dword-interleaved data
+    psubw m2, m3
+    punpckhdq m3, m4, m1
+    punpckldq m4, m1
+    paddw m1, m4, m3 ; matching butterfly stage for the other half
+    psubw m3, m4
+    punpckhqdq m4, m0, m1 ; interleave at qword granularity
+    punpcklqdq m0, m1
+    pabsw m0, m0 ; absolute values of the paired words
+    pabsw m4, m4
+    pmaxsw m0, m0, m4 ; m0 = max(|a|, |b|); presumably stands in for a final butterfly, since |a+b| + |a-b| = 2*max(|a|,|b|) - TODO confirm
+    punpckhqdq m1, m2, m3 ; same qword interleave for the remaining pair of registers
+    punpcklqdq m2, m3
+    pabsw m2, m2
+    pabsw m1, m1
+    pmaxsw m2, m1 ; m2 = max(|a|, |b|) for the second pair
+    pxor m7, m7 ; zero register used to widen words to dwords
+    mova m1, m0
+    punpcklwd m1, m7 ; low words of each 128-bit lane of m0, zero-extended to dwords
+    paddd m6, m1 ; accumulate into the dword sums in m6
+    mova m1, m0
+    punpckhwd m1, m7 ; high words of each lane of m0, zero-extended to dwords
+    paddd m6, m1
+    pxor m7, m7 ; NOTE(review): m7 has not been written since the pxor above, so this looks redundant
+    mova m1, m2
+    punpcklwd m1, m7 ; same widening and accumulation for m2
+    paddd m6, m1
+    mova m1, m2
+    punpckhwd m1, m7
+    paddd m6, m1
+%endmacro
+
+%macro SATD_MAIN_AVX512_END 0 ; horizontal add of all dword lanes of m6 into eax; overwrites m7 and rdx
+    vextracti32x8 ym7, m6, 1 ; ym7 = upper 256 bits of m6
+    paddd ym6, ym7 ; fold 16 dword sums into 8
+    vextracti128 xm7, ym6, 1 ; xm7 = upper 128 bits of ym6
+    paddd xm6, xm6, xm7 ; fold 8 into 4
+    punpckhqdq xm7, xm6, xm6 ; bring the high qword of xm6 down into the low qword of xm7
+    paddd xm6, xm7 ; fold 4 into 2 (held in the low qword)
+    movq rax, xm6 ; rax = the two remaining dword sums
+    rorx rdx, rax, 32 ; rotate by 32 so the upper dword sum lands in edx
+    add eax, edx ; eax = total
+%endmacro
+
+%macro SATD_32xN_AVX512 1 ; %1 = number of rows; defines pixel_satd_32x%1
+INIT_ZMM avx512
+cglobal pixel_satd_32x%1, 4,6,8 ; NOTE(review): r0/r2 look like the two pixel pointers and r1/r3 their strides - confirm against the satd prototype
+    lea r4, [3 * r1] ; r4 = 3 * r1, offset of row 3 relative to r0
+    lea r5, [3 * r3] ; r5 = 3 * r3, offset of row 3 relative to r2
+    pxor m6, m6 ; clear the dword accumulator
+%rep %1/4 - 1 ; assembly-time unroll: every 4-row group except the last
+    PROCESS_SATD_32x4_AVX512
+    lea r0, [r0 + 4 * r1] ; advance both pointers by four rows
+    lea r2, [r2 + 4 * r3]
+%endrep
+    PROCESS_SATD_32x4_AVX512 ; last 4-row group, no pointer advance afterwards
+    SATD_MAIN_AVX512_END ; reduce m6 to a scalar in eax
+    RET
+%endmacro
+
+SATD_32xN_AVX512 8
+SATD_32xN_AVX512 16
+SATD_32xN_AVX512 24
+SATD_32xN_AVX512 32
+SATD_32xN_AVX512 48
+SATD_32xN_AVX512 64
 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
-
 %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
 INIT_YMM avx2
 cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel