# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500967696 -19800
#      Tue Jul 25 12:58:16 2017 +0530
# Node ID 09159f73f47b7eda15c8d0294774fe6eafdadea7
# Parent  a75dd880817adddafac5e1105e512ea79c7a089b
x86: AVX512 ssd_s_16

This patch also reworks ssd_s_32 to support high bit depth
ssd_s_16 AVX2 performance : 14.11x AVX512 performance : 16.14x ssd_s_32 for high bit depth AVX2 performance : 14.78x AVX512 performance : 20.54x diff -r a75dd880817a -r 09159f73f47b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Jul 26 10:04:24 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 25 12:58:16 2017 +0530 @@ -2249,6 +2249,8 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); + p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512); + } } #else // if HIGH_BIT_DEPTH @@ -3919,6 +3921,7 @@ p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512); p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512); + p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx512); p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512); p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512); diff -r a75dd880817a -r 09159f73f47b source/common/x86/ssd-a.asm --- a/source/common/x86/ssd-a.asm Wed Jul 26 10:04:24 2017 +0530 +++ b/source/common/x86/ssd-a.asm Tue Jul 25 12:58:16 2017 +0530 @@ -3425,10 +3425,28 @@ paddd m0, m1 %endmacro +%macro PROCESS_SSD_S_16x8_AVX512 0 + movu ym1, [r0] + vinserti32x8 m1, [r0 + r1], 1 + movu ym2, [r0 + 2 * r1] + vinserti32x8 m2, [r0 + r3], 1 + lea r0, [r0 + 4 * r1] + movu ym3, [r0] + vinserti32x8 m3, [r0 + r1], 1 + movu ym4, [r0 + 2 * r1] + vinserti32x8 m4, [r0 + r3], 1 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 + paddd m1, m3 + paddd m0, m1 +%endmacro ;----------------------------------------------------------------------------- ; int pixel_ssd_s( int16_t *ref, intptr_t i_stride ) ;----------------------------------------------------------------------------- -%if HIGH_BIT_DEPTH==0 INIT_ZMM avx512 cglobal 
pixel_ssd_s_32, 2,4,5 add r1, r1 @@ -3444,10 +3462,39 @@ PROCESS_SSD_S_32x8_AVX512 ; calculate sum and return +%if BIT_DEPTH >= 10 + movu m1, m0 + pxor m2, m2 + punpckldq m0, m2 + punpckhdq m1, m2 + paddq m0, m1 + vextracti32x8 ym2, m0, 1 + paddq ym0, ym2 + vextracti32x4 xm2, m0, 1 + paddq xm2, xm0 + movhlps xm1, xm2 + paddq xm2, xm1 + movq rax, xm2 +%else + HADDD m0, m1 + movd eax, xm0 +%endif + RET + +INIT_ZMM avx512 +cglobal pixel_ssd_s_16, 2,4,5 + add r1, r1 + lea r3, [r1 * 3] + pxor m0, m0 + + PROCESS_SSD_S_16x8_AVX512 + lea r0, [r0 + 4 * r1] + PROCESS_SSD_S_16x8_AVX512 + + ; calculate sum and return HADDD m0, m1 movd eax, xm0 RET -%endif ;----------------------------------------------------------------------------- ; ssd_s avx512 code end ;----------------------------------------------------------------------------- _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel