# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500528397 -19800
#      Thu Jul 20 10:56:37 2017 +0530
# Node ID 0320e60b3323546eb6767508f1c39cd088e9f03e
# Parent  bf9a9cd255216300408506d10d4ff8bc87a15845
x86: AVX512 ssd_ss_64x64
AVX2 performance : 14.85x
AVX512 performance : 21.35x

diff -r bf9a9cd25521 -r 0320e60b3323 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Jul 20 13:12:52 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jul 20 10:56:37 2017 +0530
@@ -3851,6 +3851,8 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512);
 
+        p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
+
     }
 #endif
 }
diff -r bf9a9cd25521 -r 0320e60b3323 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm	Thu Jul 20 13:12:52 2017 +0530
+++ b/source/common/x86/ssd-a.asm	Thu Jul 20 10:56:37 2017 +0530
@@ -1377,7 +1377,77 @@
     HADDD   m2, m0
     movd    eax, xm2
     RET
+;-----------------------------------------------------------------------------
+; ssd_ss avx512 code start
+;-----------------------------------------------------------------------------
+; SSD_SS_64x2_AVX512 a0, a1, b0, b1
+;   a0, a1 : address expressions of two rows of the first block
+;   b0, b1 : address expressions of the matching rows of the second block
+; Each row is read as two mmsize-byte vectors of words. The squared
+; differences are added as dwords into m4 (first vector of a row) and
+; m5 (second vector of a row); m0-m3 are scratch.
+%macro SSD_SS_64x2_AVX512 4
+    movu            m0, [%1]
+    movu            m1, [%1 + mmsize]
+    movu            m2, [%2]
+    movu            m3, [%2 + mmsize]
+    psubw           m0, [%3]
+    psubw           m1, [%3 + mmsize]
+    psubw           m2, [%4]
+    psubw           m3, [%4 + mmsize]
+    pmaddwd         m0, m0                  ; diff * diff, adjacent pairs summed
+    pmaddwd         m1, m1
+    pmaddwd         m2, m2
+    pmaddwd         m3, m3
+    paddd           m4, m0
+    paddd           m5, m1
+    paddd           m4, m2
+    paddd           m5, m3
+%endmacro
+
+; Accumulates eight rows into m4/m5.
+; In : r0/r2 = current row of each block, r1/r3 = row pitch in bytes,
+;      r5 = 3 * r1, r6 = 3 * r3.
+; Out: r0/r2 advanced by four rows only; the caller steps the other four.
+%macro PROCESS_SSD_SS_64x8_AVX512 0
+    SSD_SS_64x2_AVX512 r0,          r0 + r1, r2,          r2 + r3
+    SSD_SS_64x2_AVX512 r0 + 2 * r1, r0 + r5, r2 + 2 * r3, r2 + r6
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
+    SSD_SS_64x2_AVX512 r0,          r0 + r1, r2,          r2 + r3
+    SSD_SS_64x2_AVX512 r0 + 2 * r1, r0 + r5, r2 + 2 * r3, r2 + r6
+%endmacro
+
+; pixel_ssd_ss_64x64: sum of squared differences between two 64x64 blocks of
+; 16-bit samples. r0/r2 = block pointers, r1/r3 = pitches; result in eax.
+; NOTE(review): the pitches are doubled with 32-bit adds, which clears the
+; upper half of r1/r3 on x86-64 - confirm callers never pass a negative pitch.
+INIT_ZMM avx512
+cglobal pixel_ssd_ss_64x64, 4,7,6
+    pxor            m4, m4                  ; accumulator, first vector of each row
+    pxor            m5, m5                  ; accumulator, second vector of each row
+    add             r1d, r1d                ; pitches presumably arrive in samples - TODO confirm
+    add             r3d, r3d
+    lea             r5, [r1 * 3]
+    lea             r6, [r3 * 3]
+
+%rep 7
+    PROCESS_SSD_SS_64x8_AVX512
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
+%endrep
+    ; last eight rows: no pointer step is needed afterwards
+    PROCESS_SSD_SS_64x8_AVX512
+
+    paddd           m4, m5
+    ; presumably reduces the dword lanes of m4 to one value, m0 as scratch - verify HADDD
+    HADDD           m4, m0
+    movd            eax, xm4
+    RET
+;-----------------------------------------------------------------------------
+; ssd_ss avx512 code end
+;-----------------------------------------------------------------------------
 %endif ; !HIGH_BIT_DEPTH
 
 %if HIGH_BIT_DEPTH == 0
 
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel