# HG changeset patch # User Vignesh Vijayakumar # Date 1499923196 -19800 # Thu Jul 13 10:49:56 2017 +0530 # Node ID 238c5ee9ad24dc6b283bb399eb013d937bc9ac1e # Parent a32718b2358bab3f19861d8402fe9adc8e312633 x86: AVX512 pixel_add_ps_64x64
AVX2 performance: 13.99x
AVX512 performance: 21.64x

diff -r a32718b2358b -r 238c5ee9ad24 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 11 12:24:29 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jul 13 10:49:56 2017 +0530
@@ -3805,6 +3805,8 @@
         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512);
         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512);
 
+        p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
+
     }
 #endif
 }
diff -r a32718b2358b -r 238c5ee9ad24 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm Tue Jul 11 12:24:29 2017 +0530
+++ b/source/common/x86/pixeladd8.asm Thu Jul 13 10:49:56 2017 +0530
@@ -1145,3 +1145,147 @@
     RET
 
 %endif
+
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_64x64(pixel *dest, intptr_t destStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+%macro PROCESS_ADD_PS_64x8_AVX512 0 ; 8 rows of 64 pixels, two rows per step; exits with r0/r2/r3 still on the last row pair
+    pmovzxbw    m0, [r2]                ; row 0: 32 bytes at r2, zero-extended to 32 words
+    pmovzxbw    m1, [r2 + 32]           ; row 0: next 32 bytes (pixels 32-63)
+    movu        m2, [r3]                ; row 0: 32 words at r3
+    movu        m3, [r3 + 64]           ; row 0: next 32 words
+    pmovzxbw    m4, [r2 + r4]           ; row 1: same loads, one r4 stride further
+    pmovzxbw    m5, [r2 + r4 + 32]
+    movu        m6, [r3 + r5]           ; row 1: one r5 stride further
+    movu        m7, [r3 + r5 + 64]
+
+    paddw       m0, m2                  ; word-wise sum of widened bytes and 16-bit values
+    paddw       m1, m3
+    paddw       m4, m6
+    paddw       m5, m7
+    packuswb    m0, m1                  ; saturate to unsigned bytes; packs per 128-bit lane, so the two halves interleave
+    packuswb    m4, m5
+    vpermq      m0, m0, 11011000b       ; qword order 0,2,1,3 inside each 256-bit half
+    vpermq      m4, m4, 11011000b
+    vshufi64x2  m0, m0, 11011000b       ; 128-bit lane order 0,2,1,3: bytes are back in pixel order 0-63
+    vshufi64x2  m4, m4, 11011000b
+    movu        [r0], m0                ; store row 0
+    movu        [r0 + r1], m4           ; store row 1
+
+    lea         r2, [r2 + r4 * 2]       ; advance both inputs and the output by two rows
+    lea         r3, [r3 + r5 * 2]
+    lea         r0, [r0 + r1 * 2]
+
+    pmovzxbw    m0, [r2]                ; rows 2-3: same load / add / pack / reorder / store sequence
+    pmovzxbw    m1, [r2 + 32]
+    movu        m2, [r3]
+    movu        m3, [r3 + 64]
+    pmovzxbw    m4, [r2 + r4]
+    pmovzxbw    m5, [r2 + r4 + 32]
+    movu        m6, [r3 + r5]
+    movu        m7, [r3 + r5 + 64]
+
+    paddw       m0, m2
+    paddw       m1, m3
+    paddw       m4, m6
+    paddw       m5, m7
+    packuswb    m0, m1
+    packuswb    m4, m5
+    vpermq      m0, m0, 11011000b
+    vpermq      m4, m4, 11011000b
+    vshufi64x2  m0, m0, 11011000b
+    vshufi64x2  m4, m4, 11011000b
+    movu        [r0], m0
+    movu        [r0 + r1], m4
+
+    lea         r2, [r2 + r4 * 2]       ; advance by two rows
+    lea         r3, [r3 + r5 * 2]
+    lea         r0, [r0 + r1 * 2]
+
+    pmovzxbw    m0, [r2]                ; rows 4-5
+    pmovzxbw    m1, [r2 + 32]
+    movu        m2, [r3]
+    movu        m3, [r3 + 64]
+    pmovzxbw    m4, [r2 + r4]
+    pmovzxbw    m5, [r2 + r4 + 32]
+    movu        m6, [r3 + r5]
+    movu        m7, [r3 + r5 + 64]
+
+    paddw       m0, m2
+    paddw       m1, m3
+    paddw       m4, m6
+    paddw       m5, m7
+    packuswb    m0, m1
+    packuswb    m4, m5
+    vpermq      m0, m0, 11011000b
+    vpermq      m4, m4, 11011000b
+    vshufi64x2  m0, m0, 11011000b
+    vshufi64x2  m4, m4, 11011000b
+    movu        [r0], m0
+    movu        [r0 + r1], m4
+
+    lea         r2, [r2 + r4 * 2]       ; advance by two rows
+    lea         r3, [r3 + r5 * 2]
+    lea         r0, [r0 + r1 * 2]
+
+    pmovzxbw    m0, [r2]                ; rows 6-7; no pointer advance follows, the caller does it
+    pmovzxbw    m1, [r2 + 32]
+    movu        m2, [r3]
+    movu        m3, [r3 + 64]
+    pmovzxbw    m4, [r2 + r4]
+    pmovzxbw    m5, [r2 + r4 + 32]
+    movu        m6, [r3 + r5]
+    movu        m7, [r3 + r5 + 64]
+
+    paddw       m0, m2
+    paddw       m1, m3
+    paddw       m4, m6
+    paddw       m5, m7
+    packuswb    m0, m1
+    packuswb    m4, m5
+    vpermq      m0, m0, 11011000b
+    vpermq      m4, m4, 11011000b
+    vshufi64x2  m0, m0, 11011000b
+    vshufi64x2  m4, m4, 11011000b
+    movu        [r0], m0
+    movu        [r0 + r1], m4
+%endmacro
+
+%if ARCH_X86_64
+%if HIGH_BIT_DEPTH==0
+INIT_ZMM avx512
+cglobal pixel_add_ps_64x64, 6, 7, 8 ; presumably r0-r5 = the six arguments in signature order (x86inc convention - confirm); a 7th GPR is requested but r6 is never used
+    add         r5, r5                  ; double r5; presumably converts srcStride1 from int16_t elements to bytes - TODO confirm
+    PROCESS_ADD_PS_64x8_AVX512          ; rows 0-7
+    lea         r2, [r2 + r4 * 2]       ; step past the row pair the macro stopped on
+    lea         r3, [r3 + r5 * 2]
+    lea         r0, [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512          ; rows 8-15
+    lea         r2, [r2 + r4 * 2]
+    lea         r3, [r3 + r5 * 2]
+    lea         r0, [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512          ; rows 16-23
+    lea         r2, [r2 + r4 * 2]
+    lea         r3, [r3 + r5 * 2]
+    lea         r0, [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512          ; rows 24-31
+    lea         r2, [r2 + r4 * 2]
+    lea         r3, [r3 + r5 * 2]
+    lea         r0, [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512          ; rows 32-39
+    lea         r2, [r2 + r4 * 2]
+    lea         r3, [r3 + r5 * 2]
+    lea         r0, [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512          ; rows 40-47
+    lea         r2, [r2 + r4 * 2]
+    lea         r3, [r3 + r5 * 2]
+    lea         r0, [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512          ; rows 48-55
+    lea         r2, [r2 + r4 * 2]
+    lea         r3, [r3 + r5 * 2]
+    lea         r0, [r0 + r1 * 2]
+    PROCESS_ADD_PS_64x8_AVX512          ; rows 56-63
+    RET
+
+%endif
+%endif
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel