# HG changeset patch
# User Jayashri Murugan <jayas...@multicorewareinc.com>
# Date 1499162011 -19800
#      Tue Jul 04 15:23:31 2017 +0530
# Node ID 3e3a44c6d77c0c0a7b3a084127a0dc6c835ff392
# Parent  2eda6628c75302a10d59918a58740d6e27434293
x86: AVX512 blockcopy_ss_64x64
AVX2 performance over C code : 1.32x
AVX512 performance over C code : 3.00x

diff -r 2eda6628c753 -r 3e3a44c6d77c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jul 20 16:59:52 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 04 15:23:31 2017 +0530
@@ -3854,6 +3854,8 @@

         p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
         p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
+        p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
+
     }
 #endif
 }
diff -r 2eda6628c753 -r 3e3a44c6d77c source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Jul 20 16:59:52 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Jul 04 15:23:31 2017 +0530
@@ -4462,6 +4462,154 @@
 BLOCKCOPY_SS_W64_H4_avx 64, 48
 BLOCKCOPY_SS_W64_H4_avx 64, 64

+%macro PROCESS_BLOCKCOPY_SS_W64_H8_avx512 0    ; copy 8 rows of 2 * mmsize bytes from [r2] to [r0], then leave r0/r2 pointing 8 rows further on
+    movu            m0, [r2]                   ; row 0, first mmsize bytes
+    movu            m1, [r2 + mmsize]          ; row 0, second mmsize bytes
+    movu            m2, [r2 + r3]              ; row 1 (r3 = source row step, doubled by the caller)
+    movu            m3, [r2 + r3 + mmsize]
+
+    movu            [r0], m0                   ; store rows 0-1 (r1 = destination row step, doubled by the caller)
+    movu            [r0 + mmsize], m1
+    movu            [r0 + r1], m2
+    movu            [r0 + r1 + mmsize], m3
+
+    movu            m0, [r2 + 2 * r3]          ; row 2
+    movu            m1, [r2 + 2 * r3 + mmsize]
+    movu            m2, [r2 + r6]              ; row 3 (callers set r6 = 3 * r3)
+    movu            m3, [r2 + r6 + mmsize]
+    lea             r2, [r2 + 4 * r3]          ; source pointer += 4 rows
+
+    movu            [r0 + 2 * r1], m0          ; store rows 2-3
+    movu            [r0 + 2 * r1 + mmsize], m1
+    movu            [r0 + r5], m2              ; callers set r5 = 3 * r1
+    movu            [r0 + r5 + mmsize], m3
+    lea             r0, [r0 + 4 * r1]          ; destination pointer += 4 rows
+
+    movu            m0, [r2]                   ; rows 4-7: same load/store pattern as rows 0-3
+    movu            m1, [r2 + mmsize]
+    movu            m2, [r2 + r3]
+    movu            m3, [r2 + r3 + mmsize]
+
+    movu            [r0], m0
+    movu            [r0 + mmsize], m1
+    movu            [r0 + r1], m2
+    movu            [r0 + r1 + mmsize], m3
+
+    movu            m0, [r2 + 2 * r3]
+    movu            m1, [r2 + 2 * r3 + mmsize]
+    movu            m2, [r2 + r6]
+    movu            m3, [r2 + r6 + mmsize]
+    lea             r2, [r2 + 4 * r3]          ; source pointer += 4 rows, ready for the next expansion
+
+    movu            [r0 + 2 * r1], m0
+    movu            [r0 + 2 * r1 + mmsize], m1
+    movu            [r0 + r5], m2
+    movu            [r0 + r5 + mmsize], m3
+    lea             r0, [r0 + 4 * r1]          ; destination pointer += 4 rows, ready for the next expansion
+%endmacro
+
+%macro PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 0 ; same 8-row copy as the macro above, but r0/r2 are not advanced after the final 4 rows
+    movu            m0, [r2]                   ; rows 0-1
+    movu            m1, [r2 + mmsize]
+    movu            m2, [r2 + r3]
+    movu            m3, [r2 + r3 + mmsize]
+
+    movu            [r0], m0
+    movu            [r0 + mmsize], m1
+    movu            [r0 + r1], m2
+    movu            [r0 + r1 + mmsize], m3
+
+    movu            m0, [r2 + 2 * r3]          ; rows 2-3
+    movu            m1, [r2 + 2 * r3 + mmsize]
+    movu            m2, [r2 + r6]
+    movu            m3, [r2 + r6 + mmsize]
+    lea             r2, [r2 + 4 * r3]          ; source pointer += 4 rows (still needed for rows 4-7)
+
+    movu            [r0 + 2 * r1], m0
+    movu            [r0 + 2 * r1 + mmsize], m1
+    movu            [r0 + r5], m2
+    movu            [r0 + r5 + mmsize], m3
+    lea             r0, [r0 + 4 * r1]          ; destination pointer += 4 rows (still needed for rows 4-7)
+
+    movu            m0, [r2]                   ; rows 4-5
+    movu            m1, [r2 + mmsize]
+    movu            m2, [r2 + r3]
+    movu            m3, [r2 + r3 + mmsize]
+
+    movu            [r0], m0
+    movu            [r0 + mmsize], m1
+    movu            [r0 + r1], m2
+    movu            [r0 + r1 + mmsize], m3
+
+    movu            m0, [r2 + 2 * r3]          ; rows 6-7; no lea follows because nothing is copied after this
+    movu            m1, [r2 + 2 * r3 + mmsize]
+    movu            m2, [r2 + r6]
+    movu            m3, [r2 + r6 + mmsize]
+
+    movu            [r0 + 2 * r1], m0
+    movu            [r0 + 2 * r1 + mmsize], m1
+    movu            [r0 + r5], m2
+    movu            [r0 + r5 + mmsize], m3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal blockcopy_ss_64x16, 4, 7, 4            ; 16 rows = 2 expansions of the 8-row macros; uses r0-r3, r5, r6 and m0-m3
+    add    r1, r1                              ; r1 *= 2: int16_t element count -> byte offset (r1 presumably holds dstStride per x86inc argument order)
+    add    r3, r3                              ; r3 *= 2: same conversion for what is presumably srcStride
+    lea    r5, [3 * r1]                        ; r5 = 3 * r1, the destination offset of row 3 inside the macros
+    lea    r6, [3 * r3]                        ; r6 = 3 * r3, the source offset of row 3 inside the macros
+
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512
+    RET
+
+INIT_ZMM avx512
+cglobal blockcopy_ss_64x32, 4, 7, 4            ; 32 rows = 4 expansions of the 8-row macros
+    add    r1, r1                              ; r1 *= 2 (int16_t elements -> bytes)
+    add    r3, r3                              ; r3 *= 2 (int16_t elements -> bytes)
+    lea    r5, [3 * r1]                        ; r5 = 3 * r1
+    lea    r6, [3 * r3]                        ; r6 = 3 * r3
+
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512
+    RET
+
+INIT_ZMM avx512
+cglobal blockcopy_ss_64x48, 4, 7, 4            ; 48 rows = 6 expansions of the 8-row macros
+    add    r1, r1                              ; r1 *= 2 (int16_t elements -> bytes)
+    add    r3, r3                              ; r3 *= 2 (int16_t elements -> bytes)
+    lea    r5, [3 * r1]                        ; r5 = 3 * r1
+    lea    r6, [3 * r3]                        ; r6 = 3 * r3
+
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512
+    RET
+
+INIT_ZMM avx512
+cglobal blockcopy_ss_64x64, 4, 7, 4            ; 64 rows = 8 expansions of the 8-row macros
+    add    r1, r1                              ; r1 *= 2 (int16_t elements -> bytes)
+    add    r3, r3                              ; r3 *= 2 (int16_t elements -> bytes)
+    lea    r5, [3 * r1]                        ; r5 = 3 * r1
+    lea    r6, [3 * r3]                        ; r6 = 3 * r3
+
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
+    PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512
+    RET
 ;--------------------------------------------------------------------------------------
 ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 ;--------------------------------------------------------------------------------------
diff -r 2eda6628c753 -r 3e3a44c6d77c source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Thu Jul 20 16:59:52 2017 +0530
+++ b/source/common/x86/blockcopy8.h Tue Jul 04 15:23:31 2017 +0530
@@ -50,6 +50,7 @@

 FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
 FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx512, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);

 FUNCDEF_CHROMA_PU(void, blockcopy_pp, sse2, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel