# HG changeset patch # User Vignesh Vijayakumar # Date 1501663291 -19800 # Wed Aug 02 14:11:31 2017 +0530 # Node ID ce93c1b1894ae7d789e451f65479f018ba90ec76 # Parent aac415b7223acced7fc844c4a07225704b811df0 x86: AVX512 cpy2Dto1D_shl_32 and cpy2Dto1D_shl_16
Size | BitDepth | AVX2 performance | AVX512 performance ------------------------------------------------------- 16x16| 8 | 15.09x | 21.16x 16x16| 10 | 16.05x | 17.86x 32x32| 8 | 13.90x | 25.62x 32x32| 10 | 11.69x | 23.24x diff -r aac415b7223a -r ce93c1b1894a source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 01 17:37:05 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 02 14:11:31 2017 +0530 @@ -2309,6 +2309,8 @@ p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); + p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); + p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); } } #else // if HIGH_BIT_DEPTH @@ -3988,6 +3990,8 @@ p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512); p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512); + p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); + p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); } #endif diff -r aac415b7223a -r ce93c1b1894a source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Tue Aug 01 17:37:05 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Wed Aug 02 14:11:31 2017 +0530 @@ -6140,6 +6140,102 @@ RET ;-------------------------------------------------------------------------------------- +; cpy2Dto1D_shl avx512 code start +;-------------------------------------------------------------------------------------- +%macro PROCESS_CPY2Dto1D_SHL_16x8_AVX512 0 + movu m1, [r1] + vinserti32x8 m1, [r1 + r2], 1 + movu m2, [r1 + 2 * r2] + vinserti32x8 m2, [r1 + r3], 1 + + psllw m1, xm0 + psllw m2, xm0 + movu [r0], m1 + movu [r0 + mmsize], m2 + + add r0, 2 * mmsize + lea r1, [r1 + r2 * 4] + + movu m1, [r1] + vinserti32x8 m1, [r1 + r2], 1 + movu m2, [r1 + 2 * r2] + vinserti32x8 m2, [r1 + r3], 1 + + psllw m1, xm0 + psllw m2, xm0 + movu [r0], m1 + movu [r0 + mmsize], m2 +%endmacro + +%macro PROCESS_CPY2Dto1D_SHL_32x8_AVX512 0 + movu m1, [r1] 
+ movu m2, [r1 + r2] + movu m3, [r1 + 2 * r2] + movu m4, [r1 + r3] + + psllw m1, xm0 + psllw m2, xm0 + psllw m3, xm0 + psllw m4, xm0 + movu [r0], m1 + movu [r0 + mmsize], m2 + movu [r0 + 2 * mmsize], m3 + movu [r0 + 3 * mmsize], m4 + + add r0, 4 * mmsize + lea r1, [r1 + r2 * 4] + + movu m1, [r1] + movu m2, [r1 + r2] + movu m3, [r1 + 2 * r2] + movu m4, [r1 + r3] + + psllw m1, xm0 + psllw m2, xm0 + psllw m3, xm0 + psllw m4, xm0 + movu [r0], m1 + movu [r0 + mmsize], m2 + movu [r0 + 2 * mmsize], m3 + movu [r0 + 3 * mmsize], m4 +%endmacro + +;-------------------------------------------------------------------------------------- +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +;-------------------------------------------------------------------------------------- +INIT_ZMM avx512 +cglobal cpy2Dto1D_shl_32, 4, 4, 5 + add r2d, r2d + movd xm0, r3d + lea r3, [3 * r2] + + PROCESS_CPY2Dto1D_SHL_32x8_AVX512 + add r0, 4 * mmsize + lea r1, [r1 + r2 * 4] + PROCESS_CPY2Dto1D_SHL_32x8_AVX512 + add r0, 4 * mmsize + lea r1, [r1 + r2 * 4] + PROCESS_CPY2Dto1D_SHL_32x8_AVX512 + add r0, 4 * mmsize + lea r1, [r1 + r2 * 4] + PROCESS_CPY2Dto1D_SHL_32x8_AVX512 + RET + +INIT_ZMM avx512 +cglobal cpy2Dto1D_shl_16, 4, 4, 3 + add r2d, r2d + movd xm0, r3d + lea r3, [3 * r2] + + PROCESS_CPY2Dto1D_SHL_16x8_AVX512 + add r0, 2 * mmsize + lea r1, [r1 + r2 * 4] + PROCESS_CPY2Dto1D_SHL_16x8_AVX512 + RET +;-------------------------------------------------------------------------------------- +; cpy2Dto1D_shl avx512 code end +;-------------------------------------------------------------------------------------- +;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 diff -r aac415b7223a -r ce93c1b1894a source/common/x86/blockcopy8.h --- 
a/source/common/x86/blockcopy8.h Tue Aug 01 17:37:05 2017 +0530 +++ b/source/common/x86/blockcopy8.h Wed Aug 02 14:11:31 2017 +0530 @@ -28,6 +28,7 @@ FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel