# HG changeset patch # User Vignesh Vijayakumar # Date 1499340573 -19800 # Thu Jul 06 16:59:33 2017 +0530 # Node ID f5c54a1c4a550e9c6df6a1ef0a4462fd23c4a530 # Parent 1321369efdf990d960db9a6fbe0181f086ba90f9 x86: AVX512 blockcopy_ps_32xN
AVX2 performance over C code: 2.39x
AVX512 performance over C code : 3.62x

diff -r 1321369efdf9 -r f5c54a1c4a55 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jul 06 11:32:12 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jul 06 16:59:33 2017 +0530
@@ -3778,6 +3778,9 @@
         p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512);
         p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
+        p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
+        p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512);
     }
 #endif
diff -r 1321369efdf9 -r f5c54a1c4a55 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Jul 06 11:32:12 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Thu Jul 06 16:59:33 2017 +0530
@@ -3124,6 +3124,36 @@
 BLOCKCOPY_PS_W32_H4_avx2 32, 32
 BLOCKCOPY_PS_W32_H4_avx2 32, 64
+%macro BLOCKCOPY_PS_W32_H4_avx512 2     ; %1, %2 are pasted into the blockcopy_ps_%1x%2 symbol; %2 also sets the loop count
+INIT_ZMM avx512
+cglobal blockcopy_ps_%1x%2, 4, 7, 4     ; NOTE(review): presumably x86inc cglobal (4 args, 7 GPRs, 4 vector regs) with r0..r3 = dst, dstStride, src, srcStride -- confirm
+    add         r1, r1                  ; r1 *= 2; presumably converts dstStride from int16_t elements to bytes -- confirm
+    mov         r4d, %2/8               ; loop counter: every .loop pass handles 8 rows (%rep 2 x 4 rows)
+    lea         r5, [3 * r3]            ; r5 = 3 * r3, offset of the 4th row read through r2
+    lea         r6, [3 * r1]            ; r6 = 3 * r1, offset of the 4th row written through r0
+.loop:
+%rep 2
+    pmovzxbw    m0, [r2]                ; load four rows via r2, zero-extending each byte to a 16-bit word
+    pmovzxbw    m1, [r2 + r3]
+    pmovzxbw    m2, [r2 + r3 * 2]
+    pmovzxbw    m3, [r2 + r5]
+
+    movu        [r0], m0                ; store the four widened rows via r0
+    movu        [r0 + r1], m1
+    movu        [r0 + r1 * 2], m2
+    movu        [r0 + r6], m3
+
+    lea         r0, [r0 + 4 * r1]       ; advance both pointers by four rows
+    lea         r2, [r2 + 4 * r3]
+%endrep
+    dec         r4d
+    jnz         .loop                   ; loop until the counter set from %2/8 reaches zero
+    RET
+%endmacro
+
+BLOCKCOPY_PS_W32_H4_avx512 32, 32       ; 32x32 kernel, referenced as PFX(blockcopy_ps_32x32_avx512) in asm-primitives.cpp
+BLOCKCOPY_PS_W32_H4_avx512 32, 64       ; 32x64 kernel, referenced as PFX(blockcopy_ps_32x64_avx512) in asm-primitives.cpp
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
diff -r 1321369efdf9 -r f5c54a1c4a55 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Thu Jul 06 11:32:12 2017 +0530
+++ b/source/common/x86/blockcopy8.h Thu Jul 06 16:59:33 2017 +0530
@@ -61,5 +61,6 @@
 FUNCDEF_PU(void, blockcopy_ps, sse2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 FUNCDEF_PU(void, blockcopy_ps, sse4, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 FUNCDEF_PU(void, blockcopy_ps, avx2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+FUNCDEF_PU(void, blockcopy_ps, avx512, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 #endif // ifndef X265_I386_PIXEL_H
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel