# HG changeset patch
# User Vignesh Vijayakumar<vign...@multicorewareinc.com>
# Date 1515407502 -19800
#      Mon Jan 08 16:01:42 2018 +0530
# Node ID 48917be3e409f917468ff2f73302b62afef492fb
# Parent  c9f8c315a900c488e41bb39955a1c893e35a66d4
x86: AVX512 cpy1Dto2D_shr_32
AVX2 performance : 21.03x
AVX512 performance : 34.55x

diff -r c9f8c315a900 -r 48917be3e409 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jan 08 15:55:34 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jan 08 16:01:42 2018 +0530
@@ -2557,6 +2557,8 @@
         p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
         p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
+        p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
+
         p.weight_pp = PFX(weight_pp_avx512);
         p.weight_sp = PFX(weight_sp_avx512);
         p.dequant_normal = PFX(dequant_normal_avx512);
@@ -4908,6 +4910,7 @@
         p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
         p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
+        p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
 
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
diff -r c9f8c315a900 -r 48917be3e409 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Mon Jan 08 15:55:34 2018 +0530
+++ b/source/common/x86/blockcopy8.asm	Mon Jan 08 16:01:42 2018 +0530
@@ -6809,3 +6809,30 @@
     dec             r3d
     jnz             .loop
     RET
+
+INIT_ZMM avx512
+cglobal cpy1Dto2D_shr_32, 3, 4, 6              ; as used below: r0 = dst, r1 = src, r2 = dst stride, r3m = shift
+    shl             r2d, 1                     ; double r2 (presumably stride in 16-bit elements -> bytes; confirm against caller)
+    movd            xm0, r3m                   ; xm0 = shift count, reused by every psllw/psraw below
+    pcmpeqw         ymm1, ymm1                 ; every word = 0xFFFF (-1)
+    psllw           ym1, ymm1, xm0             ; every word = -(1 << shift)
+    psraw           ym1, 1                     ; every word = -(1 << (shift - 1)) for shift >= 1: the negated rounding term
+    vinserti32x8    m1, ym1, 1                 ; copy the low 256 bits into the high half so all of m1 holds the constant
+    mov             r3d, 16                    ; r3d becomes the loop counter (shift already copied to xm0): 16 passes, two rows each
+
+.loop:
+    ; Two rows per iteration: load, add the rounding term, shift right, store
+    movu            m2, [r1]
+    movu            m3, [r1 + mmsize]
+    psubw           m2, m1                     ; subtracting the negative constant adds 1 << (shift - 1)
+    psubw           m3, m1
+    psraw           m2, xm0                    ; arithmetic right shift of each word by the shift count
+    psraw           m3, xm0
+    movu            [r0], m2                   ; first row of the pair at dst
+    movu            [r0 + r2], m3              ; second row one (doubled) stride further
+
+    add             r1, 2 * mmsize             ; source is read contiguously: skip the two vectors just loaded
+    lea             r0, [r0 + r2 * 2]          ; destination advances by two strides
+    dec             r3d
+    jnz             .loop
+    RET
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel