# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykris...@multicorewareinc.com>
# Date 1507109312 -19800
#      Wed Oct 04 14:58:32 2017 +0530
# Node ID 2221c70ef3b9b416b0ad491cd2325ccb595df8bb
# Parent  c726239a07580fd13c4177f0206d615ee02c5975
[x265-avx512]x86: AVX512 dequant_normal
AVX2 Performance   : 9.81x
AVX512 Performance : 15.37x

diff -r c726239a0758 -r 2221c70ef3b9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Aug 31 15:21:25 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Oct 04 14:58:32 2017 +0530
@@ -2350,6 +2350,7 @@
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
         p.weight_pp = PFX(weight_pp_avx512);
+        p.dequant_normal = PFX(dequant_normal_avx512);
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
 
@@ -4132,6 +4133,8 @@
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
 
+        p.dequant_normal = PFX(dequant_normal_avx512);
+
         //i444 chroma_hpp
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512);
diff -r c726239a0758 -r 2221c70ef3b9 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Thu Aug 31 15:21:25 2017 +0530
+++ b/source/common/x86/pixel-util8.asm	Wed Oct 04 14:58:32 2017 +0530
@@ -1237,6 +1237,85 @@
     jnz            .loop
     RET
 
+;-----------------------------------------------------------------------------
+; dequant_normal, AVX-512 version
+;
+; For every 16-bit input coefficient c:
+;     out = clip16((c * scale + (1 << (shift - 1))) >> shift)
+;
+; r0  = source coefficients (int16, read with unaligned loads)
+; r1  = destination coefficients (int16, written with aligned 32-byte stores)
+; r2d = number of coefficients; only 16 and multiples of 32 are handled
+; r3d = scale
+; r4d = shift
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal dequant_normal, 5,5,7
+    vpbroadcastd    m2, [pw_1]          ; m2 = word [1]
+    vpbroadcastd    m5, [pd_32767]      ; m5 = dword [32767]
+    vpbroadcastd    m6, [pd_n32768]     ; m6 = dword [-32768]
+%if HIGH_BIT_DEPTH
+    ; pmaddwd treats scale as a signed word; a larger scale is reduced together with shift
+    cmp             r3d, 32767
+    jle            .skip
+    shr             r3d, (BIT_DEPTH - 8)
+    sub             r4d, (BIT_DEPTH - 8)
+.skip:
+%endif
+    movd            xm0, r4d            ; m0 = shift
+    add             r4d, -1+16
+    bts             r3d, r4d            ; high word = rounding add, low word = scale
+
+    movd            xm1, r3d
+    vpbroadcastd    m1, xm1             ; m1 = dword [add scale]
+
+    ; m0 = shift
+    ; m1 = scale
+    ; m2 = word [1]
+    cmp             r2d, 16             ; decided once: 16 coefficients fill only half a zmm
+    je             .num16
+    shr             r2d, 5              ; r2d = number of 32-coefficient iterations
+.loop:
+    movu            m3, [r0]
+    punpckhwd       m4, m3, m2          ; pair every coefficient with the constant 1
+    punpcklwd       m3, m2
+    pmaddwd         m3, m1              ; m3 = dword (clipQCoef * scale + add)
+    pmaddwd         m4, m1
+    psrad           m3, xm0
+    psrad           m4, xm0
+    pminsd          m3, m5
+    pmaxsd          m3, m6
+    pminsd          m4, m5
+    pmaxsd          m4, m6
+    packssdw        m3, m4              ; in-lane pack undoes the in-lane unpack: order is kept
+
+    mova            [r1 + 0 * mmsize/2], ym3    ; two 32-byte stores: r1 only needs 32-byte alignment
+    vextracti32x8   [r1 + 1 * mmsize/2], m3, 1
+
+    add             r0, mmsize
+    add             r1, mmsize
+
+    dec             r2d
+    jnz            .loop
+    RET
+
+.num16:
+    ; 16 coefficients = 32 bytes: use 256-bit operations so nothing past the input is read
+    movu            ym3, [r0]
+    punpckhwd       ym4, ym3, ym2
+    punpcklwd       ym3, ym2
+    pmaddwd         ym3, ym1            ; ym3 = dword (clipQCoef * scale + add)
+    pmaddwd         ym4, ym1
+    psrad           ym3, xm0
+    psrad           ym4, xm0
+    pminsd          ym3, ym5
+    pmaxsd          ym3, ym6
+    pminsd          ym4, ym5
+    pmaxsd          ym4, ym6
+    packssdw        ym3, ym4
+    mova            [r1], ym3
+    RET
+
 ;-----------------------------------------------------------------------------
 ; int x265_count_nonzero_4x4_sse2(const int16_t *quantCoeff);
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel