# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykris...@multicorewareinc.com>
# Date 1504246803 -19800
#      Fri Sep 01 11:50:03 2017 +0530
# Node ID e1348316cd4431a5d39c8a9457d865f0f9d546cc
# Parent  2221c70ef3b9b416b0ad491cd2325ccb595df8bb
[x265-avx512]x86: AVX512 dequant_scaling
AVX2 Performance   : 8.65x
AVX512 Performance : 15.55x

diff -r 2221c70ef3b9 -r e1348316cd44 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Oct 04 14:58:32 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Sep 01 11:50:03 2017 +0530
@@ -2351,7 +2351,7 @@
         p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
         p.weight_pp = PFX(weight_pp_avx512);
         p.dequant_normal = PFX(dequant_normal_avx512);
-
+        p.dequant_scaling = PFX(dequant_scaling_avx512);
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
 
@@ -4134,7 +4134,7 @@
 
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
         p.dequant_normal = PFX(dequant_normal_avx512);
-
+        p.dequant_scaling = PFX(dequant_scaling_avx512);
         //i444 chroma_hpp
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512);
diff -r 2221c70ef3b9 -r e1348316cd44 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Oct 04 14:58:32 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Sep 01 11:50:03 2017 +0530
@@ -30,6 +30,10 @@
 var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1
                  db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1
 
+ALIGN 64
+const dequant_shuf1_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7 ; qword order used after the single packssdw (shift > per path)
+const dequant_shuf2_avx512,  dq 0, 4, 1, 5, 2, 6, 3, 7 ; qword order used after the double packssdw (shift <= per path)
+
 %if BIT_DEPTH == 12
 ssim_c1:   times 4 dd 107321.76    ; .01*.01*4095*4095*64
 ssim_c2:   times 4 dd 60851437.92  ; .03*.03*4095*4095*64*63
@@ -1237,6 +1241,99 @@
     jnz         .loop
     RET
 
+;----------------------------------------------------------------------------------------------------------------------
+;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
+;
+; shift += 4; sat16() below is the signed 16-bit saturation done by packssdw
+; if (shift >  per): dst[i] = sat16((src[i] * dequantCoef[i] + (1 << (shift - per - 1))) >> (shift - per))
+; if (shift <= per): dst[i] = sat16(sat16(src[i] * dequantCoef[i]) << (per - shift))
+;
+; r0 = src, r1 = dequantCoef, r2 = dst, r3 = num, r4 = mcqp_miper (per), r5 = shift
+; Each loop iteration handles 32 coefficients; num == 16 stores only the low 256 bits of one iteration.
+; NOTE(review): for num == 16 both paths still load 32 coefficients from src/dequantCoef - confirm this over-read is safe.
+;----------------------------------------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal dequant_scaling, 6,7,8
+    mova        m6, [dequant_shuf1_avx512]
+    mova        m7, [dequant_shuf2_avx512]
+    add         r5d, 4                  ; shift += 4
+    mov         r6d, r3d                ; keep the original num for the num == 16 check
+    shr         r3d, 5                  ; num/32 = loop iteration count
+    cmp         r5d, r4d
+    jle         .skip                   ; shift <= per: left-shift path
+    sub         r5d, r4d
+    vpbroadcastd m0, [pd_1]
+    movd        xm1, r5d                ; shift - per
+    dec         r5d
+    movd        xm2, r5d                ; shift - per - 1
+    pslld       m0, xm2                 ; rounding term: 1 << (shift - per - 1)
+
+.part0:
+    pmovsxwd    m2, [r0]                ; src[0..15]  sign-extended to dwords
+    pmovsxwd    m4, [r0 + 32]           ; src[16..31] sign-extended to dwords
+    movu        m3, [r1]
+    movu        m5, [r1 + 64]
+    pmulld      m2, m3
+    pmulld      m4, m5
+    paddd       m2, m0
+    paddd       m4, m0
+    psrad       m2, xm1
+    psrad       m4, xm1
+    packssdw    m2, m4                  ; saturate to int16; result is interleaved per 128-bit lane
+    vpermq      m2, m6, m2              ; restore linear coefficient order
+    cmp         r6d, 16
+    je          .num16part0
+    movu        [r2], m2
+
+    add         r0, 64
+    add         r1, 128
+    add         r2, 64
+    dec         r3d
+    jnz         .part0
+    jmp         .end
+
+.num16part0:
+    movu        [r2], ym2               ; only 16 outputs are valid
+    jmp         .end
+
+.skip:
+    sub         r4d, r5d                ; per - shift
+    movd        xm0, r4d
+
+.part1:
+    pmovsxwd    m2, [r0]                ; src[0..15]  sign-extended to dwords
+    pmovsxwd    m4, [r0 + 32]           ; src[16..31] sign-extended to dwords
+    movu        m3, [r1]
+    movu        m5, [r1 + 64]
+    pmulld      m2, m3
+    pmulld      m4, m5
+    packssdw    m2, m4                  ; first saturation, applied before the left shift
+
+    vextracti32x8 ym4, m2, 1
+    pmovsxwd    m1, ym2                 ; widen again so the shift result can be saturated
+    pmovsxwd    m2, ym4
+    pslld       m1, xm0
+    pslld       m2, xm0
+    packssdw    m1, m2                  ; second saturation
+
+    vpermq      m1, m7, m1              ; restore linear coefficient order
+    cmp         r6d, 16
+    je          .num16part1
+    movu        [r2], m1
+
+    add         r0, 64
+    add         r1, 128
+    add         r2, 64
+    dec         r3d
+    jnz         .part1
+    jmp         .end                    ; do not fall into .num16part1: it would store 32 bytes past the end of dst
+
+.num16part1:
+    movu        [r2], ym1               ; only 16 outputs are valid
+
+.end:
+    RET
+
 INIT_ZMM avx512
 cglobal dequant_normal, 5,5,7
     vpbroadcastd    m2, [pw_1]            ; m2 = word [1]
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel