# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykris...@multicorewareinc.com>
# Date 1507288098 -19800
#      Fri Oct 06 16:38:18 2017 +0530
# Node ID dc2d7a2515fdc434744f97a9dd34edcd670bbffa
# Parent  7f92fdd23823946026a4f55cb8c0f252cd658d07
[x265-avx512]x86: AVX512 weight_sp
AVX2 Performance   : 6.78x
AVX512 Performance : 11.09x

diff -r 7f92fdd23823 -r dc2d7a2515fd source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Aug 31 14:10:30 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Oct 06 16:38:18 2017 +0530
@@ -2418,6 +2418,7 @@
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
         p.weight_pp = PFX(weight_pp_avx512);
+        p.weight_sp = PFX(weight_sp_avx512);
         p.dequant_normal = PFX(dequant_normal_avx512);
         p.dequant_scaling = PFX(dequant_scaling_avx512);
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
@@ -4356,6 +4357,7 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
 
         p.weight_pp = PFX(weight_pp_avx512);
+        p.weight_sp = PFX(weight_sp_avx512);
 
         //i444 chroma_hps
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hps = PFX(interp_4tap_horiz_ps_64x64_avx512);
diff -r 7f92fdd23823 -r dc2d7a2515fd source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu Aug 31 14:10:30 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Oct 06 16:38:18 2017 +0530
@@ -2400,6 +2400,161 @@
 %endif
 %endif
 
+;-----------------------------------------------------------------
+; weight_sp, AVX-512:
+;   dst = clip((((src + pw_2000) * w0 + round) >> shift) + offset)
+; Roles of the arguments as they are consumed below:
+;   r0 = src (16-bit words)   r1 = dst
+;   r2 = src stride           r3 = dst stride
+;   r4 = width                r5 = row counter
+;   r6m = w0, r7m = round, r8m = shift, r9m = offset
+; NOTE(review): (round << 16) | w0 is built with a plain OR, so w0 is
+;   assumed to fit in 16 bits -- confirm against the callers.
+; NOTE(review): loads are always full 64-byte vectors, even for a partial
+;   tail, so each source row is assumed to be readable up to the next
+;   multiple of 32 samples -- confirm against the callers.
+;-----------------------------------------------------------------
+%if ARCH_X86_64 == 1
+%if HIGH_BIT_DEPTH
+INIT_ZMM avx512
+; m1-m8 are used below (m0 is not), so nine vector registers are declared
+cglobal weight_sp, 6,9,9
+    vbroadcasti32x8 m1, [pw_pixel_max]          ; m1 = upper clamp applied by pminuw
+    vbroadcasti32x8 m2, [pw_1]                  ; m2 = 1s paired with src so pmaddwd adds round
+    vbroadcasti32x8 m8, [pw_2000]               ; m8 = bias added to every src word (loop invariant)
+
+    mov             r6d, r7m
+    shl             r6d, 16
+    or              r6d, r6m
+    movd            xm3, r6d
+    vpbroadcastd    m3, xm3                     ; m3 = [round w0]
+    movd            xm4, r8m                    ; m4 = [shift]
+    vpbroadcastd    m5, r9m                     ; m5 = [offset]
+
+    ; correct row stride: double both strides, then take off the bytes that
+    ; .loopW already advances r0/r1 by for the full vectors of one row
+    add             r3d, r3d
+    add             r2d, r2d
+    mov             r6d, r4d
+    and             r6d, ~(mmsize / SIZEOF_PIXEL - 1)
+    shl             r6d, 1
+    sub             r3d, r6d
+    sub             r2d, r6d
+
+.loopH:
+    mov             r6d, r4d                    ; r6d = samples left in this row
+
+.loopW:
+    movu            m6, [r0]
+    paddw           m6, m8
+
+    punpcklwd       m7, m6, m2
+    pmaddwd         m7, m3                      ;(round w0)
+    psrad           m7, xm4                     ;(shift)
+    paddd           m7, m5                      ;(offset)
+
+    punpckhwd       m6, m2
+    pmaddwd         m6, m3
+    psrad           m6, xm4
+    paddd           m6, m5
+
+    packusdw        m7, m6                      ; in-lane pack undoes the in-lane unpack order
+    pminuw          m7, m1
+
+    sub             r6d, (mmsize / SIZEOF_PIXEL)
+    jl              .widthLess32
+    movu            [r1], m7
+    lea             r0, [r0 + mmsize]           ; lea leaves the flags of the sub intact for je
+    lea             r1, [r1 + mmsize]
+    je              .nextH
+    jmp             .loopW
+
+.widthLess32:
+    ; r6d = remaining - 32 (negative here): keep only the low 'remaining' mask bits
+    mov             r8d, 0xFFFFFFFF
+    NEG             r6d
+    shrx            r8d, r8d, r6d
+    kmovd           k1, r8d
+    vmovdqu16       [r1] {k1}, m7               ; masked tail store, then fall through
+
+.nextH:
+    add             r0, r2
+    add             r1, r3
+
+    dec             r5d
+    jnz             .loopH
+    RET
+
+%else
+INIT_ZMM avx512
+cglobal weight_sp, 6, 10, 7
+    mov             r7d, r7m
+    shl             r7d, 16
+    or              r7d, r6m
+    movd            xm0, r7d
+    vpbroadcastd    m0, xm0                     ; m0 = times 16 dw [round w0]
+    movd            xm1, r8m                    ; m1 = [shift]
+    vpbroadcastd    m2, r9m                     ; m2 = times 16 dw offset
+    vpbroadcastw    m3, [pw_1]
+    vpbroadcastw    m4, [pw_2000]
+
+    add             r2d, r2d                    ; 2 * srcstride
+
+    mov             r7, r0                      ; r7/r8 track the start of the current src/dst row
+    mov             r8, r1
+.loopH:
+    mov             r6d, r4d                    ; width
+
+    ; reload src and dst with the start of the current row
+    mov             r0, r7                      ; src
+    mov             r1, r8                      ; dst
+
+.loopW:
+    movu            m5, [r0]
+    paddw           m5, m4
+
+    punpcklwd       m6, m5, m3
+    pmaddwd         m6, m0
+    psrad           m6, xm1
+    paddd           m6, m2
+
+    punpckhwd       m5, m3
+    pmaddwd         m5, m0
+    psrad           m5, xm1
+    paddd           m5, m2
+
+    packssdw        m6, m5                      ; 32 words, back in source order
+    vextracti64x4   ym5, m6, 1
+    packuswb        ym6, ym5                    ; saturate to bytes (128-bit lanes interleaved)
+    vpermq          ym6, ym6, q3120             ; put the four qwords back in source order
+
+    sub             r6d, 32
+    jl              .widthLess32
+    movu            [r1], ym6
+    je              .nextH
+    add             r0, 64
+    add             r1, 32
+    jmp             .loopW
+
+.widthLess32:
+    ; r6d = remaining - 32 (negative here): keep only the low 'remaining' mask bits
+    mov             r9d, 0xFFFFFFFF
+    NEG             r6d
+    shrx            r9d, r9d, r6d
+    kmovd           k1, r9d
+    vmovdqu8        [r1] {k1}, ym6              ; masked tail store, then fall through
+
+.nextH:
+    lea             r7, [r7 + r2]
+    lea             r8, [r8 + r3]
+
+    dec             r5d
+    jnz             .loopH
+    RET
+%endif
+%endif
+
+
 ;-----------------------------------------------------------------
 ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
 ;-----------------------------------------------------------------
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel