# HG changeset patch
# User Vignesh Vijayakumar<vign...@multicorewareinc.com>
# Date 1512715363 -19800
#      Fri Dec 08 12:12:43 2017 +0530
# Node ID fa954ed4a1e7ce2741f3cac14006f78c3199191b
# Parent  86d3d34de566d7696028b5e798a79b9de3a6e62b
x86: AVX512 pixel_var_64x64
AVX2 performance   : 8.84x
AVX512 performance : 19.93x

diff -r 86d3d34de566 -r fa954ed4a1e7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Dec 07 17:32:55 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Dec 08 12:12:43 2017 +0530
@@ -4650,6 +4650,7 @@
         p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512);
         p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
         p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx512);
+        p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx512);
         p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx512);
         p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx512);
         p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512);
diff -r 86d3d34de566 -r fa954ed4a1e7 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Thu Dec 07 17:32:55 2017 +0530
+++ b/source/common/x86/pixel-util8.asm	Fri Dec 08 12:12:43 2017 +0530
@@ -7934,8 +7934,7 @@
     movd           edx, xm5
 %endif
 %endmacro
-
-%if HIGH_BIT_DEPTH==0
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
 ;-----------------------------------------------------------------------------
 ; int pixel_var_wxh( uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
@@ -7954,8 +7953,55 @@
     PROCESS_VAR_32x8_AVX512
     PROCESS_VAR_AVX512_END
     RET
+
+INIT_ZMM avx512
+cglobal pixel_var_64x64, 2,4,7              ; presumably x86inc: 2 args, 4 GPRs, 7 vector regs (m0-m6 are used) - confirm
+    pxor           m5, m5                   ; m5 = running sum of pixels (word lanes)
+    pxor           m6, m6                   ; m6 = running sum of squared pixels (dword lanes)
+    mov            r2d, 32                  ; r2d = loop counter; each iteration consumes two rows
+
+.loop:
+    pmovzxbw       m0, [r0]                 ; current row, first half: bytes zero-extended to words
+    pmovzxbw       m3, [r0 + mmsize/2]      ; current row, second half (mmsize/2 bytes further on)
+    pmovzxbw       m1, [r0 + r1]            ; next row (r1 is the byte offset between rows), first half
+    pmovzxbw       m4, [r0 + r1 + mmsize/2] ; next row, second half
+
+    lea            r0, [r0 + 2 * r1]        ; advance the source pointer by two rows
+
+    paddw          m5, m0                   ; 4 adds x 32 iterations = 128 bytes per lane, max 32640: fits in 16 bits
+    paddw          m5, m3
+    paddw          m5, m1
+    paddw          m5, m4
+    pmaddwd        m0, m0                   ; square each word, add adjacent pairs -> dword lanes
+    pmaddwd        m3, m3
+    pmaddwd        m1, m1
+    pmaddwd        m4, m4
+    paddd          m6, m0                   ; accumulate the squared terms
+    paddd          m6, m3
+    paddd          m6, m1
+    paddd          m6, m4
+
+    dec            r2d
+    jg             .loop                    ; repeat while the counter is still positive
+
+    pxor           m1, m1                   ; zero, used to widen the word sums below
+    punpcklwd      m0, m5, m1               ; low words of each 128-bit lane -> dwords (zero-extended)
+    punpckhwd      m5, m1                   ; high words of each 128-bit lane -> dwords (zero-extended)
+    paddd          m5, m0                   ; m5 = pixel sums, now in dword lanes
+    vextracti32x8  ym2, m5, 1               ; upper 256 bits of the sum
+    vextracti32x8  ym1, m6, 1               ; upper 256 bits of the squared sum
+    paddd          ym5, ym2                 ; fold 512 -> 256 bits
+    paddd          ym6, ym1
+    vextracti32x4  xm2, m5, 1               ; bits 128..255 of the sum
+    vextracti32x4  xm1, m6, 1               ; bits 128..255 of the squared sum
+    paddd          xm5, xm2                 ; fold 256 -> 128 bits
+    paddd          xm6, xm1
+    HADDD          xm5, xm2                 ; macro defined elsewhere; presumably horizontal dword add, 2nd arg scratch - confirm
+    HADDD          xm6, xm1
+    punpckldq      xm5, xm6                 ; dword 0 = sum, dword 1 = sum of squares
+    movq           rax, xm5                 ; return both packed in rax (sum low, squares high); needs a 64-bit GPR
+    RET
 %endif
-
 %macro VAR_AVX512_CORE 1 ; accum
 %if %1
     paddw    m0, m2
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel