# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501843838 -19800
#      Fri Aug 04 16:20:38 2017 +0530
# Node ID 039ed71e123c3e14bfaabbe3aada944157784b36
# Parent  c5b5b7cb9bbef4365692bfaf05a2a83796d5f1b0
x86: AVX512 pixel_var_32x32
AVX2 performance : 9.15x
AVX512 performance : 13.49x

diff -r c5b5b7cb9bbe -r 039ed71e123c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Aug 04 14:27:51 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Aug 04 16:20:38 2017 +0530
@@ -3929,6 +3929,7 @@
 
         p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512);
         p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
+        p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx512);
         p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx512);
         p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx512);
         p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512);
diff -r c5b5b7cb9bbe -r 039ed71e123c source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Fri Aug 04 14:27:51 2017 +0530
+++ b/source/common/x86/pixel-util8.asm	Fri Aug 04 16:20:38 2017 +0530
@@ -7105,6 +7105,82 @@
     RET
 %endif ; !HIGH_BIT_DEPTH
 
+%macro PROCESS_VAR_32x8_AVX512 0 ; accumulate 8 rows of 32 byte pixels: m4 += pixels (words), m5 += squares (dwords); expects r0 = src, r1 = stride, r2 = 3 * r1; leaves r0 on row 4 of the group
+    pmovzxbw        m0, [r0]            ; row 0: zero-extend 32 bytes to 32 words
+    pmovzxbw        m1, [r0 + r1]       ; row 1
+    pmovzxbw        m2, [r0 + 2 * r1]   ; row 2
+    pmovzxbw        m3, [r0 + r2]       ; row 3 (r2 = 3 * stride, set by the caller)
+
+    paddw           m4, m0              ; per-lane running sum of pixels
+    paddw           m4, m1
+    paddw           m4, m2
+    paddw           m4, m3
+    pmaddwd         m0, m0              ; square each word and add adjacent pairs into dwords
+    pmaddwd         m1, m1
+    pmaddwd         m2, m2
+    pmaddwd         m3, m3
+    paddd           m5, m0              ; per-lane running sum of squares
+    paddd           m5, m1
+    paddd           m5, m2
+    paddd           m5, m3
+
+    lea             r0, [r0 + r1 * 4]   ; advance to rows 4..7 of this group
+
+    pmovzxbw        m0, [r0]            ; row 4
+    pmovzxbw        m1, [r0 + r1]       ; row 5
+    pmovzxbw        m2, [r0 + 2 * r1]   ; row 6
+    pmovzxbw        m3, [r0 + r2]       ; row 7
+
+    paddw           m4, m0              ; same accumulation as for rows 0..3
+    paddw           m4, m1
+    paddw           m4, m2
+    paddw           m4, m3
+    pmaddwd         m0, m0
+    pmaddwd         m1, m1
+    pmaddwd         m2, m2
+    pmaddwd         m3, m3
+    paddd           m5, m0
+    paddd           m5, m1
+    paddd           m5, m2
+    paddd           m5, m3
+%endmacro
+
+%macro PROCESS_VAR_AVX512_END 0 ; reduce the m4 (sum) and m5 (sum of squares) accumulators and pack both results into rax
+    vextracti32x8   ym0, m4, 1          ; upper 256 bits of the sum accumulator
+    vextracti32x8   ym1, m5, 1          ; upper 256 bits of the squares accumulator
+    paddw           ym4, ym0            ; fold 512 -> 256 bits
+    paddd           ym5, ym1
+    vextracti32x4   xm0, m4, 1          ; bits 128..255 of the folded sum
+    vextracti32x4   xm1, m5, 1          ; bits 128..255 of the folded squares
+    paddw           xm4, xm0            ; fold 256 -> 128 bits
+    paddd           xm5, xm1
+    HADDW           xm4, xm2            ; helper macro defined elsewhere; presumably horizontal word add leaving the total in the low dword, xm2 as scratch -- TODO confirm
+    HADDD           xm5, xm1            ; helper macro defined elsewhere; presumably horizontal dword add, xm1 as scratch -- TODO confirm
+    punpckldq       xm4, xm5            ; interleave low dwords: dword 0 from xm4 (sum), dword 1 from xm5 (squares)
+    movq            rax, xm4            ; low 64 bits become the return value
+%endmacro
+
+%if HIGH_BIT_DEPTH==0 ; the kernel below loads pixels as bytes
+;-----------------------------------------------------------------------------
+; int pixel_var_wxh( uint8_t *, intptr_t ) -- NOTE(review): the result is written to rax as a 64-bit value (see PROCESS_VAR_AVX512_END); confirm the C-side return type
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal pixel_var_32x32, 2,4,6 ; x86inc: 2 arguments (r0 = pixels, r1 = stride), 4 GPRs, 6 vector registers
+    pxor            m4, m4    ; sum (word lanes; 32 rows * 255 = 8160 max per lane, 32640 after folding, so no 16-bit overflow)
+    pxor            m5, m5    ; sum squared (dword lanes)
+    lea             r2, [3 * r1]        ; r2 = 3 * stride, used for every fourth row
+
+    PROCESS_VAR_32x8_AVX512             ; rows 0..7
+    lea             r0, [r0 + r1 * 4]   ; macro leaves r0 on row 4; step to the next 8-row group
+    PROCESS_VAR_32x8_AVX512             ; rows 8..15
+    lea             r0, [r0 + r1 * 4]
+    PROCESS_VAR_32x8_AVX512             ; rows 16..23
+    lea             r0, [r0 + r1 * 4]
+    PROCESS_VAR_32x8_AVX512             ; rows 24..31
+    PROCESS_VAR_AVX512_END              ; horizontal reduction, result in rax
+    RET
+%endif ; HIGH_BIT_DEPTH==0
+
 %macro VAR_AVX512_CORE 1 ; accum
 %if %1
     paddw    m0, m2
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel