# HG changeset patch
# User Jayashri Murugan <jayas...@multicorewareinc.com>
# Date 1498474912 -19800
#      Mon Jun 26 16:31:52 2017 +0530
# Node ID 69b61721fa2ffdf1a0f6609a299c4e0104b48628
# Parent  03a532a9ab714b0081aede28e1773022d2be20b6
x86: AVX-512 pixel_sad for 4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 16x8, and 16x16.
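For context (commentary, not part of the patch): pixel_sad computes the sum of absolute differences between an encode block and a reference block, the inner-loop cost metric of motion estimation, which is what the AVX-512 kernels below evaluate with psadbw. A minimal scalar sketch, assuming the usual x265-style primitive arguments (block pointer and stride for each side, pixel being uint8_t in 8-bit builds); the name sad_c and the W/H template parameters are illustrative:

    // Hedged reference sketch: scalar SAD over a fixed WxH block.
    #include <cstdint>
    #include <cstdlib>

    typedef uint8_t pixel;  // 8-bit pixel builds

    template <int W, int H>
    int sad_c(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2)
    {
        int sum = 0;
        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < W; x++)
                sum += std::abs(pix1[x] - pix2[x]);
            pix1 += stride1;  // step to the next row of each block
            pix2 += stride2;
        }
        return sum;           // e.g. sad_c<16, 16> corresponds to pixel_sad_16x16
    }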
diff -r 03a532a9ab71 -r 69b61721fa2f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jun 26 16:11:04 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jun 26 16:31:52 2017 +0530
@@ -3763,6 +3763,15 @@
         p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx512);
         p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx512);
 
+        p.pu[LUMA_4x4].sad = PFX(pixel_sad_4x4_avx512);
+        p.pu[LUMA_4x8].sad = PFX(pixel_sad_4x8_avx512);
+        p.pu[LUMA_4x16].sad = PFX(pixel_sad_4x16_avx512);
+        p.pu[LUMA_8x4].sad = PFX(pixel_sad_8x4_avx512);
+        p.pu[LUMA_8x8].sad = PFX(pixel_sad_8x8_avx512);
+        p.pu[LUMA_8x16].sad = PFX(pixel_sad_8x16_avx512);
+        p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_avx512);
+        p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512);
+
     }
 #endif
 }
diff -r 03a532a9ab71 -r 69b61721fa2f source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Mon Jun 26 16:11:04 2017 +0530
+++ b/source/common/x86/sad-a.asm	Mon Jun 26 16:31:52 2017 +0530
@@ -380,6 +380,9 @@
 
 %macro SAD_W16 1 ; h
 cglobal pixel_sad_16x%1, 4,4
+%ifidn cpuname, sse2
+.skip_prologue:
+%endif
 %assign %%i 0
 %if ARCH_X86_64
     lea  r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile
@@ -790,7 +793,132 @@
     SAD_INC_4x8P_SSE 1
     SAD_INC_4x8P_SSE 1
     SAD_END_SSE2
+
+%macro SAD_W48_AVX512 3 ; w, h, d/q
+cglobal pixel_sad_%1x%2, 4,4
+    kxnorb        k1, k1, k1
+    kaddb         k1, k1, k1
+%assign %%i 0
+%if ARCH_X86_64 && %2 != 4
+    lea           r6, [3*r1]
+    lea           r5, [3*r3]
+%rep %2/4
+    mov%3         m1, [r0]
+    vpbroadcast%3 m1 {k1}, [r0+r1]
+    mov%3         m3, [r2]
+    vpbroadcast%3 m3 {k1}, [r2+r3]
+    mov%3         m2, [r0+2*r1]
+    vpbroadcast%3 m2 {k1}, [r0+r6]
+    mov%3         m4, [r2+2*r3]
+    vpbroadcast%3 m4 {k1}, [r2+r5]
+%if %%i != %2/4-1
+    lea           r0, [r0+4*r1]
+    lea           r2, [r2+4*r3]
+%endif
+    psadbw        m1, m3
+    psadbw        m2, m4
+    ACCUM         paddd, 0, 1, %%i
+    paddd         m0, m2
+    %assign %%i %%i+1
+%endrep
+%else
+%rep %2/2
+    mov%3         m1, [r0]
+    vpbroadcast%3 m1 {k1}, [r0+r1]
+    mov%3         m2, [r2]
+    vpbroadcast%3 m2 {k1}, [r2+r3]
+%if %%i != %2/2-1
+    lea           r0, [r0+2*r1]
+    lea           r2, [r2+2*r3]
+%endif
+    psadbw        m1, m2
+    ACCUM         paddd, 0, 1, %%i
+    %assign %%i %%i+1
+%endrep
+%endif
+%if %1 == 8
+    punpckhqdq    m1, m0, m0
+    paddd         m0, m1
+%endif
+    movd          eax, m0
+    RET
+%endmacro
+
+INIT_XMM avx512
+SAD_W48_AVX512 4,  4, d
+SAD_W48_AVX512 4,  8, d
+SAD_W48_AVX512 4, 16, d
+SAD_W48_AVX512 8,  4, q
+SAD_W48_AVX512 8,  8, q
+SAD_W48_AVX512 8, 16, q
+
+%macro SAD_W16_AVX512_START 1 ; h
+    cmp  r1d, 16                           ; optimized for width = 16, which has the
+    jne pixel_sad_16x%1_sse2.skip_prologue ; rows laid out contiguously in memory
+    lea   r1, [3*r3]
+%endmacro
+
+%macro SAD_W16_AVX512_END 0
+    paddd          m0, m1
+    paddd          m0, m2
+    paddd          m0, m3
+%if mmsize == 64
+    vextracti32x8 ym1, m0, 1
+    paddd         ym0, ym1
+%endif
+    vextracti128  xm1, ym0, 1
+    paddd        xmm0, xm0, xm1
+    punpckhqdq   xmm1, xmm0, xmm0
+    paddd        xmm0, xmm1
+    movd          eax, xmm0
+    RET
+%endmacro
+
+INIT_YMM avx512
+cglobal pixel_sad_16x8, 4,4
+    SAD_W16_AVX512_START 8
+    movu          xm0, [r2]
+    vinserti128    m0, [r2+r3], 1
+    psadbw         m0, [r0+0*32]
+    movu          xm1, [r2+2*r3]
+    vinserti128    m1, [r2+r1], 1
+    lea            r2, [r2+4*r3]
+    psadbw         m1, [r0+1*32]
+    movu          xm2, [r2]
+    vinserti128    m2, [r2+r3], 1
+    psadbw         m2, [r0+2*32]
+    movu          xm3, [r2+2*r3]
+    vinserti128    m3, [r2+r1], 1
+    psadbw         m3, [r0+3*32]
+    SAD_W16_AVX512_END
+
+INIT_ZMM avx512
+cglobal pixel_sad_16x16, 4,4
+    SAD_W16_AVX512_START 16
+    movu          xm0, [r2]
+    vinserti128   ym0, [r2+r3], 1
+    movu          xm1, [r2+4*r3]
+    vinserti32x4   m0, [r2+2*r3], 2
+    vinserti32x4   m1, [r2+2*r1], 2
+    vinserti32x4   m0, [r2+r1], 3
+    lea            r2, [r2+4*r3]
+    vinserti32x4   m1, [r2+r3], 1
+    psadbw         m0, [r0+0*64]
+    vinserti32x4   m1, [r2+r1], 3
+    lea            r2, [r2+4*r3]
+    psadbw         m1, [r0+1*64]
+    movu          xm2, [r2]
+    vinserti128   ym2, [r2+r3], 1
+    movu          xm3, [r2+4*r3]
+    vinserti32x4   m2, [r2+2*r3], 2
+    vinserti32x4   m3, [r2+2*r1], 2
+    vinserti32x4   m2, [r2+r1], 3
+    lea            r2, [r2+4*r3]
+    vinserti32x4   m3, [r2+r3], 1
+    psadbw         m2, [r0+2*64]
+    vinserti32x4   m3, [r2+r1], 3
+    psadbw         m3, [r0+3*64]
+    SAD_W16_AVX512_END
 
 ;=============================================================================
 ; SAD x3/x4 MMX
 ;=============================================================================
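A note on the mask setup in SAD_W48_AVX512, as I read it (commentary, not part of the patch): kxnorb k1, k1, k1 sets the low eight mask bits, and kaddb k1, k1, k1 doubles that value, shifting a zero into bit 0, so k1 ends up as 0xFE. With merge-masking, vpbroadcast%3 m1 {k1}, [r0+r1] then overwrites every element except element 0, leaving row 0 (from the plain mov%3) in the lowest element and row 1 broadcast into the rest, so one psadbw covers two rows of the block. A minimal C++ sketch of just the mask arithmetic:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // kxnorb k1, k1, k1: XNOR of a register with itself is all ones.
        uint8_t k1 = (uint8_t)~(0u ^ 0u);  // 0xFF
        // kaddb k1, k1, k1: byte-wide 0xFF + 0xFF wraps to 0xFE, i.e. bit 0
        // clear (element 0 preserved) and bits 1..7 set (broadcast lands there).
        k1 = (uint8_t)(k1 + k1);
        printf("k1 = 0x%02X\n", k1);       // prints 0xFE
        return 0;
    }

The 16-wide kernels take a different approach: SAD_W16_AVX512_START stays on the AVX-512 path only when the first stride is exactly 16, so consecutive rows are contiguous and [r0+n*32] / [r0+n*64] each load two or four rows at once; any other stride jumps to the SSE2 code through the .skip_prologue label this patch adds to SAD_W16.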