# HG changeset patch
# User Vignesh Vijayakumar
# Date 1498474278 -19800
#      Mon Jun 26 16:21:18 2017 +0530
# Node ID 5309fe76c442d720d2d3419eefab11f2a1f9731a
# Parent  2e5128235d577806f16e5cf93266dcd7f4155a63
x86: AVX-512 pixel_avg_weight_w16

diff -r 2e5128235d57 -r 5309fe76c442 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jun 23 17:25:27 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jun 26 16:21:18 2017 +0530 @@ -3754,6 +3754,8 @@ p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512); p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512); + p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512); + p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx512); } #endif } diff -r 2e5128235d57 -r 5309fe76c442 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asm Fri Jun 23 17:25:27 2017 +0530 +++ b/source/common/x86/mc-a.asm Mon Jun 26 16:21:18 2017 +0530 @@ -3367,11 +3367,11 @@ %endmacro %endif -%macro AVG_END 0 +%macro AVG_END 0-1 2;rows + lea t2, [t2+t3*2*SIZEOF_PIXEL] lea t4, [t4+t5*2*SIZEOF_PIXEL] - lea t2, [t2+t3*2*SIZEOF_PIXEL] lea t0, [t0+t1*2*SIZEOF_PIXEL] - sub eax, 2 + sub eax, %1 jg .height_loop %ifidn movu,movq ; detect MMX EMMS @@ -3434,17 +3434,24 @@ %endmacro %macro BIWEIGHT_START_SSSE3 0 - movzx t6d, byte r6m ; FIXME x86_64 - mov t7d, 64 - sub t7d, t6d - shl t7d, 8 - add t6d, t7d - mova m4, [pw_512] - movd xm3, t6d + movzx t6d, byte r6m ; FIXME x86_64 +%if mmsize > 16 + vbroadcasti128 m4, [pw_512] +%else + mova m4, [pw_512] +%endif + lea t7d, [t6+(64<<8)] + shl t6d, 8 + sub t7d, t6d +%if cpuflag(avx512) + vpbroadcastw m3, t7d +%else + movd xm3, t7d %if cpuflag(avx2) - vpbroadcastw m3, xm3 + vpbroadcastw m3, xm3 %else - SPLATW m3, m3 ; weight_dst,src + SPLATW m3, m3 ; weight_dst,src +%endif %endif %endmacro @@ -3586,6 +3593,34 @@ vextracti128 [t0+t1], m0, 1 AVG_END +INIT_ZMM avx512 + cglobal pixel_avg_weight_w16 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu xm0, [t2] + movu xm1, [t4] + vinserti128 ym0, [t2+t3], 1 + vinserti128 ym1, [t4+t5], 1 + lea t2, [t2+t3*2] + lea t4, [t4+t5*2] + vinserti32x4 m0, [t2], 2 + vinserti32x4 m1, [t4], 2 + vinserti32x4 m0, [t2+t3], 3 + vinserti32x4 m1, [t4+t5], 3 + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, 
m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], xm0 + vextracti128 [t0+t1], ym0, 1 + lea t0, [t0+t1*2] + vextracti32x4 [t0], m0, 2 + vextracti32x4 [t0+t1], m0, 3 + AVG_END 4 + cglobal pixel_avg_weight_w32 BIWEIGHT_START AVG_START 5 @@ -4345,6 +4380,10 @@ AVGH 16, 8 AVGH 16, 4 +INIT_XMM avx512 +AVGH 16, 16 +AVGH 16, 8 + %endif ;HIGH_BIT_DEPTH ;------------------------------------------------------------------------------------------------------------------------------- _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel