# HG changeset patch
# User Vignesh Vijayakumar<vign...@multicorewareinc.com>
# Date 1507804221 -19800
#      Thu Oct 12 16:00:21 2017 +0530
# Node ID 260bcd977f4408260e97e38160a4c96a17ea0931
# Parent  a8aec1ea5e6f67ddca8d0c5e1d6b68f5b1f5d531
x86: AVX512 interp_8tap_horiz_pp_48x64 for high bit depth
AVX2 performance : 8.66x
AVX512 performance : 23.00x

diff -r a8aec1ea5e6f -r 260bcd977f44 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Oct 12 15:59:48 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Oct 12 16:00:21 2017 +0530
@@ -2531,6 +2531,7 @@
         p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512);
         p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512);
         p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512);
+        p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx512);
     }
 }
 
diff -r a8aec1ea5e6f -r 260bcd977f44 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Thu Oct 12 15:59:48 2017 +0530
+++ b/source/common/x86/ipfilter16.asm	Thu Oct 12 16:00:21 2017 +0530
@@ -6040,6 +6040,240 @@
     movu            [r2 + r3], m10
 %endmacro
 
+%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0
+    ; register map
+    ; m0 , m1, m2, m3 - interpolate coeff
+    ; m4 , m5 load shuffle order table
+    ; m6 - pd_32
+    ; m7 - zero
+    ; m8 - pw_pixel_max
+    ; m9 - store shuffle order table
+
+    movu            m10, [r0]
+    movu            m11, [r0 + 8]
+    movu            m12, [r0 + 16]
+
+    pshufb          m13, m10, m5
+    pshufb          m10, m4
+    pshufb          m14, m11, m5
+    pshufb          m11, m4
+    pshufb          m15, m12, m5
+    pshufb          m12, m4
+
+    pmaddwd         m10, m0
+    pmaddwd         m13, m1
+    paddd           m10, m13
+    pmaddwd         m13, m14, m3
+    pmaddwd         m16, m11, m2
+    paddd           m13, m16
+    paddd           m10, m13
+    paddd           m10, m6
+    psrad           m10, INTERP_SHIFT_PP
+
+    pmaddwd         m11, m0
+    pmaddwd         m14, m1
+    paddd           m11, m14
+    pmaddwd         m15, m3
+    pmaddwd         m12, m2
+    paddd           m12, m15
+    paddd           m11, m12
+    paddd           m11, m6
+    psrad           m11, INTERP_SHIFT_PP
+
+    packusdw        m10, m11
+    CLIPW           m10, m7, m8
+    pshufb          m10, m9
+    movu            [r2], m10
+
+    movu            m10, [r0 + r1]
+    movu            m11, [r0 + r1 + 8]
+    movu            m12, [r0 + r1 + 16]
+
+    pshufb          m13, m10, m5
+    pshufb          m10, m4
+    pshufb          m14, m11, m5
+    pshufb          m11, m4
+    pshufb          m15, m12, m5
+    pshufb          m12, m4
+
+    pmaddwd         m10, m0
+    pmaddwd         m13, m1
+    paddd           m10, m13
+    pmaddwd         m13, m14, m3
+    pmaddwd         m16, m11, m2
+    paddd           m13, m16
+    paddd           m10, m13
+    paddd           m10, m6
+    psrad           m10, INTERP_SHIFT_PP
+
+    pmaddwd         m11, m0
+    pmaddwd         m14, m1
+    paddd           m11, m14
+    pmaddwd         m14, m15, m3
+    pmaddwd         m16, m12, m2
+    paddd           m14, m16
+    paddd           m11, m14
+    paddd           m11, m6
+    psrad           m11, INTERP_SHIFT_PP
+
+    packusdw        m10, m11
+    CLIPW           m10, m7, m8
+    pshufb          m10, m9
+    movu            [r2 + r3], m10
+
+    movu            m10, [r0 + 2 * r1]
+    movu            m11, [r0 + 2 * r1 + 8]
+    movu            m12, [r0 + 2 * r1 + 16]
+
+    pshufb          m13, m10, m5
+    pshufb          m10, m4
+    pshufb          m14, m11, m5
+    pshufb          m11, m4
+    pshufb          m15, m12, m5
+    pshufb          m12, m4
+
+    pmaddwd         m10, m0
+    pmaddwd         m13, m1
+    paddd           m10, m13
+    pmaddwd         m13, m14, m3
+    pmaddwd         m16, m11, m2
+    paddd           m13, m16
+    paddd           m10, m13
+    paddd           m10, m6
+    psrad           m10, INTERP_SHIFT_PP
+
+    pmaddwd         m11, m0
+    pmaddwd         m14, m1
+    paddd           m11, m14
+    pmaddwd         m15, m3
+    pmaddwd         m12, m2
+    paddd           m12, m15
+    paddd           m11, m12
+    paddd           m11, m6
+    psrad           m11, INTERP_SHIFT_PP
+
+    packusdw        m10, m11
+    CLIPW           m10, m7, m8
+    pshufb          m10, m9
+    movu            [r2 + 2 * r3], m10
+
+    movu            m10, [r0 + r6]
+    movu            m11, [r0 + r6 + 8]
+    movu            m12, [r0 + r6 + 16]
+
+    pshufb          m13, m10, m5
+    pshufb          m10, m4
+    pshufb          m14, m11, m5
+    pshufb          m11, m4
+    pshufb          m15, m12, m5
+    pshufb          m12, m4
+
+    pmaddwd         m10, m0
+    pmaddwd         m13, m1
+    paddd           m10, m13
+    pmaddwd         m13, m14, m3
+    pmaddwd         m16, m11, m2
+    paddd           m13, m16
+    paddd           m10, m13
+    paddd           m10, m6
+    psrad           m10, INTERP_SHIFT_PP
+
+    pmaddwd         m11, m0
+    pmaddwd         m14, m1
+    paddd           m11, m14
+    pmaddwd         m14, m15, m3
+    pmaddwd         m16, m12, m2
+    paddd           m14, m16
+    paddd           m11, m14
+    paddd           m11, m6
+    psrad           m11, INTERP_SHIFT_PP
+
+    packusdw        m10, m11
+    CLIPW           m10, m7, m8
+    pshufb          m10, m9
+    movu            [r2 + r7], m10
+
+    movu            ym10, [r0 + mmsize]
+    vinserti32x8    m10, [r0 + r1 + mmsize], 1
+    movu            ym11, [r0 + mmsize + 8]
+    vinserti32x8    m11, [r0 + r1 + mmsize + 8], 1
+    movu            ym12, [r0 + mmsize + 16]
+    vinserti32x8    m12, [r0 + r1 + mmsize + 16], 1
+
+    pshufb          m13, m10, m5
+    pshufb          m10, m4
+    pshufb          m14, m11, m5
+    pshufb          m11, m4
+    pshufb          m15, m12, m5
+    pshufb          m12, m4
+
+    pmaddwd         m10, m0
+    pmaddwd         m13, m1
+    paddd           m10, m13
+    pmaddwd         m13, m14, m3
+    pmaddwd         m16, m11, m2
+    paddd           m13, m16
+    paddd           m10, m13
+    paddd           m10, m6
+    psrad           m10, INTERP_SHIFT_PP
+
+    pmaddwd         m11, m0
+    pmaddwd         m14, m1
+    paddd           m11, m14
+    pmaddwd         m15, m3
+    pmaddwd         m12, m2
+    paddd           m12, m15
+    paddd           m11, m12
+    paddd           m11, m6
+    psrad           m11, INTERP_SHIFT_PP
+
+    packusdw        m10, m11
+    CLIPW           m10, m7, m8
+    pshufb          m10, m9
+    movu            [r2 + mmsize], ym10
+    vextracti32x8   [r2 + r3 + mmsize], m10, 1
+
+    movu            ym10, [r0 + 2 * r1 + mmsize]
+    vinserti32x8    m10, [r0 + r6 + mmsize], 1
+    movu            ym11, [r0 + 2 * r1 + mmsize + 8]
+    vinserti32x8    m11, [r0 + r6 + mmsize + 8], 1
+    movu            ym12, [r0 + 2 * r1 + mmsize + 16]
+    vinserti32x8    m12, [r0 + r6 + mmsize + 16], 1
+
+    pshufb          m13, m10, m5
+    pshufb          m10, m4
+    pshufb          m14, m11, m5
+    pshufb          m11, m4
+    pshufb          m15, m12, m5
+    pshufb          m12, m4
+
+    pmaddwd         m10, m0
+    pmaddwd         m13, m1
+    paddd           m10, m13
+    pmaddwd         m13, m14, m3
+    pmaddwd         m16, m11, m2
+    paddd           m13, m16
+    paddd           m10, m13
+    paddd           m10, m6
+    psrad           m10, INTERP_SHIFT_PP
+
+    pmaddwd         m11, m0
+    pmaddwd         m14, m1
+    paddd           m11, m14
+    pmaddwd         m14, m15, m3
+    pmaddwd         m16, m12, m2
+    paddd           m14, m16
+    paddd           m11, m14
+    paddd           m11, m6
+    psrad           m11, INTERP_SHIFT_PP
+
+    packusdw        m10, m11
+    CLIPW           m10, m7, m8
+    pshufb          m10, m9
+    movu            [r2 + 2 * r3 + mmsize], ym10
+    vextracti32x8   [r2 + r7 + mmsize], m10, 1
+%endmacro
+
 %macro PROCESS_IPFILTER_LUMA_PP_64x2_AVX512 0
     ; register map
     ; m0 , m1, m2, m3 - interpolate coeff
@@ -6324,6 +6558,43 @@
 IPFILTER_LUMA_AVX512_64xN 32
 IPFILTER_LUMA_AVX512_64xN 48
 IPFILTER_LUMA_AVX512_64xN 64
+
+INIT_ZMM avx512
+cglobal interp_8tap_horiz_pp_48x64, 5,8,17
+    add             r1d, r1d
+    add             r3d, r3d
+    sub             r0, 6
+    mov             r4d, r4m
+    shl             r4d, 4
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeff]
+    vpbroadcastd    m0, [r5 + r4]
+    vpbroadcastd    m1, [r5 + r4 + 4]
+    vpbroadcastd    m2, [r5 + r4 + 8]
+    vpbroadcastd    m3, [r5 + r4 + 12]
+%else
+    vpbroadcastd    m0, [tab_LumaCoeff + r4]
+    vpbroadcastd    m1, [tab_LumaCoeff + r4 + 4]
+    vpbroadcastd    m2, [tab_LumaCoeff + r4 + 8]
+    vpbroadcastd    m3, [tab_LumaCoeff + r4 + 12]
+%endif
+    vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512]
+    vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512]
+    vbroadcasti32x8 m6, [pd_32]
+    pxor            m7, m7
+    vbroadcasti32x8 m8, [pw_pixel_max]
+    vbroadcasti32x8 m9, [interp8_hpp_shuf1_store_avx512]
+    lea             r6, [3 * r1]
+    lea             r7, [3 * r3]
+
+%rep 15
+    PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
+%endrep
+    PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
+    RET
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_luma_avx512 code end
 ;-------------------------------------------------------------------------------------------------------------
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel