# HG changeset patch
# User Vignesh Vijayakumar <vign...@multicorewareinc.com>
# Date 1507804188 -19800
#      Thu Oct 12 15:59:48 2017 +0530
# Node ID a8aec1ea5e6f67ddca8d0c5e1d6b68f5b1f5d531
# Parent  2bfc582cecbf51da0c29d130402f1983526ff6db
x86: AVX512 interp_8tap_horiz_pp_16xN for high bit depth
Size | AVX2 performance | AVX512 performance ---------------------------------------------- 16x4 | 9.20x | 17.30x 16x8 | 9.52x | 22.97x 16x12 | 8.97x | 20.37x 16x16 | 8.82x | 21.81x 16x32 | 9.16x | 23.05x 16x64 | 9.13x | 23.58x diff -r 2bfc582cecbf -r a8aec1ea5e6f source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Oct 12 15:59:20 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Oct 12 15:59:48 2017 +0530 @@ -2516,6 +2516,12 @@ p.cu[BLOCK_32x32].blockfill_s[ALIGNED] = PFX(blockfill_s_aligned_32x32_avx512); + p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_avx512); + p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_avx512); + p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_avx512); + p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_avx512); + p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_avx512); + p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_avx512); p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx512); p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx512); p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx512); diff -r 2bfc582cecbf -r a8aec1ea5e6f source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Thu Oct 12 15:59:20 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Thu Oct 12 15:59:48 2017 +0530 @@ -5868,7 +5868,7 @@ ;------------------------------------------------------------------------------------------------------------- ;ipfilter_luma_avx512 code start ;------------------------------------------------------------------------------------------------------------- -%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0 +%macro PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 0 ; register map ; m0 , m1, m2, m3 - interpolate coeff ; m4 , m5 load shuffle order table @@ -5877,9 +5877,12 @@ ; m8 - pw_pixel_max ; m9 - store shuffle order table - movu m10, [r0] - movu m11, [r0 + 8] - movu m12, [r0 + 16] + movu ym10, [r0] + 
vinserti32x8 m10, [r0 + r1], 1 + movu ym11, [r0 + 8] + vinserti32x8 m11, [r0 + r1 + 8], 1 + movu ym12, [r0 + 16] + vinserti32x8 m12, [r0 + r1 + 16], 1 pshufb m13, m10, m5 pshufb m10, m4 @@ -5911,11 +5914,15 @@ packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2], m10 - - movu m10, [r0 + r1] - movu m11, [r0 + r1 + 8] - movu m12, [r0 + r1 + 16] + movu [r2], ym10 + vextracti32x8 [r2 + r3], m10, 1 + + movu ym10, [r0 + 2 * r1] + vinserti32x8 m10, [r0 + r6], 1 + movu ym11, [r0 + 2 * r1 + 8] + vinserti32x8 m11, [r0 + r6 + 8], 1 + movu ym12, [r0 + 2 * r1 + 16] + vinserti32x8 m12, [r0 + r6 + 16], 1 pshufb m13, m10, m5 pshufb m10, m4 @@ -5947,10 +5954,11 @@ packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + r3], m10 + movu [r2 + 2 * r3], ym10 + vextracti32x8 [r2 + r7], m10, 1 %endmacro -%macro PROCESS_IPFILTER_LUMA_PP_64x2_AVX512 0 +%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0 ; register map ; m0 , m1, m2, m3 - interpolate coeff ; m4 , m5 load shuffle order table @@ -5995,9 +6003,9 @@ pshufb m10, m9 movu [r2], m10 - movu m10, [r0 + mmsize] - movu m11, [r0 + mmsize + 8] - movu m12, [r0 + mmsize + 16] + movu m10, [r0 + r1] + movu m11, [r0 + r1 + 8] + movu m12, [r0 + r1 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -6019,21 +6027,31 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m15, m3 - pmaddwd m12, m2 - paddd m12, m15 - paddd m11, m12 + pmaddwd m14, m15, m3 + pmaddwd m16, m12, m2 + paddd m14, m16 + paddd m11, m14 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + mmsize], m10 - - movu m10, [r0 + r1] - movu m11, [r0 + r1 + 8] - movu m12, [r0 + r1 + 16] + movu [r2 + r3], m10 +%endmacro + +%macro PROCESS_IPFILTER_LUMA_PP_64x2_AVX512 0 + ; register map + ; m0 , m1, m2, m3 - interpolate coeff + ; m4 , m5 load shuffle order table + ; m6 - pd_32 + ; m7 - zero + ; m8 - pw_pixel_max + ; m9 - store shuffle order table + + movu m10, [r0] + movu m11, [r0 + 8] + movu m12, [r0 + 16] pshufb m13, m10, 
m5 pshufb m10, m4 @@ -6055,21 +6073,21 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m14, m15, m3 - pmaddwd m16, m12, m2 - paddd m14, m16 - paddd m11, m14 + pmaddwd m15, m3 + pmaddwd m12, m2 + paddd m12, m15 + paddd m11, m12 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + r3], m10 - - movu m10, [r0 + r1 + mmsize] - movu m11, [r0 + r1 + mmsize + 8] - movu m12, [r0 + r1 + mmsize + 16] + movu [r2], m10 + + movu m10, [r0 + mmsize] + movu m11, [r0 + mmsize + 8] + movu m12, [r0 + mmsize + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -6091,6 +6109,42 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 + pmaddwd m15, m3 + pmaddwd m12, m2 + paddd m12, m15 + paddd m11, m12 + paddd m11, m6 + psrad m11, INTERP_SHIFT_PP + + packusdw m10, m11 + CLIPW m10, m7, m8 + pshufb m10, m9 + movu [r2 + mmsize], m10 + + movu m10, [r0 + r1] + movu m11, [r0 + r1 + 8] + movu m12, [r0 + r1 + 16] + + pshufb m13, m10, m5 + pshufb m10, m4 + pshufb m14, m11, m5 + pshufb m11, m4 + pshufb m15, m12, m5 + pshufb m12, m4 + + pmaddwd m10, m0 + pmaddwd m13, m1 + paddd m10, m13 + pmaddwd m13, m14, m3 + pmaddwd m16, m11, m2 + paddd m13, m16 + paddd m10, m13 + paddd m10, m6 + psrad m10, INTERP_SHIFT_PP + + pmaddwd m11, m0 + pmaddwd m14, m1 + paddd m11, m14 pmaddwd m14, m15, m3 pmaddwd m16, m12, m2 paddd m14, m16 @@ -6101,12 +6155,48 @@ packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 + movu [r2 + r3], m10 + + movu m10, [r0 + r1 + mmsize] + movu m11, [r0 + r1 + mmsize + 8] + movu m12, [r0 + r1 + mmsize + 16] + + pshufb m13, m10, m5 + pshufb m10, m4 + pshufb m14, m11, m5 + pshufb m11, m4 + pshufb m15, m12, m5 + pshufb m12, m4 + + pmaddwd m10, m0 + pmaddwd m13, m1 + paddd m10, m13 + pmaddwd m13, m14, m3 + pmaddwd m16, m11, m2 + paddd m13, m16 + paddd m10, m13 + paddd m10, m6 + psrad m10, INTERP_SHIFT_PP + + pmaddwd m11, m0 + pmaddwd m14, m1 + paddd m11, m14 + pmaddwd m14, m15, m3 + pmaddwd m16, m12, m2 + paddd m14, m16 + paddd m11, m14 + paddd 
m11, m6 + psrad m11, INTERP_SHIFT_PP + + packusdw m10, m11 + CLIPW m10, m7, m8 + pshufb m10, m9 movu [r2 + r3 + mmsize], m10 %endmacro INIT_ZMM avx512 -%macro IPFILTER_LUMA_AVX512_32xN 1 -cglobal interp_8tap_horiz_pp_32x%1, 5,6,17 +%macro IPFILTER_LUMA_AVX512_16xN 1 +cglobal interp_8tap_horiz_pp_16x%1, 5,8,17 add r1d, r1d add r3d, r3d sub r0, 6 @@ -6131,25 +6221,28 @@ pxor m7, m7 vbroadcasti32x8 m8, [pw_pixel_max] vbroadcasti32x8 m9, [interp8_hpp_shuf1_store_avx512] - -%rep %1/2 - 1 - PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] + lea r6, [3 * r1] + lea r7, [3 * r3] + +%rep %1/4 - 1 + PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] %endrep - PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 + PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 RET %endmacro -IPFILTER_LUMA_AVX512_32xN 8 -IPFILTER_LUMA_AVX512_32xN 16 -IPFILTER_LUMA_AVX512_32xN 24 -IPFILTER_LUMA_AVX512_32xN 32 -IPFILTER_LUMA_AVX512_32xN 64 +IPFILTER_LUMA_AVX512_16xN 4 +IPFILTER_LUMA_AVX512_16xN 8 +IPFILTER_LUMA_AVX512_16xN 12 +IPFILTER_LUMA_AVX512_16xN 16 +IPFILTER_LUMA_AVX512_16xN 32 +IPFILTER_LUMA_AVX512_16xN 64 INIT_ZMM avx512 -%macro IPFILTER_LUMA_AVX512_64xN 1 -cglobal interp_8tap_horiz_pp_64x%1, 5,6,17 +%macro IPFILTER_LUMA_AVX512_32xN 1 +cglobal interp_8tap_horiz_pp_32x%1, 5,6,17 add r1d, r1d add r3d, r3d sub r0, 6 @@ -6176,6 +6269,49 @@ vbroadcasti32x8 m9, [interp8_hpp_shuf1_store_avx512] %rep %1/2 - 1 + PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] +%endrep + PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 + RET +%endmacro + +IPFILTER_LUMA_AVX512_32xN 8 +IPFILTER_LUMA_AVX512_32xN 16 +IPFILTER_LUMA_AVX512_32xN 24 +IPFILTER_LUMA_AVX512_32xN 32 +IPFILTER_LUMA_AVX512_32xN 64 + +INIT_ZMM avx512 +%macro IPFILTER_LUMA_AVX512_64xN 1 +cglobal interp_8tap_horiz_pp_64x%1, 5,6,17 + add r1d, r1d + add r3d, r3d + sub r0, 6 + mov r4d, r4m + shl r4d, 4 + +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4] + 
vpbroadcastd m1, [r5 + r4 + 4] + vpbroadcastd m2, [r5 + r4 + 8] + vpbroadcastd m3, [r5 + r4 + 12] +%else + vpbroadcastd m0, [tab_LumaCoeff + r4] + vpbroadcastd m1, [tab_LumaCoeff + r4 + 4] + vpbroadcastd m2, [tab_LumaCoeff + r4 + 8] + vpbroadcastd m3, [tab_LumaCoeff + r4 + 12] +%endif + vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512] + vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512] + vbroadcasti32x8 m6, [pd_32] + pxor m7, m7 + vbroadcasti32x8 m8, [pw_pixel_max] + vbroadcasti32x8 m9, [interp8_hpp_shuf1_store_avx512] + +%rep %1/2 - 1 PROCESS_IPFILTER_LUMA_PP_64x2_AVX512 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel