# HG changeset patch # User Vignesh Vijayakumar <vign...@multicorewareinc.com> # Date 1511929966 -19800 # Wed Nov 29 10:02:46 2017 +0530 # Node ID 2ebaab8b8d9be6f8c1c89699f818f6426cc3fbea # Parent 240ae5a46e63d3bebd8a4db63a5662a4000d70a7 x86: AVX512 interp_8tap_horiz_pp_8xN for high bit depth
Size | AVX2 performance | AVX512 performance ---------------------------------------------- 8x4 | 6.77x | 11.23x 8x8 | 7.94x | 13.31x 8x16 | 8.10x | 16.28x 8x32 | 7.89x | 16.26x diff -r 240ae5a46e63 -r 2ebaab8b8d9b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Nov 28 17:37:57 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Nov 29 10:02:46 2017 +0530 @@ -2618,6 +2618,10 @@ p.cu[BLOCK_32x32].blockfill_s[ALIGNED] = PFX(blockfill_s_aligned_32x32_avx512); + p.pu[LUMA_8x4].luma_hpp = PFX(interp_8tap_horiz_pp_8x4_avx512); + p.pu[LUMA_8x8].luma_hpp = PFX(interp_8tap_horiz_pp_8x8_avx512); + p.pu[LUMA_8x16].luma_hpp = PFX(interp_8tap_horiz_pp_8x16_avx512); + p.pu[LUMA_8x32].luma_hpp = PFX(interp_8tap_horiz_pp_8x32_avx512); p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_avx512); p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_avx512); p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_avx512); diff -r 240ae5a46e63 -r 2ebaab8b8d9b source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Tue Nov 28 17:37:57 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Wed Nov 29 10:02:46 2017 +0530 @@ -9605,7 +9605,7 @@ ;------------------------------------------------------------------------------------------------------------- ;ipfilter_luma_avx512 code start ;------------------------------------------------------------------------------------------------------------- -%macro PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 0 +%macro PROCESS_IPFILTER_LUMA_PP_8x4_AVX512 0 ; register map ; m0 , m1, m2, m3 - interpolate coeff ; m4 , m5 load shuffle order table @@ -9614,12 +9614,21 @@ ; m8 - pw_pixel_max ; m9 - store shuffle order table - movu ym10, [r0] - vinserti32x8 m10, [r0 + r1], 1 - movu ym11, [r0 + 8] - vinserti32x8 m11, [r0 + r1 + 8], 1 - movu ym12, [r0 + 16] - vinserti32x8 m12, [r0 + r1 + 16], 1 + movu xm10, [r0] + movu xm11, [r0 + 8] + movu xm12, [r0 + 16] + + vinserti32x4 m10, [r0 + r1], 1 + vinserti32x4 
m11, [r0 + r1 + 8], 1 + vinserti32x4 m12, [r0 + r1 + 16], 1 + + vinserti32x4 m10, [r0 + 2 * r1], 2 + vinserti32x4 m11, [r0 + 2 * r1 + 8], 2 + vinserti32x4 m12, [r0 + 2 * r1 + 16], 2 + + vinserti32x4 m10, [r0 + r6], 3 + vinserti32x4 m11, [r0 + r6 + 8], 3 + vinserti32x4 m12, [r0 + r6 + 16], 3 pshufb m13, m10, m5 pshufb m10, m4 @@ -9651,15 +9660,27 @@ packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2], ym10 - vextracti32x8 [r2 + r3], m10, 1 - - movu ym10, [r0 + 2 * r1] - vinserti32x8 m10, [r0 + r6], 1 - movu ym11, [r0 + 2 * r1 + 8] - vinserti32x8 m11, [r0 + r6 + 8], 1 - movu ym12, [r0 + 2 * r1 + 16] - vinserti32x8 m12, [r0 + r6 + 16], 1 + movu [r2], xm10 + vextracti32x4 [r2 + r3], m10, 1 + vextracti32x4 [r2 + 2 * r3], m10, 2 + vextracti32x4 [r2 + r7], m10, 3 +%endmacro + +%macro PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 0 + ; register map + ; m0 , m1, m2, m3 - interpolate coeff + ; m4 , m5 load shuffle order table + ; m6 - pd_32 + ; m7 - zero + ; m8 - pw_pixel_max + ; m9 - store shuffle order table + + movu ym10, [r0] + vinserti32x8 m10, [r0 + r1], 1 + movu ym11, [r0 + 8] + vinserti32x8 m11, [r0 + r1 + 8], 1 + movu ym12, [r0 + 16] + vinserti32x8 m12, [r0 + r1 + 16], 1 pshufb m13, m10, m5 pshufb m10, m4 @@ -9681,32 +9702,25 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m14, m15, m3 - pmaddwd m16, m12, m2 - paddd m14, m16 - paddd m11, m14 + pmaddwd m15, m3 + pmaddwd m12, m2 + paddd m12, m15 + paddd m11, m12 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + 2 * r3], ym10 - vextracti32x8 [r2 + r7], m10, 1 -%endmacro - -%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0 - ; register map - ; m0 , m1, m2, m3 - interpolate coeff - ; m4 , m5 load shuffle order table - ; m6 - pd_32 - ; m7 - zero - ; m8 - pw_pixel_max - ; m9 - store shuffle order table - - movu m10, [r0] - movu m11, [r0 + 8] - movu m12, [r0 + 16] + movu [r2], ym10 + vextracti32x8 [r2 + r3], m10, 1 + + movu ym10, [r0 + 2 * r1] + vinserti32x8 
m10, [r0 + r6], 1 + movu ym11, [r0 + 2 * r1 + 8] + vinserti32x8 m11, [r0 + r6 + 8], 1 + movu ym12, [r0 + 2 * r1 + 16] + vinserti32x8 m12, [r0 + r6 + 16], 1 pshufb m13, m10, m5 pshufb m10, m4 @@ -9728,21 +9742,32 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m15, m3 - pmaddwd m12, m2 - paddd m12, m15 - paddd m11, m12 + pmaddwd m14, m15, m3 + pmaddwd m16, m12, m2 + paddd m14, m16 + paddd m11, m14 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2], m10 - - movu m10, [r0 + r1] - movu m11, [r0 + r1 + 8] - movu m12, [r0 + r1 + 16] + movu [r2 + 2 * r3], ym10 + vextracti32x8 [r2 + r7], m10, 1 +%endmacro + +%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0 + ; register map + ; m0 , m1, m2, m3 - interpolate coeff + ; m4 , m5 load shuffle order table + ; m6 - pd_32 + ; m7 - zero + ; m8 - pw_pixel_max + ; m9 - store shuffle order table + + movu m10, [r0] + movu m11, [r0 + 8] + movu m12, [r0 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -9764,31 +9789,21 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m14, m15, m3 - pmaddwd m16, m12, m2 - paddd m14, m16 - paddd m11, m14 + pmaddwd m15, m3 + pmaddwd m12, m2 + paddd m12, m15 + paddd m11, m12 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + r3], m10 -%endmacro - -%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0 - ; register map - ; m0 , m1, m2, m3 - interpolate coeff - ; m4 , m5 load shuffle order table - ; m6 - pd_32 - ; m7 - zero - ; m8 - pw_pixel_max - ; m9 - store shuffle order table - - movu m10, [r0] - movu m11, [r0 + 8] - movu m12, [r0 + 16] + movu [r2], m10 + + movu m10, [r0 + r1] + movu m11, [r0 + r1 + 8] + movu m12, [r0 + r1 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -9810,21 +9825,31 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m15, m3 - pmaddwd m12, m2 - paddd m12, m15 - paddd m11, m12 + pmaddwd m14, m15, m3 + pmaddwd m16, m12, m2 + paddd m14, m16 + paddd m11, m14 paddd m11, m6 
psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2], m10 - - movu m10, [r0 + r1] - movu m11, [r0 + r1 + 8] - movu m12, [r0 + r1 + 16] + movu [r2 + r3], m10 +%endmacro + +%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0 + ; register map + ; m0 , m1, m2, m3 - interpolate coeff + ; m4 , m5 load shuffle order table + ; m6 - pd_32 + ; m7 - zero + ; m8 - pw_pixel_max + ; m9 - store shuffle order table + + movu m10, [r0] + movu m11, [r0 + 8] + movu m12, [r0 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -9846,21 +9871,21 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m14, m15, m3 - pmaddwd m16, m12, m2 - paddd m14, m16 - paddd m11, m14 + pmaddwd m15, m3 + pmaddwd m12, m2 + paddd m12, m15 + paddd m11, m12 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + r3], m10 - - movu m10, [r0 + 2 * r1] - movu m11, [r0 + 2 * r1 + 8] - movu m12, [r0 + 2 * r1 + 16] + movu [r2], m10 + + movu m10, [r0 + r1] + movu m11, [r0 + r1 + 8] + movu m12, [r0 + r1 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -9882,21 +9907,21 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m15, m3 - pmaddwd m12, m2 - paddd m12, m15 - paddd m11, m12 + pmaddwd m14, m15, m3 + pmaddwd m16, m12, m2 + paddd m14, m16 + paddd m11, m14 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + 2 * r3], m10 - - movu m10, [r0 + r6] - movu m11, [r0 + r6 + 8] - movu m12, [r0 + r6 + 16] + movu [r2 + r3], m10 + + movu m10, [r0 + 2 * r1] + movu m11, [r0 + 2 * r1 + 8] + movu m12, [r0 + 2 * r1 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -9918,24 +9943,21 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m14, m15, m3 - pmaddwd m16, m12, m2 - paddd m14, m16 - paddd m11, m14 + pmaddwd m15, m3 + pmaddwd m12, m2 + paddd m12, m15 + paddd m11, m12 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + r7], m10 - - movu ym10, [r0 
+ mmsize] - vinserti32x8 m10, [r0 + r1 + mmsize], 1 - movu ym11, [r0 + mmsize + 8] - vinserti32x8 m11, [r0 + r1 + mmsize + 8], 1 - movu ym12, [r0 + mmsize + 16] - vinserti32x8 m12, [r0 + r1 + mmsize + 16], 1 + movu [r2 + 2 * r3], m10 + + movu m10, [r0 + r6] + movu m11, [r0 + r6 + 8] + movu m12, [r0 + r6 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -9957,25 +9979,24 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m15, m3 - pmaddwd m12, m2 - paddd m12, m15 - paddd m11, m12 + pmaddwd m14, m15, m3 + pmaddwd m16, m12, m2 + paddd m14, m16 + paddd m11, m14 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + mmsize], ym10 - vextracti32x8 [r2 + r3 + mmsize], m10, 1 - - movu ym10, [r0 + 2 * r1 + mmsize] - vinserti32x8 m10, [r0 + r6 + mmsize], 1 - movu ym11, [r0 + 2 * r1 + mmsize + 8] - vinserti32x8 m11, [r0 + r6 + mmsize + 8], 1 - movu ym12, [r0 + 2 * r1 + mmsize + 16] - vinserti32x8 m12, [r0 + r6 + mmsize + 16], 1 + movu [r2 + r7], m10 + + movu ym10, [r0 + mmsize] + vinserti32x8 m10, [r0 + r1 + mmsize], 1 + movu ym11, [r0 + mmsize + 8] + vinserti32x8 m11, [r0 + r1 + mmsize + 8], 1 + movu ym12, [r0 + mmsize + 16] + vinserti32x8 m12, [r0 + r1 + mmsize + 16], 1 pshufb m13, m10, m5 pshufb m10, m4 @@ -9997,32 +10018,25 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m14, m15, m3 - pmaddwd m16, m12, m2 - paddd m14, m16 - paddd m11, m14 + pmaddwd m15, m3 + pmaddwd m12, m2 + paddd m12, m15 + paddd m11, m12 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + 2 * r3 + mmsize], ym10 - vextracti32x8 [r2 + r7 + mmsize], m10, 1 -%endmacro - -%macro PROCESS_IPFILTER_LUMA_PP_64x2_AVX512 0 - ; register map - ; m0 , m1, m2, m3 - interpolate coeff - ; m4 , m5 load shuffle order table - ; m6 - pd_32 - ; m7 - zero - ; m8 - pw_pixel_max - ; m9 - store shuffle order table - - movu m10, [r0] - movu m11, [r0 + 8] - movu m12, [r0 + 16] + movu [r2 + mmsize], ym10 + 
vextracti32x8 [r2 + r3 + mmsize], m10, 1 + + movu ym10, [r0 + 2 * r1 + mmsize] + vinserti32x8 m10, [r0 + r6 + mmsize], 1 + movu ym11, [r0 + 2 * r1 + mmsize + 8] + vinserti32x8 m11, [r0 + r6 + mmsize + 8], 1 + movu ym12, [r0 + 2 * r1 + mmsize + 16] + vinserti32x8 m12, [r0 + r6 + mmsize + 16], 1 pshufb m13, m10, m5 pshufb m10, m4 @@ -10044,21 +10058,32 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m15, m3 - pmaddwd m12, m2 - paddd m12, m15 - paddd m11, m12 + pmaddwd m14, m15, m3 + pmaddwd m16, m12, m2 + paddd m14, m16 + paddd m11, m14 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2], m10 - - movu m10, [r0 + mmsize] - movu m11, [r0 + mmsize + 8] - movu m12, [r0 + mmsize + 16] + movu [r2 + 2 * r3 + mmsize], ym10 + vextracti32x8 [r2 + r7 + mmsize], m10, 1 +%endmacro + +%macro PROCESS_IPFILTER_LUMA_PP_64x2_AVX512 0 + ; register map + ; m0 , m1, m2, m3 - interpolate coeff + ; m4 , m5 load shuffle order table + ; m6 - pd_32 + ; m7 - zero + ; m8 - pw_pixel_max + ; m9 - store shuffle order table + + movu m10, [r0] + movu m11, [r0 + 8] + movu m12, [r0 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -10090,11 +10115,11 @@ packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + mmsize], m10 - - movu m10, [r0 + r1] - movu m11, [r0 + r1 + 8] - movu m12, [r0 + r1 + 16] + movu [r2], m10 + + movu m10, [r0 + mmsize] + movu m11, [r0 + mmsize + 8] + movu m12, [r0 + mmsize + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -10116,21 +10141,21 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m14, m15, m3 - pmaddwd m16, m12, m2 - paddd m14, m16 - paddd m11, m14 + pmaddwd m15, m3 + pmaddwd m12, m2 + paddd m12, m15 + paddd m11, m12 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + r3], m10 - - movu m10, [r0 + r1 + mmsize] - movu m11, [r0 + r1 + mmsize + 8] - movu m12, [r0 + r1 + mmsize + 16] + movu [r2 + mmsize], m10 + + movu m10, [r0 + r1] + movu m11, [r0 + r1 
+ 8] + movu m12, [r0 + r1 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -10162,9 +10187,91 @@ packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 + movu [r2 + r3], m10 + + movu m10, [r0 + r1 + mmsize] + movu m11, [r0 + r1 + mmsize + 8] + movu m12, [r0 + r1 + mmsize + 16] + + pshufb m13, m10, m5 + pshufb m10, m4 + pshufb m14, m11, m5 + pshufb m11, m4 + pshufb m15, m12, m5 + pshufb m12, m4 + + pmaddwd m10, m0 + pmaddwd m13, m1 + paddd m10, m13 + pmaddwd m13, m14, m3 + pmaddwd m16, m11, m2 + paddd m13, m16 + paddd m10, m13 + paddd m10, m6 + psrad m10, INTERP_SHIFT_PP + + pmaddwd m11, m0 + pmaddwd m14, m1 + paddd m11, m14 + pmaddwd m14, m15, m3 + pmaddwd m16, m12, m2 + paddd m14, m16 + paddd m11, m14 + paddd m11, m6 + psrad m11, INTERP_SHIFT_PP + + packusdw m10, m11 + CLIPW m10, m7, m8 + pshufb m10, m9 movu [r2 + r3 + mmsize], m10 %endmacro +%macro IPFILTER_LUMA_AVX512_8xN 1 +INIT_ZMM avx512 +cglobal interp_8tap_horiz_pp_8x%1, 5, 8, 17 + add r1d, r1d + add r3d, r3d + sub r0, 6 + mov r4d, r4m + shl r4d, 4 + +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4] + vpbroadcastd m1, [r5 + r4 + 4] + vpbroadcastd m2, [r5 + r4 + 8] + vpbroadcastd m3, [r5 + r4 + 12] +%else + vpbroadcastd m0, [tab_LumaCoeff + r4] + vpbroadcastd m1, [tab_LumaCoeff + r4 + 4] + vpbroadcastd m2, [tab_LumaCoeff + r4 + 8] + vpbroadcastd m3, [tab_LumaCoeff + r4 + 12] +%endif + vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512] + vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512] + vbroadcasti32x8 m6, [pd_32] + pxor m7, m7 + vbroadcasti32x8 m8, [pw_pixel_max] + vbroadcasti32x8 m9, [interp8_hpp_shuf1_store_avx512] + lea r6, [3 * r1] + lea r7, [3 * r3] + +%rep %1/4 - 1 + PROCESS_IPFILTER_LUMA_PP_8x4_AVX512 + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] +%endrep + PROCESS_IPFILTER_LUMA_PP_8x4_AVX512 + RET +%endmacro + +%if ARCH_X86_64 + IPFILTER_LUMA_AVX512_8xN 4 + IPFILTER_LUMA_AVX512_8xN 8 + IPFILTER_LUMA_AVX512_8xN 16 + IPFILTER_LUMA_AVX512_8xN 32 +%endif + %macro IPFILTER_LUMA_AVX512_16xN 
1 INIT_ZMM avx512 cglobal interp_8tap_horiz_pp_16x%1, 5,8,17 _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel