# HG changeset patch
# User Vignesh Vijayakumar<vign...@multicorewareinc.com>
# Date 1511937076 -19800
#      Wed Nov 29 12:01:16 2017 +0530
# Node ID 8eeff916ebe608526f167177a19c4516266ba513
# Parent  2ebaab8b8d9be6f8c1c89699f818f6426cc3fbea
x86: AVX512 interp_8tap_horiz_pp_24x32 for high bit depth
AVX2 performance : 9.14x AVX512 performance : 20.85x diff -r 2ebaab8b8d9b -r 8eeff916ebe6 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Nov 29 10:02:46 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Nov 29 12:01:16 2017 +0530 @@ -2628,6 +2628,7 @@ p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_avx512); p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_avx512); p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_avx512); + p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_avx512); p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx512); p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx512); p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx512); diff -r 2ebaab8b8d9b -r 8eeff916ebe6 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Wed Nov 29 10:02:46 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Wed Nov 29 12:01:16 2017 +0530 @@ -9756,7 +9756,7 @@ vextracti32x8 [r2 + r7], m10, 1 %endmacro -%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0 +%macro PROCESS_IPFILTER_LUMA_PP_24x4_AVX512 0 ; register map ; m0 , m1, m2, m3 - interpolate coeff ; m4 , m5 load shuffle order table @@ -9765,9 +9765,23 @@ ; m8 - pw_pixel_max ; m9 - store shuffle order table - movu m10, [r0] - movu m11, [r0 + 8] - movu m12, [r0 + 16] + PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 + + movu xm10, [r0 + mmsize/2] + movu xm11, [r0 + mmsize/2 + 8] + movu xm12, [r0 + mmsize/2 + 16] + + vinserti32x4 m10, [r0 + r1 + mmsize/2], 1 + vinserti32x4 m11, [r0 + r1 + mmsize/2 + 8], 1 + vinserti32x4 m12, [r0 + r1 + mmsize/2 + 16], 1 + + vinserti32x4 m10, [r0 + 2 * r1 + mmsize/2], 2 + vinserti32x4 m11, [r0 + 2 * r1 + mmsize/2 + 8], 2 + vinserti32x4 m12, [r0 + 2 * r1 + mmsize/2 + 16], 2 + + vinserti32x4 m10, [r0 + r6 + mmsize/2], 3 + vinserti32x4 m11, [r0 + r6 + mmsize/2 + 8], 3 + vinserti32x4 m12, [r0 + r6 + mmsize/2 + 16], 3 pshufb m13, m10, m5 pshufb m10, m4 @@ -9799,11 +9813,24 
@@ packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2], m10 - - movu m10, [r0 + r1] - movu m11, [r0 + r1 + 8] - movu m12, [r0 + r1 + 16] + movu [r2 + mmsize/2], xm10 + vextracti32x4 [r2 + r3 + mmsize/2], m10, 1 + vextracti32x4 [r2 + 2 * r3 + mmsize/2], m10, 2 + vextracti32x4 [r2 + r7 + mmsize/2], m10, 3 +%endmacro + +%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0 + ; register map + ; m0 , m1, m2, m3 - interpolate coeff + ; m4 , m5 load shuffle order table + ; m6 - pd_32 + ; m7 - zero + ; m8 - pw_pixel_max + ; m9 - store shuffle order table + + movu m10, [r0] + movu m11, [r0 + 8] + movu m12, [r0 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -9825,31 +9852,21 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m14, m15, m3 - pmaddwd m16, m12, m2 - paddd m14, m16 - paddd m11, m14 + pmaddwd m15, m3 + pmaddwd m12, m2 + paddd m12, m15 + paddd m11, m12 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + r3], m10 -%endmacro - -%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0 - ; register map - ; m0 , m1, m2, m3 - interpolate coeff - ; m4 , m5 load shuffle order table - ; m6 - pd_32 - ; m7 - zero - ; m8 - pw_pixel_max - ; m9 - store shuffle order table - - movu m10, [r0] - movu m11, [r0 + 8] - movu m12, [r0 + 16] + movu [r2], m10 + + movu m10, [r0 + r1] + movu m11, [r0 + r1 + 8] + movu m12, [r0 + r1 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -9871,21 +9888,31 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m15, m3 - pmaddwd m12, m2 - paddd m12, m15 - paddd m11, m12 + pmaddwd m14, m15, m3 + pmaddwd m16, m12, m2 + paddd m14, m16 + paddd m11, m14 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2], m10 - - movu m10, [r0 + r1] - movu m11, [r0 + r1 + 8] - movu m12, [r0 + r1 + 16] + movu [r2 + r3], m10 +%endmacro + +%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0 + ; register map + ; m0 , m1, m2, m3 - interpolate coeff + ; m4 , m5 load shuffle order 
table + ; m6 - pd_32 + ; m7 - zero + ; m8 - pw_pixel_max + ; m9 - store shuffle order table + + movu m10, [r0] + movu m11, [r0 + 8] + movu m12, [r0 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -9907,21 +9934,21 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m14, m15, m3 - pmaddwd m16, m12, m2 - paddd m14, m16 - paddd m11, m14 + pmaddwd m15, m3 + pmaddwd m12, m2 + paddd m12, m15 + paddd m11, m12 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + r3], m10 - - movu m10, [r0 + 2 * r1] - movu m11, [r0 + 2 * r1 + 8] - movu m12, [r0 + 2 * r1 + 16] + movu [r2], m10 + + movu m10, [r0 + r1] + movu m11, [r0 + r1 + 8] + movu m12, [r0 + r1 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -9943,21 +9970,21 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m15, m3 - pmaddwd m12, m2 - paddd m12, m15 - paddd m11, m12 + pmaddwd m14, m15, m3 + pmaddwd m16, m12, m2 + paddd m14, m16 + paddd m11, m14 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + 2 * r3], m10 - - movu m10, [r0 + r6] - movu m11, [r0 + r6 + 8] - movu m12, [r0 + r6 + 16] + movu [r2 + r3], m10 + + movu m10, [r0 + 2 * r1] + movu m11, [r0 + 2 * r1 + 8] + movu m12, [r0 + 2 * r1 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -9979,24 +10006,21 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m14, m15, m3 - pmaddwd m16, m12, m2 - paddd m14, m16 - paddd m11, m14 + pmaddwd m15, m3 + pmaddwd m12, m2 + paddd m12, m15 + paddd m11, m12 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + r7], m10 - - movu ym10, [r0 + mmsize] - vinserti32x8 m10, [r0 + r1 + mmsize], 1 - movu ym11, [r0 + mmsize + 8] - vinserti32x8 m11, [r0 + r1 + mmsize + 8], 1 - movu ym12, [r0 + mmsize + 16] - vinserti32x8 m12, [r0 + r1 + mmsize + 16], 1 + movu [r2 + 2 * r3], m10 + + movu m10, [r0 + r6] + movu m11, [r0 + r6 + 8] + movu m12, [r0 + r6 + 16] pshufb m13, m10, m5 pshufb 
m10, m4 @@ -10018,25 +10042,24 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m15, m3 - pmaddwd m12, m2 - paddd m12, m15 - paddd m11, m12 + pmaddwd m14, m15, m3 + pmaddwd m16, m12, m2 + paddd m14, m16 + paddd m11, m14 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + mmsize], ym10 - vextracti32x8 [r2 + r3 + mmsize], m10, 1 - - movu ym10, [r0 + 2 * r1 + mmsize] - vinserti32x8 m10, [r0 + r6 + mmsize], 1 - movu ym11, [r0 + 2 * r1 + mmsize + 8] - vinserti32x8 m11, [r0 + r6 + mmsize + 8], 1 - movu ym12, [r0 + 2 * r1 + mmsize + 16] - vinserti32x8 m12, [r0 + r6 + mmsize + 16], 1 + movu [r2 + r7], m10 + + movu ym10, [r0 + mmsize] + vinserti32x8 m10, [r0 + r1 + mmsize], 1 + movu ym11, [r0 + mmsize + 8] + vinserti32x8 m11, [r0 + r1 + mmsize + 8], 1 + movu ym12, [r0 + mmsize + 16] + vinserti32x8 m12, [r0 + r1 + mmsize + 16], 1 pshufb m13, m10, m5 pshufb m10, m4 @@ -10058,32 +10081,25 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m14, m15, m3 - pmaddwd m16, m12, m2 - paddd m14, m16 - paddd m11, m14 + pmaddwd m15, m3 + pmaddwd m12, m2 + paddd m12, m15 + paddd m11, m12 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + 2 * r3 + mmsize], ym10 - vextracti32x8 [r2 + r7 + mmsize], m10, 1 -%endmacro - -%macro PROCESS_IPFILTER_LUMA_PP_64x2_AVX512 0 - ; register map - ; m0 , m1, m2, m3 - interpolate coeff - ; m4 , m5 load shuffle order table - ; m6 - pd_32 - ; m7 - zero - ; m8 - pw_pixel_max - ; m9 - store shuffle order table - - movu m10, [r0] - movu m11, [r0 + 8] - movu m12, [r0 + 16] + movu [r2 + mmsize], ym10 + vextracti32x8 [r2 + r3 + mmsize], m10, 1 + + movu ym10, [r0 + 2 * r1 + mmsize] + vinserti32x8 m10, [r0 + r6 + mmsize], 1 + movu ym11, [r0 + 2 * r1 + mmsize + 8] + vinserti32x8 m11, [r0 + r6 + mmsize + 8], 1 + movu ym12, [r0 + 2 * r1 + mmsize + 16] + vinserti32x8 m12, [r0 + r6 + mmsize + 16], 1 pshufb m13, m10, m5 pshufb m10, m4 @@ -10105,21 
+10121,32 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m15, m3 - pmaddwd m12, m2 - paddd m12, m15 - paddd m11, m12 + pmaddwd m14, m15, m3 + pmaddwd m16, m12, m2 + paddd m14, m16 + paddd m11, m14 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2], m10 - - movu m10, [r0 + mmsize] - movu m11, [r0 + mmsize + 8] - movu m12, [r0 + mmsize + 16] + movu [r2 + 2 * r3 + mmsize], ym10 + vextracti32x8 [r2 + r7 + mmsize], m10, 1 +%endmacro + +%macro PROCESS_IPFILTER_LUMA_PP_64x2_AVX512 0 + ; register map + ; m0 , m1, m2, m3 - interpolate coeff + ; m4 , m5 load shuffle order table + ; m6 - pd_32 + ; m7 - zero + ; m8 - pw_pixel_max + ; m9 - store shuffle order table + + movu m10, [r0] + movu m11, [r0 + 8] + movu m12, [r0 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -10151,11 +10178,11 @@ packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + mmsize], m10 - - movu m10, [r0 + r1] - movu m11, [r0 + r1 + 8] - movu m12, [r0 + r1 + 16] + movu [r2], m10 + + movu m10, [r0 + mmsize] + movu m11, [r0 + mmsize + 8] + movu m12, [r0 + mmsize + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -10177,21 +10204,21 @@ pmaddwd m11, m0 pmaddwd m14, m1 paddd m11, m14 - pmaddwd m14, m15, m3 - pmaddwd m16, m12, m2 - paddd m14, m16 - paddd m11, m14 + pmaddwd m15, m3 + pmaddwd m12, m2 + paddd m12, m15 + paddd m11, m12 paddd m11, m6 psrad m11, INTERP_SHIFT_PP packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 - movu [r2 + r3], m10 - - movu m10, [r0 + r1 + mmsize] - movu m11, [r0 + r1 + mmsize + 8] - movu m12, [r0 + r1 + mmsize + 16] + movu [r2 + mmsize], m10 + + movu m10, [r0 + r1] + movu m11, [r0 + r1 + 8] + movu m12, [r0 + r1 + 16] pshufb m13, m10, m5 pshufb m10, m4 @@ -10223,6 +10250,42 @@ packusdw m10, m11 CLIPW m10, m7, m8 pshufb m10, m9 + movu [r2 + r3], m10 + + movu m10, [r0 + r1 + mmsize] + movu m11, [r0 + r1 + mmsize + 8] + movu m12, [r0 + r1 + mmsize + 16] + + pshufb m13, m10, m5 + pshufb m10, m4 + pshufb m14, m11, m5 + pshufb 
m11, m4 + pshufb m15, m12, m5 + pshufb m12, m4 + + pmaddwd m10, m0 + pmaddwd m13, m1 + paddd m10, m13 + pmaddwd m13, m14, m3 + pmaddwd m16, m11, m2 + paddd m13, m16 + paddd m10, m13 + paddd m10, m6 + psrad m10, INTERP_SHIFT_PP + + pmaddwd m11, m0 + pmaddwd m14, m1 + paddd m11, m14 + pmaddwd m14, m15, m3 + pmaddwd m16, m12, m2 + paddd m14, m16 + paddd m11, m14 + paddd m11, m6 + psrad m11, INTERP_SHIFT_PP + + packusdw m10, m11 + CLIPW m10, m7, m8 + pshufb m10, m9 movu [r2 + r3 + mmsize], m10 %endmacro @@ -10320,6 +10383,45 @@ IPFILTER_LUMA_AVX512_16xN 64 %endif +%if ARCH_X86_64 +INIT_ZMM avx512 +cglobal interp_8tap_horiz_pp_24x32, 5, 8, 17 + add r1d, r1d + add r3d, r3d + sub r0, 6 + mov r4d, r4m + shl r4d, 4 + +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4] + vpbroadcastd m1, [r5 + r4 + 4] + vpbroadcastd m2, [r5 + r4 + 8] + vpbroadcastd m3, [r5 + r4 + 12] +%else + vpbroadcastd m0, [tab_LumaCoeff + r4] + vpbroadcastd m1, [tab_LumaCoeff + r4 + 4] + vpbroadcastd m2, [tab_LumaCoeff + r4 + 8] + vpbroadcastd m3, [tab_LumaCoeff + r4 + 12] +%endif + vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512] + vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512] + vbroadcasti32x8 m6, [pd_32] + pxor m7, m7 + vbroadcasti32x8 m8, [pw_pixel_max] + vbroadcasti32x8 m9, [interp8_hpp_shuf1_store_avx512] + lea r6, [3 * r1] + lea r7, [3 * r3] + +%rep 7 + PROCESS_IPFILTER_LUMA_PP_24x4_AVX512 + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] +%endrep + PROCESS_IPFILTER_LUMA_PP_24x4_AVX512 + RET +%endif + %macro IPFILTER_LUMA_AVX512_32xN 1 INIT_ZMM avx512 cglobal interp_8tap_horiz_pp_32x%1, 5,6,17 _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel