# HG changeset patch
# User Jayashree <jayashre...@multicorewareinc.com>
# Date 1524213427 -19800
#      Fri Apr 20 14:07:07 2018 +0530
# Node ID 8ea2e5d0296aad4fba48ac36ff6d99d7770c7990
# Parent  1485405aa16ff2d6f04acb8aeafdae6f32a3bfb5
x86:AVX2 optimize luma_hps 16xN

diff -r 1485405aa16f -r 8ea2e5d0296a source/common/x86/h-ipfilter16.asm --- a/source/common/x86/h-ipfilter16.asm Fri Feb 02 10:20:08 2018 +0530 +++ b/source/common/x86/h-ipfilter16.asm Fri Apr 20 14:07:07 2018 +0530 @@ -2379,26 +2379,66 @@ IPFILTER_LUMA_PS_32_64_AVX2 48, 64 +%macro PROCESS_IPFILTER_LUMA_PS_16x1_AVX2 0 + movu m7, [r0] + movu m8, [r0 + 8] + pshufb m10, m7, m14 + pshufb m7, m13 + pshufb m11, m8, m14 + pshufb m8, m13 + + pmaddwd m7, m0 + pmaddwd m10, m1 + paddd m7, m10 + pmaddwd m10, m11, m3 + pmaddwd m9, m8, m2 + paddd m10, m9 + paddd m7, m10 + paddd m7, m4 + psrad m7, INTERP_SHIFT_PS + movu m9, [r0 + 16] + pshufb m10, m9, m14 + pshufb m9, m13 + pmaddwd m8, m0 + pmaddwd m11, m1 + paddd m8, m11 + pmaddwd m10, m3 + pmaddwd m9, m2 + paddd m9, m10 + paddd m8, m9 + paddd m8, m4 + psrad m8, INTERP_SHIFT_PS + packssdw m7, m8 + pshufb m7, m12 + movu [r2], m7 +%endmacro + %macro IPFILTER_LUMA_PS_16xN_AVX2 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_16x%1, 4, 6, 8 +cglobal interp_8tap_horiz_ps_16x%1, 5, 6, 15 - add r1d, r1d - add r3d, r3d + shl r1d, 1 + shl r3d, 1 mov r4d, r4m mov r5d, r5m shl r4d, 4 %ifdef PIC lea r6, [h_tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4] - vpbroadcastq m1, [r6 + r4 + 8] + vpbroadcastd m0, [r6 + r4] + vpbroadcastd m1, [r6 + r4 + 4] + vpbroadcastd m2, [r6 + r4 + 8] + vpbroadcastd m3, [r6 + r4 + 12] %else - vpbroadcastq m0, [h_tab_LumaCoeff + r4] - vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8] + vpbroadcastd m0, [h_tab_LumaCoeff + r4] + vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4] + vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8] + vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12] %endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] + mova m13, [interp8_hpp_shuf1_load_avx512] + mova m14, [interp8_hpp_shuf2_load_avx512] + mova m12, [interp8_hpp_shuf1_store_avx512] + vbroadcasti128 m4, [INTERP_OFFSET_PS] ; register map ; m0 , m1 interpolate coeff @@ -2412,55 +2452,12 @@ add r4d, 7 .loop0: - 
vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m7, m5, m3 - pmaddwd m4, m0 - pmaddwd m7, m1 - paddd m4, m7 - vbroadcasti128 m6, [r0 + 16] - pshufb m5, m3 - pshufb m7, m6, m3 - pmaddwd m5, m0 - pmaddwd m7, m1 - paddd m5, m7 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m2 - vextracti128 xm5, m4, 1 - psrad xm4, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m5, [r0 + 24] - pshufb m6, m3 - pshufb m7, m5, m3 - pmaddwd m6, m0 - pmaddwd m7, m1 - paddd m6, m7 - - vbroadcasti128 m7, [r0 + 32] - pshufb m5, m3 - pshufb m7, m3 - pmaddwd m5, m0 - pmaddwd m7, m1 - paddd m5, m7 - - phaddd m6, m5 - vpermq m6, m6, q3120 - paddd m6, m2 - vextracti128 xm5,m6, 1 - psrad xm6, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm6, xm5 - movu [r2 + 16], xm6 - - add r2, r3 - add r0, r1 + PROCESS_IPFILTER_LUMA_PS_16x1_AVX2 + lea r0, [r0 + r1] + lea r2, [r2 + r3] + ;add r2, r3 + ;add r0, r1 dec r4d jnz .loop0 RET
# HG changeset patch # User Jayashree <jayashre...@multicorewareinc.com> # Date 1524213427 -19800 # Fri Apr 20 14:07:07 2018 +0530 # Node ID 8ea2e5d0296aad4fba48ac36ff6d99d7770c7990 # Parent 1485405aa16ff2d6f04acb8aeafdae6f32a3bfb5 x86:AVX2 optimize luma_hps 16xN diff -r 1485405aa16f -r 8ea2e5d0296a source/common/x86/h-ipfilter16.asm --- a/source/common/x86/h-ipfilter16.asm Fri Feb 02 10:20:08 2018 +0530 +++ b/source/common/x86/h-ipfilter16.asm Fri Apr 20 14:07:07 2018 +0530 @@ -2379,26 +2379,66 @@ IPFILTER_LUMA_PS_32_64_AVX2 48, 64 +%macro PROCESS_IPFILTER_LUMA_PS_16x1_AVX2 0 + movu m7, [r0] + movu m8, [r0 + 8] + pshufb m10, m7, m14 + pshufb m7, m13 + pshufb m11, m8, m14 + pshufb m8, m13 + + pmaddwd m7, m0 + pmaddwd m10, m1 + paddd m7, m10 + pmaddwd m10, m11, m3 + pmaddwd m9, m8, m2 + paddd m10, m9 + paddd m7, m10 + paddd m7, m4 + psrad m7, INTERP_SHIFT_PS + movu m9, [r0 + 16] + pshufb m10, m9, m14 + pshufb m9, m13 + pmaddwd m8, m0 + pmaddwd m11, m1 + paddd m8, m11 + pmaddwd m10, m3 + pmaddwd m9, m2 + paddd m9, m10 + paddd m8, m9 + paddd m8, m4 + psrad m8, INTERP_SHIFT_PS + packssdw m7, m8 + pshufb m7, m12 + movu [r2], m7 +%endmacro + %macro IPFILTER_LUMA_PS_16xN_AVX2 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_16x%1, 4, 6, 8 +cglobal interp_8tap_horiz_ps_16x%1, 5, 6, 15 - add r1d, r1d - add r3d, r3d + shl r1d, 1 + shl r3d, 1 mov r4d, r4m mov r5d, r5m shl r4d, 4 %ifdef PIC lea r6, [h_tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4] - vpbroadcastq m1, [r6 + r4 + 8] + vpbroadcastd m0, [r6 + r4] + vpbroadcastd m1, [r6 + r4 + 4] + vpbroadcastd m2, [r6 + r4 + 8] + vpbroadcastd m3, [r6 + r4 + 12] %else - vpbroadcastq m0, [h_tab_LumaCoeff + r4] - vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8] + vpbroadcastd m0, [h_tab_LumaCoeff + r4] + vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4] + vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8] + vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12] %endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] + mova 
m13, [interp8_hpp_shuf1_load_avx512] + mova m14, [interp8_hpp_shuf2_load_avx512] + mova m12, [interp8_hpp_shuf1_store_avx512] + vbroadcasti128 m4, [INTERP_OFFSET_PS] ; register map ; m0 , m1 interpolate coeff @@ -2412,55 +2452,12 @@ add r4d, 7 .loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m7, m5, m3 - pmaddwd m4, m0 - pmaddwd m7, m1 - paddd m4, m7 - vbroadcasti128 m6, [r0 + 16] - pshufb m5, m3 - pshufb m7, m6, m3 - pmaddwd m5, m0 - pmaddwd m7, m1 - paddd m5, m7 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m2 - vextracti128 xm5, m4, 1 - psrad xm4, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m5, [r0 + 24] - pshufb m6, m3 - pshufb m7, m5, m3 - pmaddwd m6, m0 - pmaddwd m7, m1 - paddd m6, m7 - - vbroadcasti128 m7, [r0 + 32] - pshufb m5, m3 - pshufb m7, m3 - pmaddwd m5, m0 - pmaddwd m7, m1 - paddd m5, m7 - - phaddd m6, m5 - vpermq m6, m6, q3120 - paddd m6, m2 - vextracti128 xm5,m6, 1 - psrad xm6, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm6, xm5 - movu [r2 + 16], xm6 - - add r2, r3 - add r0, r1 + PROCESS_IPFILTER_LUMA_PS_16x1_AVX2 + lea r0, [r0 + r1] + lea r2, [r2 + r3] + ;add r2, r3 + ;add r0, r1 dec r4d jnz .loop0 RET
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel