# HG changeset patch
# User Vignesh Vijayakumar
# Date 1509862764 -19800
#      Sun Nov 05 11:49:24 2017 +0530
# Node ID 2d94e5d214922d0f6cb0126e4477db8dd33256e7
# Parent 410a223c2caa58321a3a6b3e0a91c1dee512667a
x86: AVX512 optimise interp_4tap_vert_pp_8xN high bit depth code

diff -r 410a223c2caa -r 2d94e5d21492 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Sat Nov 04 18:05:34 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Sun Nov 05 11:49:24 2017 +0530 @@ -5930,15 +5930,10 @@ punpckhwd m3, m4 pmaddwd m3, [r5] - lea r0, [r0 + 2 * r1] - lea r6, [r6 + 2 * r1] - lea r8, [r8 + 2 * r1] - lea r9, [r9 + 2 * r1] - - movu xm5, [r0 + r1] - vinserti32x4 m5, [r6 + r1], 1 - vinserti32x4 m5, [r8 + r1], 2 - vinserti32x4 m5, [r9 + r1], 3 + movu xm5, [r0 + r10] + vinserti32x4 m5, [r6 + r10], 1 + vinserti32x4 m5, [r8 + r10], 2 + vinserti32x4 m5, [r9 + r10], 3 punpcklwd m6, m4, m5 pmaddwd m6, [r5 + mmsize] paddd m0, m6 @@ -5946,10 +5941,10 @@ pmaddwd m4, [r5 + mmsize] paddd m1, m4 - movu xm4, [r0 + 2 * r1] - vinserti32x4 m4, [r6 + 2 * r1], 1 - vinserti32x4 m4, [r8 + 2 * r1], 2 - vinserti32x4 m4, [r9 + 2 * r1], 3 + movu xm4, [r0 + 4 * r1] + vinserti32x4 m4, [r6 + 4 * r1], 1 + vinserti32x4 m4, [r8 + 4 * r1], 2 + vinserti32x4 m4, [r9 + 4 * r1], 3 punpcklwd m6, m5, m4 pmaddwd m6, [r5 + mmsize] paddd m2, m6 @@ -5987,7 +5982,7 @@ ;----------------------------------------------------------------------------------------------------------------- %if ARCH_X86_64 INIT_ZMM avx512 -cglobal interp_4tap_vert_pp_8x8, 5, 10, 9 +cglobal interp_4tap_vert_pp_8x8, 5, 11, 9 add r1d, r1d add r3d, r3d sub r0, r1 @@ -6001,6 +5996,7 @@ %endif vbroadcasti32x8 m7, [INTERP_OFFSET_PP] vbroadcasti32x8 m8, [pw_pixel_max] + lea r10, [3 * r1] lea r7, [3 * r3] PROCESS_CHROMA_VERT_PP_8x8_AVX512 RET @@ -6008,7 +6004,7 @@ %macro FILTER_VER_PP_CHROMA_8xN_AVX512 1 INIT_ZMM avx512 -cglobal interp_4tap_vert_pp_8x%1, 5, 10, 9 +cglobal interp_4tap_vert_pp_8x%1, 5, 11, 9 add r1d, r1d add r3d, r3d sub r0, r1 @@ -6022,10 +6018,11 @@ %endif vbroadcasti32x8 m7, [INTERP_OFFSET_PP] vbroadcasti32x8 m8, [pw_pixel_max] + lea r10, [3 * r1] lea r7, [3 * r3] %rep %1/8 - 1 PROCESS_CHROMA_VERT_PP_8x8_AVX512 - lea r0, [r9] + lea r0, [r8 + 4 * r1] lea r2, [r2 + 4 * r3] %endrep 
PROCESS_CHROMA_VERT_PP_8x8_AVX512
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel