# HG changeset patch # User Vignesh Vijayakumar # Date 1501222403 -19800 # Fri Jul 28 11:43:23 2017 +0530 # Node ID 563b3c4f91eb20374311ed18fb18ad12aeebaf26 # Parent 7d7f2a4e771c7c2b573db9bc298d1a35bb72f32d x86: AVX512 interp_4tap_horiz_pp_64xN
Size | AVX2 performance | AVX512 performance ---------------------------------------------- 64x16 | 21.45x | 39.29x 64x32 | 22.27x | 39.37x 64x48 | 22.76x | 40.75x 64x64 | 22.76x | 40.90x diff -r 7d7f2a4e771c -r 563b3c4f91eb source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 08 15:25:11 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jul 28 11:43:23 2017 +0530 @@ -3996,6 +3996,11 @@ p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512); + } #endif } diff -r 7d7f2a4e771c -r 563b3c4f91eb source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Tue Aug 08 15:25:11 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Fri Jul 28 11:43:23 2017 +0530 @@ -137,6 +137,10 @@ const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 +const interp4_horiz_shuf_load1_avx512, times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + +const interp4_horiz_shuf_load2_avx512, times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + SECTION .text cextern pb_128 @@ -9820,3 +9824,75 @@ FILTER_VER_LUMA_S_AVX2_32x24 sp FILTER_VER_LUMA_S_AVX2_32x24 ss +;------------------------------------------------------------------------------------------------------------- +;ipfilter_chroma_pp_avx512 code start +;------------------------------------------------------------------------------------------------------------- +%macro PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 0 + ; register map + ; m0 - interpolate coeff (the dword at tab_ChromaCoeff + coeffIdx * 4, broadcast to every dword) + ; m1, m2 - shuffle order table (per 16-byte lane: m1 gathers the 4-byte windows starting at bytes 0..3, m2 those starting at bytes 8..11) + ; m3 - constant word 1 (pmaddwd by 1 adds adjacent word pairs) + ; m4 - constant word 512 (pmulhrsw by 512 is a rounded shift right by 6) + 
+ movu m5, [r0] + pshufb m6, m5, m2 + pshufb m5, m5, m1 + pmaddubsw m5, m0 + pmaddubsw m6, m0 + pmaddwd m5, m3 + pmaddwd m6, m3 + + movu m7, [r0 + 4] + pshufb m8, m7, m2 + pshufb m7, m7, m1 + pmaddubsw m7, m0 + pmaddubsw m8, m0 + pmaddwd m7, m3 + pmaddwd m8, m3 + + packssdw m5, m7 + packssdw m6, m8 + pmulhrsw m5, m4 + pmulhrsw m6, m4 + packuswb m5, m6 + movu [r2], m5 +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PP_64xN_AVX512 1 +INIT_ZMM avx512 +cglobal interp_4tap_horiz_pp_64x%1, 4,6,9 + mov r4d, r4m + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512] + vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512] + vbroadcasti32x8 m3, [pw_1] + vbroadcasti32x8 m4, [pw_512] + dec r0 + +%rep %1 - 1 + PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 + lea r2, [r2 + r3] + lea r0, [r0 + r1] +%endrep + PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 + RET +%endmacro + + IPFILTER_CHROMA_PP_64xN_AVX512 64 + IPFILTER_CHROMA_PP_64xN_AVX512 32 + IPFILTER_CHROMA_PP_64xN_AVX512 48 + IPFILTER_CHROMA_PP_64xN_AVX512 16 + +;------------------------------------------------------------------------------------------------------------- +;ipfilter_chroma_pp_avx512 code end +;------------------------------------------------------------------------------------------------------------- _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel