# HG changeset patch # User Jayashri Murugan <jayas...@multicorewareinc.com> # Date 1504071908 -19800 # Wed Aug 30 11:15:08 2017 +0530 # Node ID 578a08347d3e92db9300f5e28baacd72a71a6423 # Parent 7527c103cbe87811ec9e380a00d8a8605b761377 x86: AVX512 interp_4tap_horiz_ps_16xN
Color Space i420
Size    | AVX2 performance | AVX512 performance
----------------------------------------------
16x4    | 26.49x | 27.94x
16x8    | 27.75x | 32.97x
16x12   | 26.18x | 31.39x
16x16   | 22.33x | 31.64x
16x32   | 25.54x | 28.82x

Color Space i422
Size    | AVX2 performance | AVX512 performance
----------------------------------------------
16x8    | 27.99x | 29.72x
16x16   | 22.55x | 31.32x
16x24   | 23.28x | 33.12x
16x32   | 25.35x | 28.84x
16x64   | 27.70x | 30.87x

Color Space i444
Size    | AVX2 performance | AVX512 performance
----------------------------------------------
16x4    | 26.31x | 28.37x
16x8    | 28.38x | 32.95x
16x12   | 26.79x | 30.72x
16x16   | 23.92x | 31.28x
16x32   | 25.60x | 28.81x
16x64   | 27.42x | 30.81x

diff -r 7527c103cbe8 -r 578a08347d3e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Aug 23 12:00:03 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Aug 30 11:15:08 2017 +0530
@@ -4414,6 +4414,25 @@
         p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_avx512);
         p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_avx512);
         p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_avx512);
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hps = PFX(interp_4tap_horiz_ps_16x24_avx512);
+
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx512);
     }
 #endif
 }
diff -r 7527c103cbe8 -r 578a08347d3e source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Aug 23 12:00:03 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Wed Aug 30 11:15:08 2017 +0530
@@ -152,8 +152,7 @@
 
 ALIGN 64
 const interp4_hps_shuf_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7
-
-ALIGN 64
+const interp4_hps_store_16xN_avx512, dq 0, 2, 1, 3, 4, 6, 5, 7
 const interp8_hps_store_avx512, dq 0, 1, 4, 5, 2, 3, 6, 7
 
 SECTION .text
@@ -10575,6 +10574,102 @@
     IPFILTER_CHROMA_PS_32xN_AVX512 16
     IPFILTER_CHROMA_PS_32xN_AVX512 8
 
+%macro PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512 0
+    movu              xm6,          [r0]                 ; filter two rows per expansion; lane 0 = 16 bytes at r0
+    vinserti32x4      m6,           [r0 + 4],        1   ; lane 1 = 16 bytes at r0 + 4 (same row)
+    vinserti32x4      m6,           [r0 + r1],       2   ; lane 2 = 16 bytes at r0 + r1 (next row)
+    vinserti32x4      m6,           [r0 + r1 + 4],   3   ; lane 3 = 16 bytes at r0 + r1 + 4
+
+    pshufb            m7,           m6,              m2  ; byte arrangement from load shuffle table 2
+    pshufb            m6,           m6,              m1  ; byte arrangement from load shuffle table 1
+    pmaddubsw         m6,           m0                   ; unsigned byte * signed coeff, adjacent products summed into words
+    pmaddubsw         m7,           m0
+    pmaddwd           m6,           m3                   ; multiply by word 1: adjacent words summed, so each dword = 4 products
+    pmaddwd           m7,           m3
+
+    packssdw          m6,           m7                   ; pack dwords to signed words (per 128-bit lane: m6 half, then m7 half)
+    psubw             m6,           m4                   ; subtract the pw_2000 constant from every word
+    vpermq            m6,           m8,              m6  ; reorder qwords by the store shuffle table in m8
+    movu              [r2],         ym6                  ; low 256 bits -> 16 words at r2
+    vextracti32x8     [r2 + r3],    m6,              1   ; high 256 bits -> 16 words at r2 + r3
+%endmacro
+
+%macro PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512 0
+    movu              xm6,          [r0]                 ; filter a single row; lane 0 = 16 bytes at r0
+    vinserti32x4      m6,           [r0 + 4],        1   ; lane 1 = 16 bytes at r0 + 4
+
+    pshufb            ym7,          ym6,             ym2 ; same steps as the 16x2 macro, on 256-bit registers
+    pshufb            ym6,          ym6,             ym1
+    pmaddubsw         ym6,          ym0
+    pmaddubsw         ym7,          ym0
+    pmaddwd           ym6,          ym3
+    pmaddwd           ym7,          ym3
+
+    packssdw          ym6,          ym7
+    psubw             ym6,          ym4
+    vpermq            ym6,          ym8,             ym6 ; qword reorder using the low 256 bits of the store table
+    movu              [r2],         ym6                  ; 16 words -> r2
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_16xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-------------------------------------------------------------------------------------------------------------
+%macro IPFILTER_CHROMA_PS_16xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_4tap_horiz_ps_16x%1, 4,7,9
+    mov               r4d,          r4m                  ; r4 = coeffIdx
+    mov               r5d,          r5m                  ; r5 = isRowExt
+    add               r3,           r3                   ; double dstStride (dst is int16_t*; presumably elements -> bytes)
+
+%ifdef PIC
+    lea               r6,           [tab_ChromaCoeff]    ; table base in a register so it can be indexed by r4
+    vpbroadcastd      m0,           [r6 + r4 * 4]        ; broadcast the 4-byte entry selected by coeffIdx
+%else
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
+%endif
+
+    vbroadcasti32x8   m1,           [interp4_horiz_shuf_load1_avx512]
+    vbroadcasti32x8   m2,           [interp4_horiz_shuf_load2_avx512]
+    vbroadcasti32x8   m3,           [pw_1]
+    vbroadcasti32x8   m4,           [pw_2000]
+    mova              m8,           [interp4_hps_store_16xN_avx512]
+
+    ; register map
+    ; m0    - interpolate coeff
+    ; m1,m2 - load shuffle order table
+    ; m3    - constant word 1
+    ; m4    - constant word 2000
+    ; m8    - store shuffle order table
+
+    mov               r6d,          %1                   ; r6d = rows still to process, starts at the block height
+    dec               r0                                 ; start reading one byte before src
+    test              r5d,          r5d
+    je                .loop                              ; isRowExt == 0: process exactly N rows
+    sub               r0,           r1                   ; isRowExt != 0: begin one source row earlier
+    add               r6d,          3                    ; and produce three extra rows (N + 3 in total)
+    PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512
+    lea               r2,           [r2 + r3]            ; advance dst by one row
+    lea               r0,           [r0 + r1]            ; advance src by one row
+    dec               r6d                                ; single row done, N + 2 rows remain for the paired loop
+
+.loop:
+    PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512
+    lea               r2,           [r2 + 2 * r3]        ; advance dst by two rows
+    lea               r0,           [r0 + 2 * r1]        ; advance src by two rows
+    sub               r6d,          2
+    jnz               .loop                              ; count is even for every height instantiated below, so it hits 0
+
+    RET
+%endmacro
+
+    IPFILTER_CHROMA_PS_16xN_AVX512 64
+    IPFILTER_CHROMA_PS_16xN_AVX512 32
+    IPFILTER_CHROMA_PS_16xN_AVX512 24
+    IPFILTER_CHROMA_PS_16xN_AVX512 16
+    IPFILTER_CHROMA_PS_16xN_AVX512 12
+    IPFILTER_CHROMA_PS_16xN_AVX512 8
+    IPFILTER_CHROMA_PS_16xN_AVX512 4
+
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_chroma_avx512 code end
 ;-------------------------------------------------------------------------------------------------------------
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel