# HG changeset patch
# User Jayashri Murugan <jayas...@multicorewareinc.com>
# Date 1502434970 -19800
# Fri Aug 11 12:32:50 2017 +0530
# Node ID 6f811dfd5690866f4c432911982a30665dc0e91c
# Parent 951e9a16296e5d1e528c0083630fde8122bd15c1
x86: AVX512 interp_4tap_horiz_ps_32xN
Color Space i444
Size  | AVX2 performance | AVX512 performance
------------------------------------------------
32x8  | 25.91x           | 38.35x
32x16 | 25.45x           | 32.02x
32x24 | 25.80x           | 32.73x
32x32 | 33.49x           | 38.02x
32x64 | 27.42x           | 36.20x

Color Space i422
Size  | AVX2 performance | AVX512 performance
------------------------------------------------
32x16 | 24.74x           | 33.95x
32x32 | 33.31x           | 34.28x
32x48 | 27.11x           | 35.98x
32x64 | 27.32x           | 35.02x

Color Space i420
Size  | AVX2 performance | AVX512 performance
------------------------------------------------
32x8  | 27.16x           | 36.68x
32x16 | 24.87x           | 31.40x
32x24 | 25.98x           | 34.08x
32x32 | 33.01x           | 34.71x

diff -r 951e9a16296e -r 6f811dfd5690 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Aug 10 22:47:55 2017 -0700
+++ b/source/common/x86/asm-primitives.cpp Fri Aug 11 12:32:50 2017 +0530
@@ -4034,6 +4034,25 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_avx512);
+
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512);
+
+        //i422 chroma_hps
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hps = PFX(interp_4tap_horiz_ps_32x48_avx512);
+
+        //i420 chroma_hps
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512);
+
     }
 #endif
 }
diff -r 951e9a16296e -r 6f811dfd5690 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Thu Aug 10 22:47:55 2017 -0700
+++ b/source/common/x86/ipfilter8.asm Fri Aug 11 12:32:50 2017 +0530
@@ -10010,7 +10010,7 @@
 %endmacro
 
 ;-------------------------------------------------------------------------------------------------------------
-; void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+; void interp_horiz_ps_64xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-------------------------------------------------------------------------------------------------------------
 %macro IPFILTER_CHROMA_PS_64xN_AVX512 1
 INIT_ZMM avx512
@@ -10059,6 +10059,74 @@
     IPFILTER_CHROMA_PS_64xN_AVX512 48
     IPFILTER_CHROMA_PS_64xN_AVX512 16
 
+%macro PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512 0
+    movu              ym6, [r0]
+    vinserti32x8      m6, [r0 + 4], 1
+    pshufb            m7, m6, m2
+    pshufb            m6, m6, m1
+    pmaddubsw         m6, m0
+    pmaddubsw         m7, m0
+    pmaddwd           m6, m3
+    pmaddwd           m7, m3
+
+    packssdw          m6, m7
+    psubw             m6, m4
+    vpermq            m6, m8, m6
+    movu              [r2], m6
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_horiz_ps_32xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-------------------------------------------------------------------------------------------------------------
+%macro IPFILTER_CHROMA_PS_32xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_4tap_horiz_ps_32x%1, 4,7,9
+    mov               r4d, r4m
+    mov               r5d, r5m
+
+%ifdef PIC
+    lea               r6, [tab_ChromaCoeff]
+    vpbroadcastd      m0, [r6 + r4 * 4]
+%else
+    vpbroadcastd      m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+    vbroadcasti32x8   m1, [interp4_horiz_shuf_load1_avx512]
+    vbroadcasti32x8   m2, [interp4_horiz_shuf_load2_avx512]
+    vbroadcasti32x8   m3, [pw_1]
+    vbroadcasti32x8   m4, [pw_2000]
+    mova              m8, [interp8_hps_shuf_avx512]
+
+    ; register map
+    ; m0 - interpolate coeff
+    ; m1,m2 - load shuffle order table
+    ; m3 - constant word 1
+    ; m4 - constant word 2000
+    ; m8 - store shuffle order table
+
+    mov               r6d, %1
+    dec               r0
+    test              r5d, r5d
+    je                .loop
+    sub               r0, r1
+    add               r6d, 3
+
+.loop:
+    PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512
+    lea               r2, [r2 + 2 * r3]
+    lea               r0, [r0 + r1]
+    dec               r6d
+    jnz               .loop
+    RET
+%endmacro
+
+    IPFILTER_CHROMA_PS_32xN_AVX512 64
+    IPFILTER_CHROMA_PS_32xN_AVX512 48
+    IPFILTER_CHROMA_PS_32xN_AVX512 32
+    IPFILTER_CHROMA_PS_32xN_AVX512 24
+    IPFILTER_CHROMA_PS_32xN_AVX512 16
+    IPFILTER_CHROMA_PS_32xN_AVX512 8
+
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_chroma_avx512 code end
 ;-------------------------------------------------------------------------------------------------------------
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel