# HG changeset patch
# User Jayashri Murugan <jayas...@multicorewareinc.com>
# Date 1512455478 -19800
# Tue Dec 05 12:01:18 2017 +0530
# Node ID c335a7ca4304001e245dea7977cde1c2e0c0a8ee
# Parent 81a870948ac446b36c248325e0c7264cf8f3f09e
x86: AVX512 interp_4tap_vert_ps_32xN

i420
Size  | AVX2 performance | AVX512 performance
----------------------------------------------
32x8  | 36.28x | 47.86x
32x16 | 40.43x | 51.57x
32x24 | 40.96x | 54.05x
32x32 | 40.12x | 54.27x

i422
Size  | AVX2 performance | AVX512 performance
----------------------------------------------
32x16 | 39.84x | 51.35x
32x32 | 39.86x | 54.17x
32x48 | 41.14x | 54.85x
32x64 | 42.00x | 56.50x

i444
Size  | AVX2 performance | AVX512 performance
----------------------------------------------
32x8  | 36.08x | 47.61x
32x16 | 39.96x | 51.41x
32x24 | 40.38x | 54.51x
32x32 | 40.07x | 54.56x
32x64 | 41.94x | 56.59x

diff -r 81a870948ac4 -r c335a7ca4304 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Dec 07 15:31:54 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 05 12:01:18 2017 +0530 @@ -5158,6 +5158,23 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_avx512); + + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512); + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512); + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512); + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512); + + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512); + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512); + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512); + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = PFX(interp_4tap_vert_ps_32x48_avx512); + + p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = 
PFX(interp_4tap_vert_ps_32x32_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512); + } #endif } diff -r 81a870948ac4 -r c335a7ca4304 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Thu Dec 07 15:31:54 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Tue Dec 05 12:01:18 2017 +0530 @@ -10951,7 +10951,7 @@ FILTER_VER_PP_CHROMA_16xN_AVX512 64 %endif -%macro PROCESS_CHROMA_VERT_PP_32x4_AVX512 0 +%macro PROCESS_CHROMA_VERT_32x4_AVX512 1 movu ym1, [r0] movu ym3, [r0 + r1] vinserti32x8 m1, [r0 + 2 * r1], 1 @@ -10988,25 +10988,45 @@ pmaddubsw m5, m9 paddw m3, m5 +%ifidn %1,pp pmulhrsw m0, m7 pmulhrsw m1, m7 pmulhrsw m2, m7 pmulhrsw m3, m7 - packuswb m0, m1 packuswb m2, m3 movu [r2], ym0 movu [r2 + r3], ym2 vextracti32x8 [r2 + 2 * r3], m0, 1 vextracti32x8 [r2 + r7], m2, 1 +%else + psubw m0, m7 + psubw m1, m7 + psubw m2, m7 + psubw m3, m7 + + mova m4, m10 + mova m5, m11 + vpermi2q m4, m0, m1 + vpermi2q m5, m0, m1 + mova m6, m10 + mova m12, m11 + vpermi2q m6, m2, m3 + vpermi2q m12, m2, m3 + + movu [r2], m4 + movu [r2 + r3], m6 + movu [r2 + 2 * r3], m5 + movu [r2 + r7], m12 +%endif %endmacro ;----------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_PP_CHROMA_32xN_AVX512 1 +%macro FILTER_VERT_CHROMA_32xN_AVX512 2 INIT_ZMM avx512 -cglobal interp_4tap_vert_pp_32x%1, 4, 10, 8 +cglobal interp_4tap_vert_%1_32x%2, 4, 8, 13 mov r4d, r4m shl r4d, 7 sub 
r0, r1 @@ -11019,26 +11039,42 @@ mova m8, [tab_ChromaCoeffVer_32_avx512 + r4] mova m9, [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize] %endif - vbroadcasti32x8 m7, [pw_512] + +%ifidn %1,pp + vbroadcasti32x8 m7, [pw_512] +%else + add r3d, r3d + vbroadcasti32x8 m7, [pw_2000] + mova m10, [interp4_vps_store1_avx512] + mova m11, [interp4_vps_store2_avx512] +%endif + lea r6, [3 * r1] lea r7, [3 * r3] -%rep %1/4 - 1 - PROCESS_CHROMA_VERT_PP_32x4_AVX512 +%rep %2/4 - 1 + PROCESS_CHROMA_VERT_32x4_AVX512 %1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 4 * r3] %endrep - PROCESS_CHROMA_VERT_PP_32x4_AVX512 + PROCESS_CHROMA_VERT_32x4_AVX512 %1 RET %endmacro %if ARCH_X86_64 - FILTER_VER_PP_CHROMA_32xN_AVX512 8 - FILTER_VER_PP_CHROMA_32xN_AVX512 16 - FILTER_VER_PP_CHROMA_32xN_AVX512 24 - FILTER_VER_PP_CHROMA_32xN_AVX512 32 - FILTER_VER_PP_CHROMA_32xN_AVX512 48 - FILTER_VER_PP_CHROMA_32xN_AVX512 64 + FILTER_VERT_CHROMA_32xN_AVX512 pp, 8 + FILTER_VERT_CHROMA_32xN_AVX512 pp, 16 + FILTER_VERT_CHROMA_32xN_AVX512 pp, 24 + FILTER_VERT_CHROMA_32xN_AVX512 pp, 32 + FILTER_VERT_CHROMA_32xN_AVX512 pp, 48 + FILTER_VERT_CHROMA_32xN_AVX512 pp, 64 + + FILTER_VERT_CHROMA_32xN_AVX512 ps, 8 + FILTER_VERT_CHROMA_32xN_AVX512 ps, 16 + FILTER_VERT_CHROMA_32xN_AVX512 ps, 24 + FILTER_VERT_CHROMA_32xN_AVX512 ps, 32 + FILTER_VERT_CHROMA_32xN_AVX512 ps, 48 + FILTER_VERT_CHROMA_32xN_AVX512 ps, 64 %endif %macro PROCESS_CHROMA_VERT_PP_48x4_AVX512 0 _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel