# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500988851 -19800
#      Tue Jul 25 18:50:51 2017 +0530
# Node ID b4c2149e9bb1119857363094492b50e85593fb74
# Parent  d05b920865e7c9e8cc9441e77df888b48acb50d1
x86: AVX512 convert_p2s_32xN for high bit depth
Size    | AVX2 performance | AVX512 performance
----------------------------------------------
32x8    |      7.85x       |      7.95x
32x16   |      9.54x       |     15.32x
32x24   |     10.02x       |     17.01x
32x32   |     10.97x       |     18.22x
32x64   |      9.82x       |     19.59x

diff -r d05b920865e7 -r b4c2149e9bb1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 25 18:28:43 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 25 18:50:51 2017 +0530
@@ -2234,6 +2234,20 @@
         p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512);
         p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512);
         p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512);
+        p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
+        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx512);
+        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512);
+        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
+        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512);
+
     }
 }
 #else // if HIGH_BIT_DEPTH
diff -r d05b920865e7 -r b4c2149e9bb1 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Tue Jul 25 18:28:43 2017 +0530
+++ b/source/common/x86/ipfilter16.asm Tue Jul 25 18:50:51 2017 +0530
@@ -377,6 +377,45 @@
movu        [r2 + r4 + mmsize], m3
 %endmacro
 
+%macro P2S_32x8_AVX512 0                ; 8 rows: dst row = (src row << (14 - BIT_DEPTH)) - m4, one register-wide load/store per row
+    movu        m0, [r0]                ; src rows 0-3; r5 must hold 3 * r1 (the callers set it up)
+    movu        m1, [r0 + r1]
+    movu        m2, [r0 + r1 * 2]
+    movu        m3, [r0 + r5]
+    psllw       m0, (14 - BIT_DEPTH)    ; shift every 16-bit lane left by (14 - BIT_DEPTH)
+    psllw       m1, (14 - BIT_DEPTH)
+    psllw       m2, (14 - BIT_DEPTH)
+    psllw       m3, (14 - BIT_DEPTH)
+    psubw       m0, m4                  ; subtract the per-word constant the callers broadcast into m4 from [pw_2000]
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    movu        [r2], m0                ; dst rows 0-3; r4 must hold 3 * r3 (the callers set it up)
+    movu        [r2 + r3], m1
+    movu        [r2 + r3 * 2], m2
+    movu        [r2 + r4], m3
+
+    lea         r0, [r0 + r1 * 4]       ; src -> row 4
+    lea         r2, [r2 + r3 * 4]       ; dst -> row 4
+
+    movu        m0, [r0]                ; rows 4-7: same load / shift / subtract / store sequence
+    movu        m1, [r0 + r1]
+    movu        m2, [r0 + r1 * 2]
+    movu        m3, [r0 + r5]
+    psllw       m0, (14 - BIT_DEPTH)
+    psllw       m1, (14 - BIT_DEPTH)
+    psllw       m2, (14 - BIT_DEPTH)
+    psllw       m3, (14 - BIT_DEPTH)
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    movu        [r2], m0
+    movu        [r2 + r3], m1
+    movu        [r2 + r3 * 2], m2
+    movu        [r2 + r4], m3           ; r0/r2 still point at row 4 on exit; callers step 4 more rows before the next block
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
 ;-----------------------------------------------------------------------------
@@ -475,6 +514,132 @@
     lea         r2, [r2 + r3 * 4]
     P2S_64x8_AVX512
     RET
+
+INIT_ZMM avx512                         ; x86inc: m0-m4 below are ZMM registers
+cglobal filterPixelToShort_32x8, 4, 6, 5 ; 8 rows; x86inc counts: 4 args (r0-r3), 6 GPRs, 5 vector registers
+    add         r1d, r1d                ; srcStride *= 2 (presumably elements -> bytes for 16-bit pixels; confirm). NOTE(review): the 32-bit add clears the upper half of r1 on x86-64, so a negative stride would not survive
+    add         r3d, r3d                ; dstStride *= 2 (dst is int16_t per the prototype comment); same 32-bit caveat
+    lea         r4, [r3 * 3]            ; r4 = 3 * doubled dstStride, addresses the 4th row of each store group
+    lea         r5, [r1 * 3]            ; r5 = 3 * doubled srcStride, addresses the 4th row of each load group
+
+    ; load constant
+    vbroadcasti32x8 m4, [pw_2000]       ; replicate the 256 bits at pw_2000 into both halves of m4
+    P2S_32x8_AVX512                     ; rows 0-7
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x16, 4, 6, 5 ; 16 rows = 2 x P2S_32x8_AVX512; setup identical to the 32x8 variant
+    add         r1d, r1d                ; srcStride *= 2
+    add         r3d, r3d                ; dstStride *= 2
+    lea         r4, [r3 * 3]            ; r4 = 3 * dstStride (doubled)
+    lea         r5, [r1 * 3]            ; r5 = 3 * srcStride (doubled)
+
+    ; load constant
+    vbroadcasti32x8 m4, [pw_2000]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]       ; the macro leaves r0/r2 at row 4 of its block: step over the last 4 rows
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x24, 4, 6, 5 ; 24 rows = 3 x P2S_32x8_AVX512
+    add         r1d, r1d                ; srcStride *= 2
+    add         r3d, r3d                ; dstStride *= 2
+    lea         r4, [r3 * 3]            ; r4 = 3 * dstStride (doubled)
+    lea         r5, [r1 * 3]            ; r5 = 3 * srcStride (doubled)
+
+    ; load constant
+    vbroadcasti32x8 m4, [pw_2000]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]       ; advance to the next 8-row block
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x32, 4, 6, 5 ; 32 rows = 4 x P2S_32x8_AVX512
+    add         r1d, r1d                ; srcStride *= 2
+    add         r3d, r3d                ; dstStride *= 2
+    lea         r4, [r3 * 3]            ; r4 = 3 * dstStride (doubled)
+    lea         r5, [r1 * 3]            ; r5 = 3 * srcStride (doubled)
+
+    ; load constant
+    vbroadcasti32x8 m4, [pw_2000]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]       ; advance to the next 8-row block
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x48, 4, 6, 5 ; 48 rows = 6 x P2S_32x8_AVX512
+    add         r1d, r1d                ; srcStride *= 2
+    add         r3d, r3d                ; dstStride *= 2
+    lea         r4, [r3 * 3]            ; r4 = 3 * dstStride (doubled)
+    lea         r5, [r1 * 3]            ; r5 = 3 * srcStride (doubled)
+
+    ; load constant
+    vbroadcasti32x8 m4, [pw_2000]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]       ; advance to the next 8-row block
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x64, 4, 6, 5 ; 64 rows = 8 x P2S_32x8_AVX512
+    add         r1d, r1d                ; srcStride *= 2
+    add         r3d, r3d                ; dstStride *= 2
+    lea         r4, [r3 * 3]            ; r4 = 3 * dstStride (doubled)
+    lea         r5, [r1 * 3]            ; r5 = 3 * srcStride (doubled)
+
+    ; load constant
+    vbroadcasti32x8 m4, [pw_2000]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]       ; advance to the next 8-row block
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    P2S_32x8_AVX512
+    RET
 ;-----------------------------------------------------------------------------------------------------------------------------
 ;p2s avx512 code end
 ;-----------------------------------------------------------------------------------------------------------------------------
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel