# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500445753 -19800
#      Wed Jul 19 11:59:13 2017 +0530
# Node ID 60c8ad7f3cadcfe7bb5242a89908546ce38bb5d1
# Parent  a77082ebfa67b40f3dbb8cd45b54c17e710a104c
x86: AVX512 convert_p2s_32xN

Size  | AVX2 performance | AVX512 performance
------------------------------------------------
32x8  |      1.51x       |      1.54x
32x16 |      2.18x       |      3.62x
32x24 |      2.26x       |      3.58x
32x32 |      2.28x       |      3.94x
32x64 |      2.20x       |      4.06x

diff -r a77082ebfa67 -r 60c8ad7f3cad source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jul 12 16:48:22 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Jul 19 11:59:13 2017 +0530
@@ -3836,6 +3836,19 @@
         p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512);
         p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512);
         p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512);
+        p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
+        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx512);
+        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512);
+        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
+        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512);
     }
 #endif

diff -r a77082ebfa67 -r 60c8ad7f3cad source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Jul 12 16:48:22 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Wed Jul 19 11:59:13 2017 +0530
@@ -1956,6 +1956,184 @@
;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
+%macro PROCESS_P2S_32x8_AVX512 0          ; 8 rows per expansion; expects r5 = r1 * 3, r6 = r3 * 3 and the constant in m4
+    pmovzxbw        m0, [r0]              ; rows 0-3: zero-extend source bytes to words
+    pmovzxbw        m1, [r0 + r1]
+    pmovzxbw        m2, [r0 + r1 * 2]
+    pmovzxbw        m3, [r0 + r5]
+
+    psllw           m0, 6                 ; shift every word left by 6
+    psllw           m1, 6
+    psllw           m2, 6
+    psllw           m3, 6
+    psubw           m0, m4                ; subtract the constant held in m4
+    psubw           m1, m4
+    psubw           m2, m4
+    psubw           m3, m4
+
+    movu            [r2], m0              ; store rows 0-3 (r3 is the doubled dstStride, r6 = 3 * r3)
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3
+
+    lea             r0, [r0 + r1 * 4]     ; step src and dst down 4 rows for the second half
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxbw        m0, [r0]              ; rows 4-7: same load / shift / subtract / store sequence
+    pmovzxbw        m1, [r0 + r1]
+    pmovzxbw        m2, [r0 + r1 * 2]
+    pmovzxbw        m3, [r0 + r5]
+
+    psllw           m0, 6
+    psllw           m1, 6
+    psllw           m2, 6
+    psllw           m3, 6
+    psubw           m0, m4
+    psubw           m1, m4
+    psubw           m2, m4
+    psubw           m3, m4
+
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3
+%endmacro                                 ; r0/r2 are left at row 4 of the group; callers add 4 more rows before the next expansion
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x8, 3, 7, 5  ; x86inc cglobal: 3 args auto-loaded, 7 GPRs, 5 vector registers
+    mov             r3d, r3m              ; fetch the 4th argument (dstStride) by hand
+    add             r3d, r3d              ; double it -- presumably int16_t elements -> bytes; confirm against callers
+    lea             r5, [r1 * 3]          ; r5 = 3 * srcStride (row 3 source offset in the macro)
+    lea             r6, [r3 * 3]          ; r6 = 3 * doubled dstStride (row 3 destination offset)
+
+    ; load constant
+    vpbroadcastd    m4, [pw_2000]         ; replicate the dword at pw_2000 across m4 (pw_2000 is defined outside this hunk)
+
+    PROCESS_P2S_32x8_AVX512               ; rows 0-7
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x16, 3, 7, 5 ; same prologue as the 32x8 variant
+    mov             r3d, r3m              ; dstStride, loaded by hand
+    add             r3d, r3d              ; doubled before use as a store offset
+    lea             r5, [r1 * 3]          ; 3 * srcStride
+    lea             r6, [r3 * 3]          ; 3 * doubled dstStride
+
+    ; load constant
+    vpbroadcastd    m4, [pw_2000]         ; constant subtracted from every shifted word
+
+    PROCESS_P2S_32x8_AVX512               ; rows 0-7
+    lea             r0, [r0 + r1 * 4]     ; macro stopped at row 4 of its group; advance to row 8
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 8-15
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x24, 3, 7, 5 ; three 8-row expansions
+    mov             r3d, r3m              ; dstStride, loaded by hand
+    add             r3d, r3d              ; doubled before use as a store offset
+    lea             r5, [r1 * 3]          ; 3 * srcStride
+    lea             r6, [r3 * 3]          ; 3 * doubled dstStride
+
+    ; load constant
+    vpbroadcastd    m4, [pw_2000]         ; constant subtracted from every shifted word
+
+    PROCESS_P2S_32x8_AVX512               ; rows 0-7
+    lea             r0, [r0 + r1 * 4]     ; advance to the next 8-row group
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 8-15
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 16-23
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x32, 3, 7, 5 ; four 8-row expansions
+    mov             r3d, r3m              ; dstStride, loaded by hand
+    add             r3d, r3d              ; doubled before use as a store offset
+    lea             r5, [r1 * 3]          ; 3 * srcStride
+    lea             r6, [r3 * 3]          ; 3 * doubled dstStride
+
+    ; load constant
+    vpbroadcastd    m4, [pw_2000]         ; constant subtracted from every shifted word
+
+    PROCESS_P2S_32x8_AVX512               ; rows 0-7
+    lea             r0, [r0 + r1 * 4]     ; advance to the next 8-row group
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 8-15
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 16-23
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 24-31
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x48, 3, 7, 5 ; six 8-row expansions
+    mov             r3d, r3m              ; dstStride, loaded by hand
+    add             r3d, r3d              ; doubled before use as a store offset
+    lea             r5, [r1 * 3]          ; 3 * srcStride
+    lea             r6, [r3 * 3]          ; 3 * doubled dstStride
+
+    ; load constant
+    vpbroadcastd    m4, [pw_2000]         ; constant subtracted from every shifted word
+
+    PROCESS_P2S_32x8_AVX512               ; rows 0-7
+    lea             r0, [r0 + r1 * 4]     ; advance to the next 8-row group
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 8-15
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 16-23
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 24-31
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 32-39
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 40-47
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_32x64, 3, 7, 5 ; eight 8-row expansions
+    mov             r3d, r3m              ; dstStride, loaded by hand
+    add             r3d, r3d              ; doubled before use as a store offset
+    lea             r5, [r1 * 3]          ; 3 * srcStride
+    lea             r6, [r3 * 3]          ; 3 * doubled dstStride
+
+    ; load constant
+    vpbroadcastd    m4, [pw_2000]         ; constant subtracted from every shifted word
+
+    PROCESS_P2S_32x8_AVX512               ; rows 0-7
+    lea             r0, [r0 + r1 * 4]     ; advance to the next 8-row group
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 8-15
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 16-23
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 24-31
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 32-39
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 40-47
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 48-55
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x8_AVX512               ; rows 56-63
+    RET
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
 %macro P2S_H_64xN 1
 INIT_XMM ssse3
 cglobal filterPixelToShort_64x%1, 3, 7, 6
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel