# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500447343 -19800
#      Wed Jul 19 12:25:43 2017 +0530
# Node ID 97d5ab44b6da2db69584875c2dde97aef5533d9b
# Parent  60c8ad7f3cadcfe7bb5242a89908546ce38bb5d1
x86: AVX512 convert_p2s 48x64
AVX2 performance : 2.22x
AVX512 performance: 3.01x

diff -r 60c8ad7f3cad -r 97d5ab44b6da source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jul 19 11:59:13 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jul 19 12:25:43 2017 +0530
@@ -3841,6 +3841,7 @@
         p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512);
         p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
         p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
+        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
diff -r 60c8ad7f3cad -r 97d5ab44b6da source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Jul 19 11:59:13 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Wed Jul 19 12:25:43 2017 +0530
@@ -3047,6 +3047,133 @@
     jnz         .loop
     RET
 
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+; PROCESS_P2S_48x8_AVX512: converts 8 rows x 48 columns of 8-bit pixels to
+; 16-bit samples, dst = (src << 6) - m4.
+; In:    r0 = src, r1 = src stride, r5 = 3 * r1
+;        r2 = dst, r3 = dst stride in bytes, r6 = 3 * r3
+;        m4 = packed word constant subtracted from every sample
+; Out:   r0/r2 advanced by 4 rows only; the caller adds the other 4 rows
+; Clobb: m0-m3
+;-----------------------------------------------------------------------------
+%macro PROCESS_P2S_48x8_AVX512 0
+    ; rows 0-3, columns 0-31: bytes zero-extended to words, one full register per row
+    pmovzxbw        m0, [r0]
+    pmovzxbw        m1, [r0 + r1]
+    pmovzxbw        m2, [r0 + r1 * 2]
+    pmovzxbw        m3, [r0 + r5]
+    psllw           m0, 6
+    psllw           m1, 6
+    psllw           m2, 6
+    psllw           m3, 6
+    psubw           m0, m4
+    psubw           m1, m4
+    psubw           m2, m4
+    psubw           m3, m4
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3
+
+    ; rows 0-3, columns 32-47: ymm halves; source offset 32 bytes, dst offset 64 bytes
+    pmovzxbw        ym0, [r0 + 32]
+    pmovzxbw        ym1, [r0 + r1 + 32]
+    pmovzxbw        ym2, [r0 + r1 * 2 + 32]
+    pmovzxbw        ym3, [r0 + r5 + 32]
+    psllw           ym0, 6
+    psllw           ym1, 6
+    psllw           ym2, 6
+    psllw           ym3, 6
+    psubw           ym0, ym4
+    psubw           ym1, ym4
+    psubw           ym2, ym4
+    psubw           ym3, ym4
+    movu            [r2 + 64], ym0
+    movu            [r2 + r3 + 64], ym1
+    movu            [r2 + r3 * 2 + 64], ym2
+    movu            [r2 + r6 + 64], ym3
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    ; rows 4-7, columns 0-31
+    pmovzxbw        m0, [r0]
+    pmovzxbw        m1, [r0 + r1]
+    pmovzxbw        m2, [r0 + r1 * 2]
+    pmovzxbw        m3, [r0 + r5]
+    psllw           m0, 6
+    psllw           m1, 6
+    psllw           m2, 6
+    psllw           m3, 6
+    psubw           m0, m4
+    psubw           m1, m4
+    psubw           m2, m4
+    psubw           m3, m4
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3
+
+    ; rows 4-7, columns 32-47
+    pmovzxbw        ym0, [r0 + 32]
+    pmovzxbw        ym1, [r0 + r1 + 32]
+    pmovzxbw        ym2, [r0 + r1 * 2 + 32]
+    pmovzxbw        ym3, [r0 + r5 + 32]
+    psllw           ym0, 6
+    psllw           ym1, 6
+    psllw           ym2, 6
+    psllw           ym3, 6
+    psubw           ym0, ym4
+    psubw           ym1, ym4
+    psubw           ym2, ym4
+    psubw           ym3, ym4
+    movu            [r2 + 64], ym0
+    movu            [r2 + r3 + 64], ym1
+    movu            [r2 + r3 * 2 + 64], ym2
+    movu            [r2 + r6 + 64], ym3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; filterPixelToShort_48x64 (AVX-512): 64 rows x 48 columns as eight 8-row passes.
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (read from r3m and doubled
+; so it can be used as a byte offset); r5 = 3 * r1, r6 = 3 * r3; m4 = pw_2000.
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal filterPixelToShort_48x64, 3,7,5
+    mov             r3d, r3m
+    add             r3d, r3d                ; dst samples are 2 bytes wide: stride -> byte offset
+    lea             r5, [r1 * 3]
+    lea             r6, [r3 * 3]
+
+    ; load constant: it has to be m4, the register PROCESS_P2S_48x8_AVX512
+    ; subtracts, and the last of the five vector registers declared by cglobal
+    vpbroadcastd    m4, [pw_2000]
+
+    PROCESS_P2S_48x8_AVX512
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    PROCESS_P2S_48x8_AVX512
+    RET
 
 %macro PROCESS_LUMA_W4_4R 0
     movd        m0, [r0]
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel