# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499854366 -19800
#      Wed Jul 12 15:42:46 2017 +0530
# Node ID 77b61125a20591cb5bad2a15a30cb9114a1d8d30
# Parent  fda2f079d3358900506a7965569c6a9a39d15eb4
x86: AVX512 pixel_sub_ps_32xN

AVX2 performance : 3.35x
AVX512 performance: 6.07x

diff -r fda2f079d335 -r 77b61125a205 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jul 12 15:31:51 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Jul 12 15:42:46 2017 +0530
@@ -3811,6 +3811,9 @@
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2);
 
         p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512);
+        p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); // luma 32x32 -> new AVX512 kernel
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); // 4:2:0 chroma 32x32 shares the 32x32 kernel
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512); // 4:2:2 chroma 32x64 -> new AVX512 kernel
 
     }
 #endif
diff -r fda2f079d335 -r 77b61125a205 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Jul 12 15:31:51 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Wed Jul 12 15:42:46 2017 +0530
@@ -5359,6 +5359,117 @@
 %endif
 
 ;-----------------------------------------------------------------------------
+; void pixel_sub_ps_32x32(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+;-----------------------------------------------------------------------------
+%macro PROCESS_SUB_PS_32x8_AVX512 0 ; 8 rows of dest = src0 - src1 as two 4-row groups; caller must preset r7 = 3*r4, r8 = 3*r5, r9 = 3*r1
+    pmovzxbw    m0,     [r2]                ; src0 row 0, bytes zero-extended to words
+    pmovzxbw    m1,     [r3]                ; src1 row 0
+    pmovzxbw    m2,     [r2 + r4]           ; src0 row 1
+    pmovzxbw    m3,     [r3 + r5]           ; src1 row 1
+    pmovzxbw    m4,     [r2 + 2 * r4]       ; src0 row 2
+    pmovzxbw    m5,     [r3 + 2 * r5]       ; src1 row 2
+    pmovzxbw    m6,     [r2 + r7]           ; src0 row 3 (r7 = 3 * r4)
+    pmovzxbw    m7,     [r3 + r8]           ; src1 row 3 (r8 = 3 * r5)
+
+    psubw       m0,     m1                  ; row 0: src0 - src1 on 16-bit lanes
+    psubw       m2,     m3                  ; row 1
+    psubw       m4,     m5                  ; row 2
+    psubw       m6,     m7                  ; row 3
+
+    movu        [r0],              m0       ; store row 0 of the difference to dest
+    movu        [r0 + r1],         m2       ; row 1
+    movu        [r0 + r1 * 2 ],    m4       ; row 2
+    movu        [r0 + r9],         m6       ; row 3 (r9 = 3 * r1)
+
+    lea         r2,     [r2 + r4 * 4]       ; advance src0 by 4 rows
+    lea         r3,     [r3 + r5 * 4]       ; advance src1 by 4 rows
+    lea         r0,     [r0 + r1 * 4]       ; advance dest by 4 rows
+
+    pmovzxbw    m0,     [r2]                ; second group: src0 row 4
+    pmovzxbw    m1,     [r3]                ; src1 row 4
+    pmovzxbw    m2,     [r2 + r4]           ; src0 row 5
+    pmovzxbw    m3,     [r3 + r5]           ; src1 row 5
+    pmovzxbw    m4,     [r2 + 2 * r4]       ; src0 row 6
+    pmovzxbw    m5,     [r3 + 2 * r5]       ; src1 row 6
+    pmovzxbw    m6,     [r2 + r7]           ; src0 row 7
+    pmovzxbw    m7,     [r3 + r8]           ; src1 row 7
+
+    psubw       m0,     m1                  ; row 4: src0 - src1
+    psubw       m2,     m3                  ; row 5
+    psubw       m4,     m5                  ; row 6
+    psubw       m6,     m7                  ; row 7
+
+    movu        [r0],              m0       ; store row 4
+    movu        [r0 + r1],         m2       ; row 5
+    movu        [r0 + r1 * 2 ],    m4       ; row 6
+    movu        [r0 + r9],         m6       ; row 7
+%endmacro ; on exit r0/r2/r3 still point at row 4 of this 8-row block; the caller steps past rows 4-7
+
+%if HIGH_BIT_DEPTH==0 ; only assembled when HIGH_BIT_DEPTH is 0; the macro reads source pixels as bytes
+%if ARCH_X86_64 ; only assembled on x86-64; the kernels request 10 GPRs (r0-r9)
+INIT_ZMM avx512
+cglobal pixel_sub_ps_32x32, 6, 10, 8 ; 6 args in r0-r5 (prototype order), 10 GPRs, 8 vector regs
+    add         r1,     r1                  ; double destride; dest is int16_t, so presumably element stride -> byte stride
+    lea         r7,     [r4 * 3]            ; r7 = 3 * srcstride0 (offset of row 3 in src0)
+    lea         r8,     [r5 * 3]            ; r8 = 3 * srcstride1 (offset of row 3 in src1)
+    lea         r9,     [r1 * 3]            ; r9 = 3 * doubled destride (offset of row 3 in dest)
+
+    PROCESS_SUB_PS_32x8_AVX512 ; rows 0-7
+    lea         r2,     [r2 + r4 * 4]       ; step src0 past the last 4 rows of the block just processed
+    lea         r3,     [r3 + r5 * 4]       ; same for src1
+    lea         r0,     [r0 + r1 * 4]       ; same for dest
+    PROCESS_SUB_PS_32x8_AVX512 ; rows 8-15
+    lea         r2,     [r2 + r4 * 4]
+    lea         r3,     [r3 + r5 * 4]
+    lea         r0,     [r0 + r1 * 4]
+    PROCESS_SUB_PS_32x8_AVX512 ; rows 16-23
+    lea         r2,     [r2 + r4 * 4]
+    lea         r3,     [r3 + r5 * 4]
+    lea         r0,     [r0 + r1 * 4]
+    PROCESS_SUB_PS_32x8_AVX512 ; rows 24-31; no pointer step needed after the final block
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_sub_ps_32x64, 6, 10, 8 ; same arguments and register budget as the 32x32 kernel, 64 rows
+    add         r1,     r1                  ; double destride; dest is int16_t, so presumably element stride -> byte stride
+    lea         r7,     [r4 * 3]            ; r7 = 3 * srcstride0
+    lea         r8,     [r5 * 3]            ; r8 = 3 * srcstride1
+    lea         r9,     [r1 * 3]            ; r9 = 3 * doubled destride
+
+    PROCESS_SUB_PS_32x8_AVX512 ; rows 0-7
+    lea         r2,     [r2 + r4 * 4]       ; step src0 past the last 4 rows of the block just processed
+    lea         r3,     [r3 + r5 * 4]       ; same for src1
+    lea         r0,     [r0 + r1 * 4]       ; same for dest
+    PROCESS_SUB_PS_32x8_AVX512 ; rows 8-15
+    lea         r2,     [r2 + r4 * 4]
+    lea         r3,     [r3 + r5 * 4]
+    lea         r0,     [r0 + r1 * 4]
+    PROCESS_SUB_PS_32x8_AVX512 ; rows 16-23
+    lea         r2,     [r2 + r4 * 4]
+    lea         r3,     [r3 + r5 * 4]
+    lea         r0,     [r0 + r1 * 4]
+    PROCESS_SUB_PS_32x8_AVX512 ; rows 24-31
+    lea         r2,     [r2 + r4 * 4]
+    lea         r3,     [r3 + r5 * 4]
+    lea         r0,     [r0 + r1 * 4]
+    PROCESS_SUB_PS_32x8_AVX512 ; rows 32-39
+    lea         r2,     [r2 + r4 * 4]
+    lea         r3,     [r3 + r5 * 4]
+    lea         r0,     [r0 + r1 * 4]
+    PROCESS_SUB_PS_32x8_AVX512 ; rows 40-47
+    lea         r2,     [r2 + r4 * 4]
+    lea         r3,     [r3 + r5 * 4]
+    lea         r0,     [r0 + r1 * 4]
+    PROCESS_SUB_PS_32x8_AVX512 ; rows 48-55
+    lea         r2,     [r2 + r4 * 4]
+    lea         r3,     [r3 + r5 * 4]
+    lea         r0,     [r0 + r1 * 4]
+    PROCESS_SUB_PS_32x8_AVX512 ; rows 56-63; no pointer step needed after the final block
+    RET
+%endif
+%endif
+
+;-----------------------------------------------------------------------------
 ; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
 ;-----------------------------------------------------------------------------
 %macro PIXELSUB_PS_W64_H2 2
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel