# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499924389 -19800
#      Thu Jul 13 11:09:49 2017 +0530
# Node ID c1b7926fb590752578aa8cd17f4b86a7f743791b
# Parent  238c5ee9ad24dc6b283bb399eb013d937bc9ac1e
x86: AVX512 pixel_add_ps_32xN
AVX2 performance: 14.81x AVX512 performance: 18.01x diff -r 238c5ee9ad24 -r c1b7926fb590 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 13 10:49:56 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jul 13 11:09:49 2017 +0530 @@ -3806,6 +3806,9 @@ p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512); p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512); + p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512); + p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2); + p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2); } #endif diff -r 238c5ee9ad24 -r c1b7926fb590 source/common/x86/pixeladd8.asm --- a/source/common/x86/pixeladd8.asm Thu Jul 13 10:49:56 2017 +0530 +++ b/source/common/x86/pixeladd8.asm Thu Jul 13 11:09:49 2017 +0530 @@ -768,6 +768,131 @@ PIXEL_ADD_PS_W32_H4_avx2 32 PIXEL_ADD_PS_W32_H4_avx2 64 +;----------------------------------------------------------------------------- +; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +;----------------------------------------------------------------------------- +%macro PROCESS_ADD_PS_32x8_AVX512 0 + pmovzxbw m0, [r2] ; row 0 of src0 + movu m1, [r3] ; row 0 of src1 + pmovzxbw m2, [r2 + r4] ; row 1 of src0 + movu m3, [r3 + r5] ; row 1 of src1 + pmovzxbw m4, [r2 + r4 * 2] ; row 2 of src0 + movu m5, [r3 + r5 * 2] ; row 2 of src1 + pmovzxbw m6, [r2 + r7] ; row 3 of src0 + movu m7, [r3 + r8] ; row 3 of src1 + + paddw m0, m1 + paddw m2, m3 + paddw m4, m5 + paddw m6, m7 + packuswb m0, m2 + packuswb m4, m6 + vpermq m0, m0, 11011000b + vpermq m4, m4, 11011000b + vshufi64x2 m0, m0, 11011000b + vshufi64x2 m4, m4, 11011000b + movu [r0], ym0 ; row 0 of dst + movu [r0 + r1 * 2], ym4 ; row 2 of dst + vshufi64x2 m0, m0, 01001110b + vshufi64x2 m4, m4, 01001110b + movu [r0 + r1], ym0 ; row 1 of dst + movu [r0 + r9], ym4 ; row 3 of dst + + lea 
r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] + + pmovzxbw m0, [r2] ; row 4 of src0 + movu m1, [r3] ; row 4 of src1 + pmovzxbw m2, [r2 + r4] ; row 5 of src0 + movu m3, [r3 + r5] ; row 5 of src1 + pmovzxbw m4, [r2 + r4 * 2] ; row 6 of src0 + movu m5, [r3 + r5 * 2] ; row 6 of src1 + pmovzxbw m6, [r2 + r7] ; row 7 of src0 + movu m7, [r3 + r8] ; row 7 of src1 + + paddw m0, m1 + paddw m2, m3 + paddw m4, m5 + paddw m6, m7 + packuswb m0, m2 + packuswb m4, m6 + vpermq m0, m0, 11011000b + vpermq m4, m4, 11011000b + vshufi64x2 m0, m0, 11011000b + vshufi64x2 m4, m4, 11011000b + movu [r0], ym0 ; row 4 of dst + movu [r0 + r1 * 2], ym4 ; row 6 of dst + vshufi64x2 m0, m0, 01001110b + vshufi64x2 m4, m4, 01001110b + movu [r0 + r1], ym0 ; row 5 of dst + movu [r0 + r9], ym4 ; row 7 of dst +%endmacro + + +%if HIGH_BIT_DEPTH==0 +%if ARCH_X86_64 +INIT_ZMM avx512 +cglobal pixel_add_ps_32x32, 6, 10, 8 + add r5, r5 + lea r7, [r4 * 3] + lea r8, [r5 * 3] + lea r9, [r1 * 3] + + PROCESS_ADD_PS_32x8_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] + PROCESS_ADD_PS_32x8_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] + PROCESS_ADD_PS_32x8_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] + PROCESS_ADD_PS_32x8_AVX512 + RET + +INIT_ZMM avx512 +cglobal pixel_add_ps_32x64, 6, 10, 8 + add r5, r5 + lea r7, [r4 * 3] + lea r8, [r5 * 3] + lea r9, [r1 * 3] + + PROCESS_ADD_PS_32x8_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] + PROCESS_ADD_PS_32x8_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] + PROCESS_ADD_PS_32x8_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] + PROCESS_ADD_PS_32x8_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] + PROCESS_ADD_PS_32x8_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] + PROCESS_ADD_PS_32x8_AVX512 + lea r2, [r2 + 
r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] + PROCESS_ADD_PS_32x8_AVX512 + lea r2, [r2 + r4 * 4] + lea r3, [r3 + r5 * 4] + lea r0, [r0 + r1 * 4] + PROCESS_ADD_PS_32x8_AVX512 + RET +%endif +%endif ;----------------------------------------------------------------------------- ; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel