# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501571354 -19800
#      Tue Aug 01 12:39:14 2017 +0530
# Node ID f8687bef93f25b343606e42f4fd252d5f0897d1a
# Parent  05972a61eb1aeac474ecc0d0150671e879177112
x86: AVX512 pixel_add_ps_32xN for high bit depth

AVX2 performance   : 12.77x
AVX512 performance : 21.54x

This patch also cleans up the low bit depth code.
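
For context when reading the kernels below: add_ps adds the 16-bit residual plane (src1) to the pixel plane (src0) and clamps each sum to the valid pixel range. The low bit depth path saturates to 8 bits (paddw + packuswb); the high bit depth path clips against pw_pixel_max (paddw + CLIPW2). A minimal scalar sketch of the equivalent behaviour, with illustrative names only (this is not the x265 reference code):

#include <cstdint>

// Scalar model of pixel_add_ps_WxH: dst = clip(src0 + src1).
// For 8-bit builds, pixel is uint8_t and pixelMax is 255 (what packuswb
// saturation produces); for HIGH_BIT_DEPTH builds, pixel is uint16_t and
// pixelMax comes from pw_pixel_max (1023 at 10-bit), applied by CLIPW2.
template<typename pixel>
static void add_ps_ref(pixel* dst, intptr_t dstStride,
                       const pixel* src0, const int16_t* src1,
                       intptr_t srcStride0, intptr_t srcStride1,
                       int width, int height, int pixelMax)
{
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            int sum = src0[x] + src1[x];   // residual may be negative
            if (sum < 0)
                sum = 0;
            if (sum > pixelMax)
                sum = pixelMax;
            dst[x] = (pixel)sum;
        }
        dst  += dstStride;
        src0 += srcStride0;
        src1 += srcStride1;
    }
}

The 32xN AVX512 kernels in this patch vectorize this loop a full 32-pixel row at a time (32 words per zmm register).
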
diff -r 05972a61eb1a -r f8687bef93f2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Aug 01 10:56:55 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Aug 01 12:39:14 2017 +0530
@@ -2198,6 +2198,9 @@
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512);
 
         p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
+        p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512);
 
         // 64 X N
         p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
@@ -3893,8 +3896,8 @@
 
         p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
         p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512);
 
         p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512);
         p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
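
One non-obvious detail in the low bit depth kernels that follow: packuswb on a 512-bit register packs within each 128-bit lane (8 bytes of row N, then 8 bytes of row N+1, per lane), so the two packed rows come out interleaved in 64-bit chunks. The vpermq/vshufi64x2 pair with immediate 11011000b (0xD8) restores linear order, leaving row N in the low ymm half and row N+1 in the high half; the 01001110b (0x4E) half-swap then exposes the second row for its store. A standalone sketch that emulates just this byte reordering (illustrative only, not part of the patch):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    // Two 32-pixel rows, already summed and saturated to 8 bits.
    uint8_t rowA[32], rowB[32], zmm[64], tmp[64];
    for (int i = 0; i < 32; i++) { rowA[i] = (uint8_t)i; rowB[i] = (uint8_t)(100 + i); }

    // packuswb m0, m2: 128-bit lane k = rowA[8k..8k+7] then rowB[8k..8k+7]
    for (int lane = 0; lane < 4; lane++)
    {
        std::copy(rowA + lane * 8, rowA + lane * 8 + 8, zmm + lane * 16);
        std::copy(rowB + lane * 8, rowB + lane * 8 + 8, zmm + lane * 16 + 8);
    }

    // vpermq m0, m0, 11011000b: qword order 0,2,1,3 within each 256-bit half
    static const int q[8] = { 0, 2, 1, 3, 4, 6, 5, 7 };
    for (int i = 0; i < 8; i++)
        std::copy(zmm + q[i] * 8, zmm + q[i] * 8 + 8, tmp + i * 8);

    // vshufi64x2 m0, m0, 11011000b: 128-bit block order 0,2,1,3
    static const int b[4] = { 0, 2, 1, 3 };
    for (int i = 0; i < 4; i++)
        std::copy(tmp + b[i] * 16, tmp + b[i] * 16 + 16, zmm + i * 16);

    // movu [r0], ym0 can now store row N; the 0x4E block swap exposes
    // row N+1 for the movu [r0 + r1] store.
    std::printf("low  half == rowA: %d\n", (int)std::equal(zmm, zmm + 32, rowA));
    std::printf("high half == rowB: %d\n", (int)std::equal(zmm + 32, zmm + 64, rowB));
    return 0;
}
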
diff -r 05972a61eb1a -r f8687bef93f2 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm	Tue Aug 01 10:56:55 2017 +0530
+++ b/source/common/x86/pixeladd8.asm	Tue Aug 01 12:39:14 2017 +0530
@@ -769,132 +769,6 @@
 PIXEL_ADD_PS_W32_H4_avx2 64
 
 ;-----------------------------------------------------------------------------
-; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
-;-----------------------------------------------------------------------------
-%macro PROCESS_ADD_PS_32x8_AVX512 0
-    pmovzxbw    m0, [r2]                ; row 0 of src0
-    movu        m1, [r3]                ; row 0 of src1
-    pmovzxbw    m2, [r2 + r4]           ; row 1 of src0
-    movu        m3, [r3 + r5]           ; row 1 of src1
-    pmovzxbw    m4, [r2 + r4 * 2]       ; row 2 of src0
-    movu        m5, [r3 + r5 * 2]       ; row 2 of src1
-    pmovzxbw    m6, [r2 + r7]           ; row 3 of src0
-    movu        m7, [r3 + r8]           ; row 3 of src1
-
-    paddw       m0, m1
-    paddw       m2, m3
-    paddw       m4, m5
-    paddw       m6, m7
-    packuswb    m0, m2
-    packuswb    m4, m6
-    vpermq      m0, m0, 11011000b
-    vpermq      m4, m4, 11011000b
-    vshufi64x2  m0, m0, 11011000b
-    vshufi64x2  m4, m4, 11011000b
-    movu        [r0], ym0               ; row 0 of dst
-    movu        [r0 + r1 * 2], ym4      ; row 2 of dst
-    vshufi64x2  m0, m0, 01001110b
-    vshufi64x2  m4, m4, 01001110b
-    movu        [r0 + r1], ym0          ; row 1 of dst
-    movu        [r0 + r9], ym4          ; row 3 of dst
-
-    lea         r2, [r2 + r4 * 4]
-    lea         r3, [r3 + r5 * 4]
-    lea         r0, [r0 + r1 * 4]
-
-    pmovzxbw    m0, [r2]                ; row 4 of src0
-    movu        m1, [r3]                ; row 4 of src1
-    pmovzxbw    m2, [r2 + r4]           ; row 5 of src0
-    movu        m3, [r3 + r5]           ; row 5 of src1
-    pmovzxbw    m4, [r2 + r4 * 2]       ; row 6 of src0
-    movu        m5, [r3 + r5 * 2]       ; row 6 of src1
-    pmovzxbw    m6, [r2 + r7]           ; row 7 of src0
-    movu        m7, [r3 + r8]           ; row 7 of src1
-
-    paddw       m0, m1
-    paddw       m2, m3
-    paddw       m4, m5
-    paddw       m6, m7
-    packuswb    m0, m2
-    packuswb    m4, m6
-    vpermq      m0, m0, 11011000b
-    vpermq      m4, m4, 11011000b
-    vshufi64x2  m0, m0, 11011000b
-    vshufi64x2  m4, m4, 11011000b
-    movu        [r0], ym0               ; row 4 of dst
-    movu        [r0 + r1 * 2], ym4      ; row 6 of dst
-    vshufi64x2  m0, m0, 01001110b
-    vshufi64x2  m4, m4, 01001110b
-    movu        [r0 + r1], ym0          ; row 5 of dst
-    movu        [r0 + r9], ym4          ; row 7 of dst
-%endmacro
-
-
-%if HIGH_BIT_DEPTH==0
-%if ARCH_X86_64
-INIT_ZMM avx512
-cglobal pixel_add_ps_32x32, 6, 10, 8
-    add r5, r5
-    lea r7, [r4 * 3]
-    lea r8, [r5 * 3]
-    lea r9, [r1 * 3]
-
-    PROCESS_ADD_PS_32x8_AVX512
-    lea r2, [r2 + r4 * 4]
-    lea r3, [r3 + r5 * 4]
-    lea r0, [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea r2, [r2 + r4 * 4]
-    lea r3, [r3 + r5 * 4]
-    lea r0, [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea r2, [r2 + r4 * 4]
-    lea r3, [r3 + r5 * 4]
-    lea r0, [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    RET
-
-INIT_ZMM avx512
-cglobal pixel_add_ps_32x64, 6, 10, 8
-    add r5, r5
-    lea r7, [r4 * 3]
-    lea r8, [r5 * 3]
-    lea r9, [r1 * 3]
-
-    PROCESS_ADD_PS_32x8_AVX512
-    lea r2, [r2 + r4 * 4]
-    lea r3, [r3 + r5 * 4]
-    lea r0, [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea r2, [r2 + r4 * 4]
-    lea r3, [r3 + r5 * 4]
-    lea r0, [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea r2, [r2 + r4 * 4]
-    lea r3, [r3 + r5 * 4]
-    lea r0, [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea r2, [r2 + r4 * 4]
-    lea r3, [r3 + r5 * 4]
-    lea r0, [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea r2, [r2 + r4 * 4]
-    lea r3, [r3 + r5 * 4]
-    lea r0, [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea r2, [r2 + r4 * 4]
-    lea r3, [r3 + r5 * 4]
-    lea r0, [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    lea r2, [r2 + r4 * 4]
-    lea r3, [r3 + r5 * 4]
-    lea r0, [r0 + r1 * 4]
-    PROCESS_ADD_PS_32x8_AVX512
-    RET
-%endif
-%endif
-
-;-----------------------------------------------------------------------------
 ; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
 %macro PIXEL_ADD_PS_W64_H2 2
@@ -1272,7 +1146,7 @@
 %endif
 
 ;-----------------------------------------------------------------------------
-; pixel_add_ps_64x64 avx512 code start
+; pixel_add_ps avx512 code start
 ;-----------------------------------------------------------------------------
 %macro PROCESS_ADD_PS_64x8_AVX512 0
     pmovzxbw m0, [r2]
@@ -1553,6 +1427,250 @@
     RET
 %endif
 %endif
+
+%macro PROCESS_ADD_PS_32x8_AVX512 0
+    pmovzxbw m0, [r2]
+    movu m1, [r3]
+    pmovzxbw m2, [r2 + r4]
+    movu m3, [r3 + r5]
+    paddw m0, m1
+    paddw m2, m3
+    packuswb m0, m2
+    vpermq m0, m0, 11011000b
+    vshufi64x2 m0, m0, 11011000b
+    movu [r0], ym0
+    vshufi64x2 m0, m0, 01001110b
+    movu [r0 + r1], ym0
+
+    pmovzxbw m0, [r2 + r4 * 2]
+    movu m1, [r3 + r5 * 2]
+    pmovzxbw m2, [r2 + r6]
+    movu m3, [r3 + r7]
+    paddw m0, m1
+    paddw m2, m3
+    packuswb m0, m2
+    vpermq m0, m0, 11011000b
+    vshufi64x2 m0, m0, 11011000b
+    movu [r0 + r1 * 2], ym0
+    vshufi64x2 m0, m0, 01001110b
+    movu [r0 + r8], ym0
+
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+
+    pmovzxbw m0, [r2]
+    movu m1, [r3]
+    pmovzxbw m2, [r2 + r4]
+    movu m3, [r3 + r5]
+    paddw m0, m1
+    paddw m2, m3
+    packuswb m0, m2
+    vpermq m0, m0, 11011000b
+    vshufi64x2 m0, m0, 11011000b
+    movu [r0], ym0
+    vshufi64x2 m0, m0, 01001110b
+    movu [r0 + r1], ym0
+
+    pmovzxbw m0, [r2 + r4 * 2]
+    movu m1, [r3 + r5 * 2]
+    pmovzxbw m2, [r2 + r6]
+    movu m3, [r3 + r7]
+    paddw m0, m1
+    paddw m2, m3
+    packuswb m0, m2
+    vpermq m0, m0, 11011000b
+    vshufi64x2 m0, m0, 11011000b
+    movu [r0 + r1 * 2], ym0
+    vshufi64x2 m0, m0, 01001110b
+    movu [r0 + r8], ym0
+%endmacro
+
+%macro PROCESS_ADD_PS_32x8_HBD_AVX512 0
+    movu m0, [r2]
+    movu m1, [r2 + r4]
+    movu m2, [r3]
+    movu m3, [r3 + r5]
+    paddw m0, m2
+    paddw m1, m3
+
+    CLIPW2 m0, m1, m4, m5
+    movu [r0], m0
+    movu [r0 + r1], m1
+
+    movu m0, [r2 + r4 * 2]
+    movu m1, [r2 + r6]
+    movu m2, [r3 + r5 * 2]
+    movu m3, [r3 + r7]
+    paddw m0, m2
+    paddw m1, m3
+
+    CLIPW2 m0, m1, m4, m5
+    movu [r0 + r1 * 2], m0
+    movu [r0 + r8], m1
+
+    lea r0, [r0 + r1 * 4]
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+
+    movu m0, [r2]
+    movu m1, [r2 + r4]
+    movu m2, [r3]
+    movu m3, [r3 + r5]
+    paddw m0, m2
+    paddw m1, m3
+
+    CLIPW2 m0, m1, m4, m5
+    movu [r0], m0
+    movu [r0 + r1], m1
+
+    movu m0, [r2 + r4 * 2]
+    movu m1, [r2 + r6]
+    movu m2, [r3 + r5 * 2]
+    movu m3, [r3 + r7]
+    paddw m0, m2
+    paddw m1, m3
+
+    CLIPW2 m0, m1, m4, m5
+    movu [r0 + r1 * 2], m0
+    movu [r0 + r8], m1
+%endmacro
 ;-----------------------------------------------------------------------------
-; pixel_add_ps_64x64 avx512 code end
+; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
 ;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x32, 6, 9, 6
+    vbroadcasti32x8 m5, [pw_pixel_max]
+    pxor m4, m4
+    add r4d, r4d
+    add r5d, r5d
+    add r1d, r1d
+    lea r6, [r4 * 3]
+    lea r7, [r5 * 3]
+    lea r8, [r1 * 3]
+
+    PROCESS_ADD_PS_32x8_HBD_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x64, 6, 9, 6
+    vbroadcasti32x8 m5, [pw_pixel_max]
+    pxor m4, m4
+    add r4d, r4d
+    add r5d, r5d
+    add r1d, r1d
+    lea r6, [r4 * 3]
+    lea r7, [r5 * 3]
+    lea r8, [r1 * 3]
+
+    PROCESS_ADD_PS_32x8_HBD_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_HBD_AVX512
+    RET
+%endif
+%else
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x32, 6, 9, 4
+    add r5, r5
+    lea r6, [r4 * 3]
+    lea r7, [r5 * 3]
+    lea r8, [r1 * 3]
+
+    PROCESS_ADD_PS_32x8_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x64, 6, 9, 4
+    add r5, r5
+    lea r6, [r4 * 3]
+    lea r7, [r5 * 3]
+    lea r8, [r1 * 3]
+
+    PROCESS_ADD_PS_32x8_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512
+    lea r2, [r2 + r4 * 4]
+    lea r3, [r3 + r5 * 4]
+    lea r0, [r0 + r1 * 4]
+    PROCESS_ADD_PS_32x8_AVX512
+    RET
+%endif
+%endif
+;-----------------------------------------------------------------------------
+; pixel_add_ps avx512 code end
+;-----------------------------------------------------------------------------

_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel