# HG changeset patch
# User Jayashri Murugan <jayas...@multicorewareinc.com>
# Date 1496987374 -19800
#      Fri Jun 09 11:19:34 2017 +0530
# Node ID 88c2c703a20129df4a06b530bb10254c582bf342
# Parent  0e351679c063149e0c9f7677a0c75c679918281b
avx2: 'integral32h' asm code -> 1.07x faster than 'C' version
integral_init32h  1.07x  1142.99  1218.1

diff -r 0e351679c063 -r 88c2c703a201 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jun 09 11:05:00 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Jun 09 11:19:34 2017 +0530
@@ -3718,6 +3718,7 @@
         p.integral_inith[INTEGRAL_12] = PFX(integral12h_avx2);
         p.integral_inith[INTEGRAL_16] = PFX(integral16h_avx2);
         p.integral_inith[INTEGRAL_24] = PFX(integral24h_avx2);
+        p.integral_inith[INTEGRAL_32] = PFX(integral32h_avx2);
     }
 #endif

diff -r 0e351679c063 -r 88c2c703a201 source/common/x86/seaintegral.asm
--- a/source/common/x86/seaintegral.asm	Fri Jun 09 11:05:00 2017 +0530
+++ b/source/common/x86/seaintegral.asm	Fri Jun 09 11:19:34 2017 +0530
@@ -892,10 +892,177 @@
 .end
     RET
 
+%macro INTEGRAL_THIRTYTWO_HORIZONTAL_16 0
+    pmovzxbw       m0, [r1]
+    pmovzxbw       m1, [r1 + 1]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 2]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 3]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 4]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 5]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 6]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 7]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 8]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 9]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 10]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 11]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 12]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 13]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 14]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 15]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 16]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 17]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 18]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 19]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 20]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 21]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 22]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 23]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 24]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 25]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 26]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 27]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 28]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 29]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 30]
+    paddw          m0, m1
+    pmovzxbw       m1, [r1 + 31]
+    paddw          m0, m1
+%endmacro
+
+
+%macro INTEGRAL_THIRTYTWO_HORIZONTAL_8 0
+    pmovzxbw       xm0, [r1]
+    pmovzxbw       xm1, [r1 + 1]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 2]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 3]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 4]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 5]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 6]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 7]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 8]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 9]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 10]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 11]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 12]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 13]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 14]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 15]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 16]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 17]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 18]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 19]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 20]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 21]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 22]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 23]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 24]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 25]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 26]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 27]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 28]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 29]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 30]
+    paddw          xm0, xm1
+    pmovzxbw       xm1, [r1 + 31]
+    paddw          xm0, xm1
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ;static void integral_init32h_c(uint32_t *sum, pixel *pix, intptr_t stride)
 ;-----------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal integral32h, 3, 3, 0
-
+cglobal integral32h, 3, 5, 3
+    lea            r3, [4 * r2]
+    sub            r0, r3
+    sub            r2, 32                    ;stride - 32
+    mov            r4, r2
+    shr            r4, 4
+
+.loop_16:
+    INTEGRAL_THIRTYTWO_HORIZONTAL_16
+    vperm2i128     m2, m0, m0, 1
+    pmovzxwd       m2, xm2
+    pmovzxwd       m0, xm0
+    movu           m1, [r0]
+    paddd          m0, m1
+    movu           [r0 + r3], m0
+    movu           m1, [r0 + 32]
+    paddd          m2, m1
+    movu           [r0 + r3 + 32], m2
+    add            r1, 16
+    add            r0, 64
+    sub            r2, 16
+    sub            r4, 1
+    jnz            .loop_16
+    cmp            r2, 8
+    je             .loop_8
+    jmp            .end
+
+.loop_8:
+    INTEGRAL_THIRTYTWO_HORIZONTAL_8
+    pmovzxwd       m0, xm0
+    movu           m1, [r0]
+    paddd          m0, m1
+    movu           [r0 + r3], m0
+    jmp            .end
+
+.end
     RET
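
For anyone cross-checking the new loop against the scalar path: each
.loop_16 iteration builds the 32-pixel horizontal window sums for 16
consecutive output positions in 16-bit lanes (32 * 255 = 8160 fits in a
word), widens them to 32 bits, and adds the integral row above (r0 is
rewound by one row via "sub r0, r3" before the loop); .loop_8 handles an
8-wide remainder when (stride - 32) is not a multiple of 16. A minimal C
sketch of the same computation is below -- an illustration only, not the
actual integral_init32h_c body, and the pixel typedef assumes an 8-bit
build:

    #include <stdint.h>

    typedef uint8_t pixel;  /* assumption: 8-bit-depth build */

    /* Sketch of what the AVX2 code computes: for each x,
     * sum[x] = (sum of the 32 pixels starting at pix[x]) + sum[x - stride],
     * where sum[x - stride] is the integral value from the row above. */
    static void integral_init32h_sketch(uint32_t *sum, const pixel *pix,
                                        intptr_t stride)
    {
        for (intptr_t x = 0; x < stride - 32; x++)
        {
            uint32_t v = 0;
            for (int i = 0; i < 32; i++)
                v += (uint32_t)pix[x + i];
            sum[x] = v + sum[x - stride];
        }
    }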