# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1431582271 -19800
#      Thu May 14 11:14:31 2015 +0530
# Node ID 722ec5cd93ab072a182f0d94ad53ce4e8ad34f94
# Parent  479087422e29a672d6e9bc8d0cd2a65649d71fe2
asm: addAvg avx2 code for high_bit_depth, sizes >= 8; improved by ~45% over the previous code
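
For reference, the scalar operation these AVX2 kernels implement is sketched below in C. This is a minimal illustration assuming 10-bit depth (output clamped to [0, 1023]); the function and parameter names are illustrative, not the exact x265 C primitive. Each kernel adds the two int16 intermediate predictions, performs a rounding shift by 5 (pmulhrsw against pw_1024), re-adds the bi-prediction offset of 512 (pw_512), and clamps the result to pw_1023. The pw_1023 table in const-a.asm is widened from 8 to 16 words so the clamp constant fills a full ymm register.

    #include <stdint.h>

    /* Illustrative 10-bit reference for the addAvg kernels below (hypothetical
     * names, not the actual x265 C primitive). src0/src1 are int16 intermediate
     * predictions; strides are in elements. The 16-bit wrap of paddw in the asm
     * is ignored here, which holds for in-range intermediates. */
    static void addAvg_ref_10bit(const int16_t* src0, const int16_t* src1,
                                 uint16_t* dst, intptr_t src0Stride,
                                 intptr_t src1Stride, intptr_t dstStride,
                                 int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                /* (a + b + 16) >> 5 is what pmulhrsw with pw_1024 computes */
                int v = ((src0[x] + src1[x] + 16) >> 5) + 512; /* +512 == pw_512 */
                v = v < 0 ? 0 : (v > 1023 ? 1023 : v);         /* pmaxsw/pminsw  */
                dst[x] = (uint16_t)v;
            }
            src0 += src0Stride;
            src1 += src1Stride;
            dst  += dstStride;
        }
    }
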
diff -r 479087422e29 -r 722ec5cd93ab source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed May 13 16:52:59 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp	Thu May 14 11:14:31 2015 +0530
@@ -1181,6 +1181,29 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.pu[LUMA_8x4].addAvg = x265_addAvg_8x4_avx2;
+        p.pu[LUMA_8x8].addAvg = x265_addAvg_8x8_avx2;
+        p.pu[LUMA_8x16].addAvg = x265_addAvg_8x16_avx2;
+        p.pu[LUMA_8x32].addAvg = x265_addAvg_8x32_avx2;
+        p.pu[LUMA_12x16].addAvg = x265_addAvg_12x16_avx2;
+        p.pu[LUMA_16x4].addAvg = x265_addAvg_16x4_avx2;
+        p.pu[LUMA_16x8].addAvg = x265_addAvg_16x8_avx2;
+        p.pu[LUMA_16x12].addAvg = x265_addAvg_16x12_avx2;
+        p.pu[LUMA_16x16].addAvg = x265_addAvg_16x16_avx2;
+        p.pu[LUMA_16x32].addAvg = x265_addAvg_16x32_avx2;
+        p.pu[LUMA_16x64].addAvg = x265_addAvg_16x64_avx2;
+        p.pu[LUMA_24x32].addAvg = x265_addAvg_24x32_avx2;
+        p.pu[LUMA_32x8].addAvg = x265_addAvg_32x8_avx2;
+        p.pu[LUMA_32x16].addAvg = x265_addAvg_32x16_avx2;
+        p.pu[LUMA_32x24].addAvg = x265_addAvg_32x24_avx2;
+        p.pu[LUMA_32x32].addAvg = x265_addAvg_32x32_avx2;
+        p.pu[LUMA_32x64].addAvg = x265_addAvg_32x64_avx2;
+        p.pu[LUMA_48x64].addAvg = x265_addAvg_48x64_avx2;
+        p.pu[LUMA_64x16].addAvg = x265_addAvg_64x16_avx2;
+        p.pu[LUMA_64x32].addAvg = x265_addAvg_64x32_avx2;
+        p.pu[LUMA_64x48].addAvg = x265_addAvg_64x48_avx2;
+        p.pu[LUMA_64x64].addAvg = x265_addAvg_64x64_avx2;
+
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
         p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
         p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_avx2;
diff -r 479087422e29 -r 722ec5cd93ab source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Wed May 13 16:52:59 2015 -0700
+++ b/source/common/x86/const-a.asm	Thu May 14 11:14:31 2015 +0530
@@ -75,7 +75,7 @@
 const pw_256, times 16 dw 256
 const pw_257, times 16 dw 257
 const pw_512, times 16 dw 512
-const pw_1023, times 8 dw 1023
+const pw_1023, times 16 dw 1023
 const pw_1024, times 16 dw 1024
 const pw_4096, times 16 dw 4096
 const pw_00ff, times 16 dw 0x00ff
diff -r 479087422e29 -r 722ec5cd93ab source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Wed May 13 16:52:59 2015 -0700
+++ b/source/common/x86/mc-a.asm	Thu May 14 11:14:31 2015 +0530
@@ -1017,6 +1017,454 @@
 ADDAVG_W64_H1 32
 ADDAVG_W64_H1 48
 ADDAVG_W64_H1 64
+
+;------------------------------------------------------------------------------
+; avx2 asm for addAvg high_bit_depth
+;------------------------------------------------------------------------------
+INIT_YMM avx2
+%macro ADDAVG_W8_H4_AVX2 1
+cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4, [pw_512]
+    mova        m5, [pw_1023]
+    mova        m3, [pw_1024]
+    pxor        m1, m1
+    add         r3d, r3d
+    add         r4d, r4d
+    add         r5d, r5d
+    mov         r6d, %1/4
+
+.loop:
+    movu        m0, [r0]
+    vinserti128 m0, m0, [r0 + r3], 1
+    movu        m2, [r1]
+    vinserti128 m2, m2, [r1 + r4], 1
+
+    paddw       m0, m2
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m1
+    pminsw      m0, m5
+    vextracti128 xm2, m0, 1
+    movu        [r2], xm0
+    movu        [r2 + r5], xm2
+
+    lea         r2, [r2 + 2 * r5]
+    lea         r0, [r0 + 2 * r3]
+    lea         r1, [r1 + 2 * r4]
+
+    movu        m0, [r0]
+    vinserti128 m0, m0, [r0 + r3], 1
+    movu        m2, [r1]
+    vinserti128 m2, m2, [r1 + r4], 1
+
+    paddw       m0, m2
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m1
+    pminsw      m0, m5
+    vextracti128 xm2, m0, 1
+    movu        [r2], xm0
+    movu        [r2 + r5], xm2
+
+    lea         r2, [r2 + 2 * r5]
+    lea         r0, [r0 + 2 * r3]
+    lea         r1, [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W8_H4_AVX2 4
+ADDAVG_W8_H4_AVX2 8
+ADDAVG_W8_H4_AVX2 12
+ADDAVG_W8_H4_AVX2 16
+ADDAVG_W8_H4_AVX2 32
+ADDAVG_W8_H4_AVX2 64
+
+cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4, [pw_512]
+    mova        m5, [pw_1023]
+    mova        m3, [pw_1024]
+    pxor        m1, m1
+    add         r3, r3
+    add         r4, r4
+    add         r5, r5
+    mov         r6d, 4
+
+.loop:
+%rep 2
+    movu        m0, [r0]
+    movu        m2, [r1]
+    paddw       m0, m2
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m1
+    pminsw      m0, m5
+    vextracti128 xm2, m0, 1
+    movu        [r2], xm0
+    movq        [r2 + 16], xm2
+
+    movu        m0, [r0 + r3]
+    movu        m2, [r1 + r4]
+    paddw       m0, m2
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m1
+    pminsw      m0, m5
+    vextracti128 xm2, m0, 1
+    movu        [r2 + r5], xm0
+    movq        [r2 + r5 + 16], xm2
+
+    lea         r2, [r2 + 2 * r5]
+    lea         r0, [r0 + 2 * r3]
+    lea         r1, [r1 + 2 * r4]
+%endrep
+    dec         r6d
+    jnz         .loop
+    RET
+
+%macro ADDAVG_W16_H4_AVX2 1
+cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4, [pw_512]
+    mova        m5, [pw_1023]
+    mova        m3, [pw_1024]
+    pxor        m2, m2
+    add         r3, r3
+    add         r4, r4
+    add         r5, r5
+    mov         r6d, %1/4
+
+.loop:
+%rep 2
+    movu        m0, [r0]
+    movu        m1, [r1]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2], m0
+
+    movu        m0, [r0 + r3]
+    movu        m1, [r1 + r4]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + r5], m0
+
+    lea         r2, [r2 + 2 * r5]
+    lea         r0, [r0 + 2 * r3]
+    lea         r1, [r1 + 2 * r4]
+%endrep
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W16_H4_AVX2 4
+ADDAVG_W16_H4_AVX2 8
+ADDAVG_W16_H4_AVX2 12
+ADDAVG_W16_H4_AVX2 16
+ADDAVG_W16_H4_AVX2 24
+ADDAVG_W16_H4_AVX2 32
+ADDAVG_W16_H4_AVX2 64
+
+cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4, [pw_512]
+    mova        m5, [pw_1023]
+    mova        m3, [pw_1024]
+    pxor        m1, m1
+    add         r3, r3
+    add         r4, r4
+    add         r5, r5
+
+    mov         r6d, 16
+
+.loop:
+    movu        m0, [r0]
+    movu        m2, [r1]
+    paddw       m0, m2
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m1
+    pminsw      m0, m5
+    movu        [r2], m0
+
+    movu        xm0, [r0 + 32]
+    movu        xm2, [r1 + 32]
+    paddw       xm0, xm2
+    pmulhrsw    xm0, xm3
+    paddw       xm0, xm4
+    pmaxsw      xm0, xm1
+    pminsw      xm0, xm5
+    movu        [r2 + 32], xm0
+
+    movu        m0, [r0 + r3]
+    movu        m2, [r1 + r4]
+    paddw       m0, m2
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m1
+    pminsw      m0, m5
+    movu        [r2 + r5], m0
+
+    movu        xm2, [r0 + r3 + 32]
+    movu        xm0, [r1 + r4 + 32]
+    paddw       xm2, xm0
+    pmulhrsw    xm2, xm3
+    paddw       xm2, xm4
+    pmaxsw      xm2, xm1
+    pminsw      xm2, xm5
+    movu        [r2 + r5 + 32], xm2
+
+    lea         r2, [r2 + 2 * r5]
+    lea         r0, [r0 + 2 * r3]
+    lea         r1, [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+
+%macro ADDAVG_W32_H2_AVX2 1
+cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4, [pw_512]
+    mova        m5, [pw_1023]
+    mova        m3, [pw_1024]
+    pxor        m2, m2
+    add         r3, r3
+    add         r4, r4
+    add         r5, r5
+
+    mov         r6d, %1/2
+
+.loop:
+    movu        m0, [r0]
+    movu        m1, [r1]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2], m0
+
+    movu        m0, [r0 + 32]
+    movu        m1, [r1 + 32]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + 32], m0
+
+    movu        m0, [r0 + r3]
+    movu        m1, [r1 + r4]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + r5], m0
+
+    movu        m0, [r0 + r3 + 32]
+    movu        m1, [r1 + r4 + 32]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + r5 + 32], m0
+
+    lea         r2, [r2 + 2 * r5]
+    lea         r0, [r0 + 2 * r3]
+    lea         r1, [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W32_H2_AVX2 8
+ADDAVG_W32_H2_AVX2 16
+ADDAVG_W32_H2_AVX2 24
+ADDAVG_W32_H2_AVX2 32
+ADDAVG_W32_H2_AVX2 48
+ADDAVG_W32_H2_AVX2 64
+
+cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4, [pw_512]
+    mova        m5, [pw_1023]
+    mova        m3, [pw_1024]
+    pxor        m2, m2
+    add         r3, r3
+    add         r4, r4
+    add         r5, r5
+
+    mov         r6d, 32
+
+.loop:
+    movu        m0, [r0]
+    movu        m1, [r1]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2], m0
+
+    movu        m0, [r0 + 32]
+    movu        m1, [r1 + 32]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + 32], m0
+
+    movu        m0, [r0 + 64]
+    movu        m1, [r1 + 64]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + 64], m0
+
+    movu        m0, [r0 + r3]
+    movu        m1, [r1 + r4]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + r5], m0
+
+    movu        m0, [r0 + r3 + 32]
+    movu        m1, [r1 + r4 + 32]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + r5 + 32], m0
+
+    movu        m0, [r0 + r3 + 64]
+    movu        m1, [r1 + r4 + 64]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + r5 + 64], m0
+
+    lea         r2, [r2 + 2 * r5]
+    lea         r0, [r0 + 2 * r3]
+    lea         r1, [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+
+%macro ADDAVG_W64_H1_AVX2 1
+cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4, [pw_512]
+    mova        m5, [pw_1023]
+    mova        m3, [pw_1024]
+    pxor        m2, m2
+    add         r3d, r3d
+    add         r4d, r4d
+    add         r5d, r5d
+
+    mov         r6d, %1/2
+
+.loop:
+    movu        m0, [r0]
+    movu        m1, [r1]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2], m0
+
+    movu        m0, [r0 + 32]
+    movu        m1, [r1 + 32]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + 32], m0
+
+    movu        m0, [r0 + 64]
+    movu        m1, [r1 + 64]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + 64], m0
+
+    movu        m0, [r0 + 96]
+    movu        m1, [r1 + 96]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + 96], m0
+
+    movu        m0, [r0 + r3]
+    movu        m1, [r1 + r4]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + r5], m0
+
+    movu        m0, [r0 + r3 + 32]
+    movu        m1, [r1 + r4 + 32]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + r5 + 32], m0
+
+    movu        m0, [r0 + r3 + 64]
+    movu        m1, [r1 + r4 + 64]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + r5 + 64], m0
+
+    movu        m0, [r0 + r3 + 96]
+    movu        m1, [r1 + r4 + 96]
+    paddw       m0, m1
+    pmulhrsw    m0, m3
+    paddw       m0, m4
+    pmaxsw      m0, m2
+    pminsw      m0, m5
+    movu        [r2 + r5 + 96], m0
+
+    lea         r2, [r2 + 2 * r5]
+    lea         r0, [r0 + 2 * r3]
+    lea         r1, [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W64_H1_AVX2 16
+ADDAVG_W64_H1_AVX2 32
+ADDAVG_W64_H1_AVX2 48
+ADDAVG_W64_H1_AVX2 64
 ;-----------------------------------------------------------------------------
 %else ; !HIGH_BIT_DEPTH
 ;-----------------------------------------------------------------------------
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel