# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1435213462 -19800
#      Thu Jun 25 11:54:22 2015 +0530
# Node ID f43aa44673dcd8e96581c938cf22ad4bbb7657e3
# Parent  31da07b7198ca730bae37577d5053a3337477f7b
asm: 10bpp AVX2 code for saoCuOrgE1_2Rows, improved 900c->614c over SSE

diff -r 31da07b7198c -r f43aa44673dc source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jun 25 11:49:07 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jun 25 11:54:22 2015 +0530
@@ -1286,6 +1286,7 @@
     {
         p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
         p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
+        p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
 
         p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2);
         p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2);
diff -r 31da07b7198c -r f43aa44673dc source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Thu Jun 25 11:49:07 2015 +0530
+++ b/source/common/x86/loopfilter.asm Thu Jun 25 11:54:22 2015 +0530
@@ -728,6 +728,71 @@
 %endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+;------------------------------------------------------------------------------------------------------------
+; saoCuOrgE1_2Rows, HIGH_BIT_DEPTH (16-bit sample) AVX2 path: adjusts two sample rows per call.
+;   r0 = pRec         rows at r0 and r0 + stride are rewritten in place; r0 + 2 * stride is only read
+;   r1 = m_iUpBuff1   16 int8 signs per iteration: read for row 0, then overwritten with -sign(row1 - row2)
+;   r2 = m_iOffsetEo  16-byte int8 table, broadcast to both lanes and indexed through pshufb
+;   r3 = iStride      doubled on entry; presumably given in samples, i.e. turned into a byte stride
+;   r4 = iLcuWidth    divided by 16 for the trip count; assumes a non-zero multiple of 16 (not checked here)
+; Argument names are taken from the 8-bit cglobal declaration of the same function below.
+;------------------------------------------------------------------------------------------------------------
+cglobal saoCuOrgE1_2Rows, 4,5,8
+    add             r3d, r3d                    ; stride *= 2 (samples are words)
+    mov             r4d, r4m                    ; 5th argument is not auto-loaded by cglobal (only 4 requested)
+    mova            m4, [pw_1023]               ; m4 = upper clamp bound (presumably words of 1023, 10-bit max)
+    vbroadcasti128  m6, [r2]                    ; m6 = m_iOffsetEo
+    shr             r4d, 4                      ; r4 = number of 16-column iterations
+.loop:
+    movu            m7, [r0]                    ; m7 = row 0, 16 samples
+    movu            m5, [r0 + r3]               ; m5 = row 1
+    movu            m1, [r0 + r3 * 2]           ; m1 = row 2 (read only)
+
+    pcmpgtw         m2, m7, m5                  ; m2 = row0 > row1
+    pcmpgtw         m3, m5, m7                  ; m3 = row1 > row0
+    pcmpgtw         m0, m5, m1                  ; m0 = row1 > row2
+    pcmpgtw         m1, m5                      ; m1 = row2 > row1
+
+    packsswb        m2, m0                      ; word masks -> byte masks, interleaved per 128-bit lane
+    packsswb        m3, m1
+    vpermq          m2, m2, 11011000b           ; qword order 0,2,1,3: low lane = row0/row1, high = row1/row2
+    vpermq          m3, m3, 11011000b
+
+    pand            m2, [pb_1]                  ; "greater" mask 0xFF -> +1
+    por             m2, m3                      ; m2 = [sign(row0 - row1) | sign(row1 - row2)] as int8
+
+    movu            xm3, [r1]                   ; m3 = m_iUpBuff1
+    pxor            m0, m0                      ; m0 = 0, reused below as the lower clamp bound
+    psubb           m1, m0, m2                  ; m1 = -m2
+    vinserti128     m3, m3, xm1, 1              ; high lane = -sign(row0 - row1): the "up" sign for row 1
+    vextracti128    [r1], m1, 1                 ; m_iUpBuff1 = -sign(row1 - row2)
+
+    paddb           m3, m2                      ; up sign + down sign, range -2..2
+    paddb           m3, [pb_2]                  ; bias to a table index 0..4
+
+    pshufb          m1, m6, m3                  ; per-column offset lookup (low lane row 0, high lane row 1)
+    pmovsxbw        m3, xm1                     ; row 0 offsets, sign-extended to words
+    vextracti128    xm1, m1, 1
+    pmovsxbw        m1, xm1                     ; row 1 offsets, sign-extended to words
+
+    paddw           m7, m3
+    paddw           m5, m1
+
+    pmaxsw          m7, m0                      ; clamp both rows to [0, m4]
+    pmaxsw          m5, m0
+    pminsw          m7, m4
+    pminsw          m5, m4
+
+    movu            [r0], m7
+    movu            [r0 + r3], m5
+
+    add             r0, 32                      ; 16 samples * 2 bytes
+    add             r1, 16                      ; 16 int8 signs
+    dec             r4d
+    jnz             .loop
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE1_2Rows, 3, 5, 7, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth
     mov         r3d,        r3m
     mov         r4d,        r4m
@@ -775,6 +840,7 @@
     dec         r4d
     jnz         .loop
     RET
+%endif
 
 ;======================================================================================================================================================
 ; void saoCuOrgE2(pixel * rec, 
int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel