# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1435212794 -19800
#      Thu Jun 25 11:43:14 2015 +0530
# Node ID faec09e1ab60531924f2d919d4f283fa91bfec81
# Parent  b1af4c36f48a4500a4912373ebcda9a5540b5c15
asm: 10bpp AVX2 code for saoCuOrgE0, improved 974c->690c over SSE

diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Jun 25 11:43:14 2015 +0530
@@ -1284,6 +1284,8 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
+
         p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2);
         p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2);
         p.cu[BLOCK_16x16].intra_pred[4] = PFX(intra_pred_ang16_4_avx2);
diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/const-a.asm Thu Jun 25 11:43:14 2015 +0530
@@ -41,7 +41,7 @@
 const pb_16, times 32 db 16
 const pb_32, times 32 db 32
 const pb_64, times 32 db 64
-const pb_128, times 16 db 128
+const pb_128, times 32 db 128
 const pb_a1, times 16 db 0xa1
 
 const pb_01, times 8 db 0, 1
diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/loopfilter.asm Thu Jun 25 11:43:14 2015 +0530
@@ -235,6 +235,67 @@
 %endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+cglobal saoCuOrgE0, 4,4,9                   ; args as in the 8-bit version below: r0 = rec, r1 = offsetEo, r2 = lcuWidth, r3 = signLeft, r4m = stride
+    vbroadcasti128  m6, [r1]                ; m6 = 16 bytes at offsetEo copied into both lanes (pshufb table)
+    movzx           r1d, byte [r3]
+    neg             r1b
+    movd            xm0, r1d                ; xm0 byte 0 = -signLeft[0], remaining bytes zero
+    movzx           r1d, byte [r3 + 1]
+    neg             r1b
+    movd            xm1, r1d                ; xm1 byte 0 = -signLeft[1], remaining bytes zero
+    vinserti128     m0, m0, xm1, 1          ; low lane carries the first row, high lane the second row
+    mova            m5, [pw_1023]           ; upper clamp used by pminsw below
+    mov             r1d, r4m                ; stride is a 32-bit argument: dword move, as the 8-bit version does
+    add             r1d, r1d                ; doubled stride = byte offset of the second row (pixels are 16-bit words here)
+    shr             r2d, 4                  ; pass count = lcuWidth / 16 (each pass handles 16 pixels of two rows)
+
+.loop:
+    movu            m7, [r0]                ; first row, 16 pixels
+    movu            m8, [r0 + r1]           ; second row, 16 pixels
+    movu            m2, [r0 + 2]            ; first row, right neighbours
+    movu            m1, [r0 + r1 + 2]       ; second row, right neighbours
+
+    pcmpgtw         m3, m7, m2              ; first row: cur > right
+    pcmpgtw         m2, m7                  ; first row: right > cur
+    pcmpgtw         m4, m8, m1              ; second row: cur > right
+    pcmpgtw         m1, m8                  ; second row: right > cur
+
+    packsswb        m3, m4                  ; word masks -> byte masks, two rows interleaved per lane
+    packsswb        m2, m1
+    vpermq          m3, m3, 11011000b       ; regroup qwords: first row in low lane, second row in high lane
+    vpermq          m2, m2, 11011000b
+
+    pand            m3, [pb_1]              ; keep one bit of the "cur > right" mask (pb_1 presumably bytes of 1)
+    por             m3, m2                  ; 0xFF where right > cur: m3 = per-pixel sign(cur - right)
+
+    pslldq          m2, m3, 1               ; shift each lane up one byte: pixel i receives the sign of pixel i-1
+    por             m2, m0                  ; byte 0 of each lane = negated left sign carried in m0
+
+    psignb          m2, [pb_128]            ; m2 = signLeft (pb_128 bytes are negative, so every byte is negated)
+    pxor            m0, m0
+    palignr         m0, m3, 15              ; m0 byte 0 per lane = sign of pixel 15, carried into the next pass
+    paddb           m3, m2
+    paddb           m3, [pb_2]              ; m3 = uiEdgeType
+    pshufb          m2, m6, m3              ; table lookup: offsetEo[uiEdgeType] per pixel
+    pmovsxbw        m3, xm2                 ; offsetEo
+    vextracti128    xm2, m2, 1
+    pmovsxbw        m2, xm2                 ; second-row offsets sign-extended to words
+    pxor            m4, m4                  ; lower clamp = 0
+    paddw           m7, m3
+    paddw           m8, m2
+    pmaxsw          m7, m4
+    pmaxsw          m8, m4
+    pminsw          m7, m5
+    pminsw          m8, m5
+    movu            [r0], m7
+    movu            [r0 + r1], m8
+
+    add             r0q, 32                 ; advance 16 pixels (32 bytes)
+    dec             r2d
+    jnz             .loop
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
 
     mov             r4d, r4m
@@ -287,6 +348,7 @@
     sub             r2d, 16
     jnz             .loop
     RET
+%endif
 
 ;==================================================================================================
 ; void saoCuOrgE1(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, Int iStride, Int iLcuWidth)
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel