# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1439812025 -19800
#      Mon Aug 17 17:17:05 2015 +0530
# Node ID 43c9ec65927666db1316efe63d112bd8f9cb5f35
# Parent  8752daab2f07711c556dfffa9a733b7278484479
asm: avx2 asm for intra_ang32 mode 11, 4550c->1326c

diff -r 8752daab2f07 -r 43c9ec659276 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Aug 14 18:27:44 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Aug 17 17:17:05 2015 +0530
@@ -3027,6 +3027,7 @@
         p.cu[BLOCK_32x32].intra_pred[8] = PFX(intra_pred_ang32_8_avx2);
         p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx2);
         p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2);
+        p.cu[BLOCK_32x32].intra_pred[11] = PFX(intra_pred_ang32_11_avx2);
         p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
         p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
         p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r 8752daab2f07 -r 43c9ec659276 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Fri Aug 14 18:27:44 2015 +0530
+++ b/source/common/x86/intrapred8.asm Mon Aug 17 17:17:05 2015 +0530
@@ -440,6 +440,9 @@
 const angHor_tab_11,    db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
                         db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8), 8, (32- 6), 6, (32- 4), 4, (32- 2), 2, (32- 0), 0
 
+const ang32_shuf_mode11,    times 8 db 1, 2
+                            times 8 db 0, 1
+
 const ang_table
 %assign x 0
 %rep 32
@@ -13627,6 +13630,77 @@
     movu            [r0 + r4], m3
     RET
 
+;-----------------------------------------------------------------------------------------
+; ANG32_MODE11_ROW offset, dst
+;   %1 = byte offset (0..15) into the m6:m3 reference window
+;   %2 = memory operand that receives the finished row
+; Expects m0/m1 = the two halves of angHor_tab_11, m2 = pw_1024, m7 = ang32_shuf_mode11,
+; m3 = current 16 window bytes (same in both lanes), m6 = the 16 bytes that follow them.
+; Clobbers m4 and m5.
+;-----------------------------------------------------------------------------------------
+%macro ANG32_MODE11_ROW 2
+%if %1 == 0
+    pshufb          m5, m3, m7              ; offset 0: the window is m3 itself
+%else
+    palignr         m5, m6, m3, %1          ; per 128-bit lane: slide the m6:m3 window
+    pshufb          m5, m7                  ; low lane pairs bytes (1,2), high lane bytes (0,1)
+%endif
+    pmaddubsw       m4, m5, m0              ; weighted pair sums, table entries 30..16
+    pmaddubsw       m5, m1                  ; weighted pair sums, table entries 14..0
+    pmulhrsw        m4, m2                  ; (x + 16) >> 5, assuming pw_1024 holds words of 1024
+    pmulhrsw        m5, m2
+    packuswb        m4, m5                  ; words -> unsigned-saturated bytes, per lane
+    movu            %2, m4
+%endmacro
+
+;-----------------------------------------------------------------------------------------
+; intra_pred_ang32_11
+;   r0 = output pointer; rows are stored r1 bytes apart and r0 advances every 4 rows
+;   r1 = output row stride in bytes
+;   r2 = reference sample buffer (only read)
+;   r3 = 3 * r1, scratch
+; NOTE(review): relies on mmsize == 32 (ymm) being selected outside this hunk - confirm.
+;-----------------------------------------------------------------------------------------
+cglobal intra_pred_ang32_11, 3,4,8
+    vbroadcasti128  m0, [angHor_tab_11]
+    vbroadcasti128  m1, [angHor_tab_11 + mmsize/2]
+    mova            m2, [pw_1024]
+    mova            m7, [ang32_shuf_mode11]
+    lea             r3, [r1 * 3]
+
+    ; xm3 = 16 bytes at [r2 + mmsize*2 - 1], then byte 1 <- [r2 + 0] and byte 0 <- [r2 + 16]
+    movu            xm3, [r2 + mmsize*2 - 1]
+    vbroadcasti128  m6, [r2 + mmsize*2 + 15]
+
+    pinsrb          xm3, [r2 + 0], 1
+    pinsrb          xm3, [r2 + 16], 0
+    vinserti128     m3, m3, xm3, 1          ; copy the low lane into the high lane
+
+    ; 32 rows, expanded at assembly time into straight-line code (no run-time loop)
+%assign ang11_row 0
+%rep 32
+%if ang11_row == 16
+    mova            m3, m6                  ; rows 16..31 continue from the next 16 bytes
+    vbroadcasti128  m6, [r2 + mmsize*2 + 15 + 16]
+%endif
+%assign ang11_shift (ang11_row & 15)
+%assign ang11_slot (ang11_row & 3)
+%if ang11_slot == 0
+    ANG32_MODE11_ROW ang11_shift, [r0]
+%elif ang11_slot == 1
+    ANG32_MODE11_ROW ang11_shift, [r0 + r1]
+%elif ang11_slot == 2
+    ANG32_MODE11_ROW ang11_shift, [r0 + r1 * 2]
+%else
+    ANG32_MODE11_ROW ang11_shift, [r0 + r3]
+%if ang11_row != 31
+    lea             r0, [r0 + r1 * 4]       ; step to the next group of 4 rows
+%endif
+%endif
+%assign ang11_row ang11_row + 1
+%endrep
+    RET
+
 %endif ; ARCH_X86_64
 ;-----------------------------------------------------------------------------------------
 ; end of intra_pred_ang32 angular modes avx2 asm
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel