# HG changeset patch
# User Rajesh Paulraj<raj...@multicorewareinc.com>
# Date 1438846738 -19800
#      Thu Aug 06 13:08:58 2015 +0530
# Node ID e5d57775bbef81e37bc028c27d61a0a20e64bc9e
# Parent  4078c3fa7b2a362cdab1b1ea54e13a29ae0ef4f2
asm: new algorithm for intra_ang_16 modes 4 & 32, more than 15% faster than the previous avx2 code
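The kernels below vectorize the standard HEVC two-tap angular interpolation: each
predicted pixel is pred[x] = ((32 - frac) * ref[idx + x + 1] + frac * ref[idx + x + 2] + 16) >> 5.
Modes 4 and 32 share the same displacement angle (+21), which is why one helper
(ang16_mode_4_32) can now serve both; for y = 0..15, frac cycles through
21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, matching the bracketed
constants in the new code, and the byte pairs in the coefficient tables
(11, 21 and so on) are exactly the (32 - frac, frac) weights consumed by pmaddubsw.
A minimal scalar sketch of the filter being vectorized, for reference only
(predAngularRow, refPix and dst are illustrative names, not the x265 API):

#include <cstdint>

// Scalar reference for one row of HEVC angular intra prediction.
// refPix points at the reference sample row, dst at the output row.
static void predAngularRow(uint8_t* dst, const uint8_t* refPix,
                           int width, int intraPredAngle, int y)
{
    int pos  = (y + 1) * intraPredAngle;
    int idx  = pos >> 5;   // integer step into the reference row
    int frac = pos & 31;   // 5-bit fractional weight

    for (int x = 0; x < width; x++)
        dst[x] = (uint8_t)(((32 - frac) * refPix[x + idx + 1]
                          + frac        * refPix[x + idx + 2] + 16) >> 5);
}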
diff -r 4078c3fa7b2a -r e5d57775bbef source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Thu Aug 06 13:02:22 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Thu Aug 06 13:08:58 2015 +0530
@@ -271,29 +271,6 @@
                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

 ALIGN 32
-c_ang16_mode_32:  db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-                  db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                  db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                  db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                  db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                  db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                  db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                  db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                  db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                  db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                  db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-
-ALIGN 32
-c_ang16_mode_4:   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                  db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                  db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
-                  db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                  db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                  db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                  db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                  db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-
-ALIGN 32
 c_ang16_mode_24:  db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
                   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
@@ -13727,123 +13704,117 @@
     call            ang16_mode_3_33
     RET
+
+cglobal ang16_mode_4_32
+    ; rows 0 to 7
+    movu            m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+    movu            m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
+
+    punpckhbw       m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
+    punpcklbw       m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
+    vextracti128    xm1, m0, 1
+    vperm2i128      m0, m0, m2, 0x20            ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
+    vperm2i128      m2, m2, m1, 0x20            ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
+
+    pmaddubsw       m4, m0, [r3 + 5 * 32]       ; [21]
+    pmulhrsw        m4, m7
+
+    palignr         m1, m2, m0, 2
+    pmaddubsw       m5, m1, [r3 - 6 * 32]       ; [10]
+    pmulhrsw        m5, m7
+
+    palignr         m8, m2, m0, 4
+    pmaddubsw       m6, m1, [r3 + 15 * 32]      ; [31]
+    pmulhrsw        m6, m7
+    pmaddubsw       m8, [r3 + 4 * 32]           ; [20]
+    pmulhrsw        m8, m7
+
+    palignr         m10, m2, m0, 6
+    pmaddubsw       m9, m10, [r3 - 7 * 32]      ; [9]
+    pmulhrsw        m9, m7
+    pmaddubsw       m10, [r3 + 14 * 32]         ; [30]
+    pmulhrsw        m10, m7
+
+    palignr         m11, m2, m0, 8
+    palignr         m1, m2, m0, 10
+    pmaddubsw       m11, [r3 + 3 * 32]          ; [19]
+    pmulhrsw        m11, m7
+    pmaddubsw       m12, m1, [r3 - 8 * 32]      ; [8]
+    pmulhrsw        m12, m7
+
+    ; rows 8 to 15
+    pmaddubsw       m3, m1, [r3 + 13 * 32]      ; [29]
+    pmulhrsw        m3, m7
+    packuswb        m4, m3
+
+    palignr         m3, m2, m0, 12
+    pmaddubsw       m3, m3, [r3 + 2 * 32]       ; [18]
+    pmulhrsw        m3, m7
+    packuswb        m5, m3
+
+    palignr         m1, m2, m0, 14
+    pmaddubsw       m3, m1, [r3 - 9 * 32]       ; [7]
+    pmulhrsw        m3, m7
+    packuswb        m6, m3
+
+    pmaddubsw       m3, m1, [r3 + 12 * 32]      ; [28]
+    pmulhrsw        m3, m7
+    packuswb        m8, m3
+
+    palignr         m3, m2, m0, 16
+    pmaddubsw       m3, [r3 + 1 * 32]           ; [17]
+    pmulhrsw        m3, m7
+    packuswb        m9, m3
+
+    movu            xm0, [r2 + 25]
+    movu            xm1, [r2 + 26]
+    punpcklbw       m0, m1
+    mova            m1, m2
+    vinserti128     m1, m1, xm0, 0
+    vpermq          m1, m1, 01001110b
+
+    palignr         m0, m1, m2, 2
+    pmaddubsw       m3, m0, [r3 - 10 * 32]      ; [6]
+    pmulhrsw        m3, m7
+    packuswb        m10, m3
+
+    pmaddubsw       m3, m0, [r3 + 11 * 32]      ; [27]
+    pmulhrsw        m3, m7
+    packuswb        m11, m3
+
+    palignr         m1, m1, m2, 4
+    pmaddubsw       m1, [r3]                    ; [16]
+    pmulhrsw        m1, m7
+    packuswb        m12, m1
+
+    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
+    ret
+
+INIT_YMM avx2
+cglobal intra_pred_ang16_4, 3, 7, 13
+    add             r2, 32
+    lea             r3, [ang_table_avx2 + 16 * 32]
+    lea             r5, [r1 * 3]                ; r5 -> 3 * stride
+    lea             r6, [r1 * 4]                ; r6 -> 4 * stride
+    mova            m7, [pw_1024]
+    clc
+
+    call            ang16_mode_4_32
+    RET
+
+INIT_YMM avx2
+cglobal intra_pred_ang16_32, 3, 7, 13
+    lea             r3, [ang_table_avx2 + 16 * 32]
+    lea             r5, [r1 * 3]                ; r5 -> 3 * stride
+    lea             r6, [r1 * 4]                ; r6 -> 4 * stride
+    mova            m7, [pw_1024]
+    stc
+
+    call            ang16_mode_4_32
+    RET
 %endif ; ARCH_X86_64

 INIT_YMM avx2
-cglobal intra_pred_ang16_4, 3, 6, 12
-    mova            m11, [pw_1024]
-    lea             r5, [intra_pred_shuff_0_8]
-
-    movu            xm9, [r2 + 1 + 32]
-    pshufb          xm9, [r5]
-    movu            xm10, [r2 + 9 + 32]
-    pshufb          xm10, [r5]
-
-    movu            xm7, [r2 + 6 + 32]
-    pshufb          xm7, [r5]
-    vinserti128     m9, m9, xm7, 1
-
-    movu            xm8, [r2 + 14 + 32]
-    pshufb          xm8, [r5]
-    vinserti128     m10, m10, xm8, 1
-
-    lea             r3, [3 * r1]
-    lea             r4, [c_ang16_mode_4]
-
-    INTRA_PRED_ANG16_CAL_ROW m0, m1, 0
-
-    movu            xm9, [r2 + 2 + 32]
-    pshufb          xm9, [r5]
-    movu            xm10, [r2 + 10 + 32]
-    pshufb          xm10, [r5]
-
-    movu            xm7, [r2 + 7 + 32]
-    pshufb          xm7, [r5]
-    vinserti128     m9, m9, xm7, 1
-
-    movu            xm8, [r2 + 15 + 32]
-    pshufb          xm8, [r5]
-    vinserti128     m10, m10, xm8, 1
-
-    INTRA_PRED_ANG16_CAL_ROW m1, m2, 1
-
-    movu            xm7, [r2 + 8 + 32]
-    pshufb          xm7, [r5]
-    vinserti128     m9, m9, xm7, 1
-
-    movu            xm8, [r2 + 16 + 32]
-    pshufb          xm8, [r5]
-    vinserti128     m10, m10, xm8, 1
-
-    INTRA_PRED_ANG16_CAL_ROW m2, m3, 2
-
-    movu            xm7, [r2 + 3 + 32]
-    pshufb          xm7, [r5]
-    vinserti128     m9, m9, xm7, 0
-
-    movu            xm8, [r2 + 11 + 32]
-    pshufb          xm8, [r5]
-    vinserti128     m10, m10, xm8, 0
-
-    INTRA_PRED_ANG16_CAL_ROW m3, m4, 3
-
-    add             r4, 4 * mmsize
-
-    movu            xm9, [r2 + 4 + 32]
-    pshufb          xm9, [r5]
-    movu            xm10, [r2 + 12 + 32]
-    pshufb          xm10, [r5]
-
-    movu            xm7, [r2 + 9 + 32]
-    pshufb          xm7, [r5]
-    vinserti128     m9, m9, xm7, 1
-
-    movu            xm8, [r2 + 17 + 32]
-    pshufb          xm8, [r5]
-    vinserti128     m10, m10, xm8, 1
-
-    INTRA_PRED_ANG16_CAL_ROW m4, m5, 0
-
-    movu            xm7, [r2 + 10 + 32]
-    pshufb          xm7, [r5]
-    vinserti128     m9, m9, xm7, 1
-
-    movu            xm8, [r2 + 18 + 32]
-    pshufb          xm8, [r5]
-    vinserti128     m10, m10, xm8, 1
-
-    INTRA_PRED_ANG16_CAL_ROW m5, m6, 1
-
-    movu            xm7, [r2 + 5 + 32]
-    pshufb          xm7, [r5]
-    vinserti128     m9, m9, xm7, 0
-
-    movu            xm8, [r2 + 13 + 32]
-    pshufb          xm8, [r5]
-    vinserti128     m10, m10, xm8, 0
-
-    INTRA_PRED_ANG16_CAL_ROW m6, m7, 2
-
-    movu            xm9, [r2 + 6 + 32]
-    pshufb          xm9, [r5]
-    movu            xm10, [r2 + 14 + 32]
-    pshufb          xm10, [r5]
-
-    movu            xm7, [r2 + 11 + 32]
-    pshufb          xm7, [r5]
-    vinserti128     m9, m9, xm7, 1
-
-    movu            xm8, [r2 + 19 + 32]
-    pshufb          xm8, [r5]
-    vinserti128     m10, m10, xm8, 1
-
-    INTRA_PRED_ANG16_CAL_ROW m7, m8, 3
-
-    ; transpose and store
-    INTRA_PRED_TRANS_STORE_16x16
-    RET
-
-INIT_YMM avx2
 cglobal intra_pred_ang16_5, 3, 6, 12
     mova            m11, [pw_1024]
     lea             r5, [intra_pred_shuff_0_8]
@@ -14382,51 +14353,6 @@
     RET

 INIT_YMM avx2
-cglobal intra_pred_ang16_32, 3, 5, 6
-    mova            m0, [pw_1024]
-    mova            m5, [intra_pred_shuff_0_8]
-    lea             r3, [3 * r1]
-    lea             r4, [c_ang16_mode_32]
-
-    INTRA_PRED_ANG16_MC2 1
-    INTRA_PRED_ANG16_MC3 r0, 0
-
-    INTRA_PRED_ANG16_MC2 2
-    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
-
-    INTRA_PRED_ANG16_MC2 3
-    INTRA_PRED_ANG16_MC3 r0 + r3, 2
-
-    INTRA_PRED_ANG16_MC2 4
-    lea             r0, [r0 + 4 * r1]
-    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3
-
-    INTRA_PRED_ANG16_MC2 5
-
-    add             r4, 4 * mmsize
-    INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 0
-
-    INTRA_PRED_ANG16_MC2 6
-    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 1
-    INTRA_PRED_ANG16_MC2 7
-
-    lea             r0, [r0 + 4 * r1]
-    INTRA_PRED_ANG16_MC3 r0 + r1, 2
-    INTRA_PRED_ANG16_MC2 8
-    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3
-    INTRA_PRED_ANG16_MC2 9
-
-    lea             r0, [r0 + 4 * r1]
-    add             r4, 4 * mmsize
-
-    INTRA_PRED_ANG16_MC3 r0, 0
-    INTRA_PRED_ANG16_MC2 10
-    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
-    INTRA_PRED_ANG16_MC2 11
-    INTRA_PRED_ANG16_MC3 r0 + r3, 2
-    RET
-
-INIT_YMM avx2
 cglobal intra_pred_ang16_24, 3, 5, 6
     mova            m0, [pw_1024]
     mova            m5, [intra_pred_shuff_0_8]
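A note on the rounding step: pmulhrsw computes (a * b * 2 + 0x8000) >> 16 per
16-bit lane, so multiplying the pmaddubsw sums by pw_1024 is exactly the
(+16) >> 5 rounding required by the angular filter. The clc/stc executed before
the shared call appears to select, via the carry flag, between the transposed
store needed for mode 4 and the direct store for mode 32 inside
TRANSPOSE_STORE_8x32 (an inference from the shared-helper pattern here, not a
statement about the macro body). A self-contained check of the rounding
identity; the helper name pmulhrsw_lane is illustrative:

#include <cassert>
#include <cstdint>

// Emulates one 16-bit lane of pmulhrsw.
static int16_t pmulhrsw_lane(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * b * 2 + 0x8000) >> 16);
}

int main()
{
    // A pmaddubsw sum here is at most 255 * (32 - frac) + 255 * frac = 8160.
    for (int a = 0; a <= 8160; a++)
        assert(pmulhrsw_lane((int16_t)a, 1024) == ((a + 16) >> 5));
    return 0;
}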