Same as the previous patch.
At 2015-02-25 18:24:43,[email protected] wrote: ># HG changeset patch ># User Praveen Tiwari <[email protected]> ># Date 1424859811 -19800 ># Node ID 3e4e3e2cafab08ff7fae37e0ad4fc8ed4d733656 ># Parent 177fe9372668b4824c291e967349664766688179 >asm-avx2: intra_pred_ang8_32, improved 435.21c -> 323.25c sse4 asm code > >intra_ang_8x8[32] 10.71x 323.25 3463.18 > >diff -r 177fe9372668 -r 3e4e3e2cafab source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Wed Feb 25 14:19:56 2015 +0530 >+++ b/source/common/x86/asm-primitives.cpp Wed Feb 25 15:53:31 2015 +0530 >@@ -1814,6 +1814,7 @@ > // intra_pred functions > p.cu[BLOCK_8x8].intra_pred[3] = x265_intra_pred_ang8_3_avx2; > p.cu[BLOCK_8x8].intra_pred[33] = x265_intra_pred_ang8_33_avx2; >+ p.cu[BLOCK_8x8].intra_pred[32] = x265_intra_pred_ang8_32_avx2; > } > } > #endif // if HIGH_BIT_DEPTH >diff -r 177fe9372668 -r 3e4e3e2cafab source/common/x86/intrapred.h >--- a/source/common/x86/intrapred.h Wed Feb 25 14:19:56 2015 +0530 >+++ b/source/common/x86/intrapred.h Wed Feb 25 15:53:31 2015 +0530 >@@ -159,6 +159,7 @@ > #undef DECL_ANG > void x265_intra_pred_ang8_3_avx2(pixel* dst, intptr_t dstStride, const pixel* > srcPix, int dirMode, int bFilter); > void x265_intra_pred_ang8_33_avx2(pixel* dst, intptr_t dstStride, const > pixel* srcPix, int dirMode, int bFilter); >+void x265_intra_pred_ang8_32_avx2(pixel* dst, intptr_t dstStride, const >pixel* srcPix, int dirMode, int bFilter); > void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, > int bLuma); > void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, > int bLuma); > void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel > *filtPix, int bLuma); >diff -r 177fe9372668 -r 3e4e3e2cafab source/common/x86/intrapred8.asm >--- a/source/common/x86/intrapred8.asm Wed Feb 25 14:19:56 2015 +0530 >+++ b/source/common/x86/intrapred8.asm Wed Feb 25 15:53:31 2015 +0530 >@@ -63,6 +63,14 @@ > c_ang8_src6_14_7_15: 
db 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, > 13, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 > c_ang8_22_16: db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, > 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 > >+c_ang8_21_10 : db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, >21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 >+c_ang8_src2_10_3_11: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 2, >3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 >+c_ang8_31_20: db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, >31, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 >+c_ang8_src4_12_4_12: times 2 db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, >10, 11 >+c_ang8_9_30: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, >9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 >+c_ang8_src5_13_6_14: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, >5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13 >+c_ang8_19_8: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, >19, 13, 19, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 >+ > ;; (blkSize - 1 - x) > pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0 > pw_planar4_1: dw 3, 3, 3, 3, 3, 3, 3, 3 >@@ -32123,3 +32131,38 @@ > movhps [r0 + 2 * r1], xm4 > movhps [r0 + r3], xm2 > RET >+ >+INIT_YMM avx2 >+cglobal intra_pred_ang8_32, 3,4,5 >+ movu m3, [pw_1024] >+ vbroadcasti128 m0, [r2 + 1] >+ >+ pshufb m1, m0, [c_ang8_src1_9_2_10] >+ pshufb m2, m0, [c_ang8_src2_10_3_11] >+ pshufb m4, m0, [c_ang8_src4_12_4_12] >+ pshufb m0, [c_ang8_src5_13_6_14] >+ >+ pmaddubsw m1, [c_ang8_21_10] >+ pmulhrsw m1, m3 >+ pmaddubsw m2, [c_ang8_31_20] >+ pmulhrsw m2, m3 >+ pmaddubsw m4, [c_ang8_9_30] >+ pmulhrsw m4, m3 >+ pmaddubsw m0, [c_ang8_19_8] >+ pmulhrsw m0, m3 >+ packuswb m1, m2 >+ packuswb m4, m0 >+ >+ lea r3, [3 * r1] >+ movq [r0], xm1 >+ vextracti128 xm2, m1, 1 >+ movq [r0 + r1], xm2 >+ movhps [r0 + 2 * r1], xm1 >+ movhps [r0 
+ r3], xm2 >+ lea r0, [r0 + 4 * r1] >+ movq [r0], xm4 >+ vextracti128 xm2, m4, 1 >+ movq [r0 + r1], xm2 >+ movhps [r0 + 2 * r1], xm4 >+ movhps [r0 + r3], xm2 >+ RET >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
