# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1435578547 -19800
#      Mon Jun 29 17:19:07 2015 +0530
# Node ID 60832369ebb4e1014b4080b27a0401f97af93958
# Parent  9feee64efa440c25f016d15ae982789e5393a77e
asm: intra_filter 10bpp sse4 code

Performance improved over C code:
intra_filter_32x32 7.46x 525.64 3922.56
intra_filter_16x16 6.53x 289.11 1886.86
intra_filter_8x8 5.60x 170.75 956.81
intra_filter_4x4 3.05x 121.20 369.74

diff -r 9feee64efa44 -r 60832369ebb4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Jun 26 15:29:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jun 29 17:19:07 2015 +0530
@@ -1120,6 +1120,11 @@
         ALL_LUMA_PU(satd, pixel_satd, sse4);
         ASSIGN_SA8D(sse4);
 
+        p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
+        p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
+        p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
+        p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
+
         ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
         INTRA_ANG_SSE4_COMMON(sse4);
diff -r 9feee64efa44 -r 60832369ebb4 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Fri Jun 26 15:29:51 2015 +0530
+++ b/source/common/x86/intrapred16.asm Mon Jun 29 17:19:07 2015 +0530
@@ -75,6 +75,9 @@
 const pw_ang16_13, db 14, 15, 8, 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 const pw_ang16_16, db 0, 0, 0, 0, 0, 0, 10, 11, 8, 9, 6, 7, 2, 3, 0, 1
 
+intra_filter4_shuf0: db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13  ; pshufb control, word lanes low-first: [1 0 1 2 3 4 5 6]; used by all four block sizes below
+intra_filter4_shuf1: db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13  ; pshufb control, word lanes low-first: [7 0 1 2 3 4 5 6]; used only by intra_filter_4x4
+
 ;; (blkSize - 1 - x)
 pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0
 
@@ -21634,3 +21637,413 @@
     dec         r4
     jnz         .loop
     RET
+
+;-----------------------------------------------------------------------------------
+; void intra_filter_NxN(const pixel* references, pixel* filtered)
+;-----------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_filter_4x4, 2,4,5                  ; f[i] = (s[i-1] + 2*s[i] + s[i+1] + 2) >> 2 on 16-bit words; s[] is read via r0, f[] written via r1 (presumably the two C arguments through x86inc's cglobal, which is defined outside this patch)
+    mov         r2w, word [r0 + 16]              ; topLast  = s[8], saved so it can be stored back unfiltered
+    mov         r3w, word [r0 + 32]              ; leftLast = s[16], saved so it can be stored back unfiltered
+
+    ; filtering top: produces f[0..7]
+    movu        m0, [r0 + 0]                     ; m0 = s[0..7]
+    movu        m1, [r0 + 16]                    ; m1 = s[8..15]
+    movu        m2, [r0 + 32]                    ; m2 = s[16..23]; NOTE(review): only s[16] reaches a surviving output lane, so the rest of this 16-byte load is assumed readable - confirm the reference buffer is padded
+
+    pshufb      m4, m0, [intra_filter4_shuf0]    ; [6 5 4 3 2 1 0 1] (high lane first) = samples[i - 1]; lane 0 (s[0]) takes s[1] as one neighbour
+    palignr     m3, m1, m0, 4                    ; m3 = s[2..9]
+    pshufb      m3, [intra_filter4_shuf1]        ; [8 7 6 5 4 3 2 9] (high lane first) = samples[i + 1]; lane 0 (s[0]) takes s[9] as its other neighbour
+
+    psllw       m0, 1                            ; 2 * s[i]
+    paddw       m4, m3                           ; s[i - 1] + s[i + 1]
+    paddw       m0, m4
+    paddw       m0, [pw_2]                       ; rounding term; pw_2 is defined outside this patch (presumably eight words of 2)
+    psrlw       m0, 2                            ; m0 = f[0..7]
+
+    ; filtering left: produces f[9..15]; the lane for f[8] is computed too but replaced by topLast below
+    palignr     m4, m1, m1, 14                   ; rotate m1 up by one word: [s15 s8 s9 .. s14] (low lane first)
+    pinsrw      m4, [r0], 1                      ; lane 1 = s[0], so s[9] is filtered against s[0] instead of s[8]
+    palignr     m3, m2, m1, 4                    ; m3 = s[10..17]
+    pshufb      m3, [intra_filter4_shuf1]        ; [s17 s10 s11 .. s16] (low lane first) = samples[i + 1] for lanes 1..7
+
+    psllw       m1, 1                            ; 2 * s[i]
+    paddw       m4, m3                           ; s[i - 1] + s[i + 1]
+    paddw       m1, m4
+    paddw       m1, [pw_2]
+    psrlw       m1, 2                            ; m1 = f[8..15]
+
+    movu        [r1], m0                         ; store f[0..7]
+    movu        [r1 + 16], m1                    ; store f[8..15]
+    mov         [r1 + 16], r2w                   ; f[8]  = topLast, overwriting the lane stored just above
+    mov         [r1 + 32], r3w                   ; f[16] = leftLast
+    RET
+
+INIT_XMM sse4
+cglobal intra_filter_8x8, 2,4,6                  ; same (prev + 2*cur + next + 2) >> 2 filter; top part is words 0..15, left part is words 17..31, words 16 and 32 are copied
+    mov         r2w, word [r0 + 32]              ; topLast  = s[16]
+    mov         r3w, word [r0 + 64]              ; leftLast = s[32]
+
+    ; filtering top: produces f[0..15]
+    movu        m0, [r0]                         ; m0 = s[0..7]
+    movu        m1, [r0 + 16]                    ; m1 = s[8..15]
+    movu        m2, [r0 + 32]                    ; m2 = s[16..23]
+
+    pshufb      m4, m0, [intra_filter4_shuf0]    ; [s1 s0 s1 .. s6] (low lane first) = samples[i - 1]; lane 0 takes s[1]
+    palignr     m5, m1, m0, 2                    ; m5 = s[1..8] = samples[i + 1]
+    pinsrw      m5, [r0 + 34], 0                 ; lane 0 = s[17], the other neighbour used for s[0]
+
+    palignr     m3, m1, m0, 14                   ; m3 = s[7..14], taken before m0 is overwritten
+    psllw       m0, 1
+    paddw       m4, m5
+    paddw       m0, m4
+    paddw       m0, [pw_2]                       ; rounding term; pw_2 is defined outside this patch (presumably eight words of 2)
+    psrlw       m0, 2                            ; m0 = f[0..7]
+
+    palignr     m4, m2, m1, 2                    ; m4 = s[9..16]
+    psllw       m1, 1
+    paddw       m4, m3
+    paddw       m1, m4
+    paddw       m1, [pw_2]
+    psrlw       m1, 2                            ; m1 = f[8..15]
+    movu        [r1], m0
+    movu        [r1 + 16], m1
+
+    ; filtering left: produces f[17..31]; the lane for f[16] is replaced by topLast below
+    movu        m1, [r0 + 48]                    ; m1 = s[24..31]
+    movu        m0, [r0 + 64]                    ; m0 = s[32..39]; NOTE(review): only s[32] is consumed, the rest of the load is assumed readable - confirm buffer padding
+
+    palignr     m4, m2, m2, 14                   ; rotate m2 up by one word: [s23 s16 s17 .. s22] (low lane first)
+    pinsrw      m4, [r0], 1                      ; lane 1 = s[0], so s[17] is filtered against s[0] instead of s[16]
+    palignr     m5, m1, m2, 2                    ; m5 = s[17..24]
+
+    palignr     m3, m1, m2, 14                   ; m3 = s[23..30]
+    palignr     m0, m1, 2                        ; m0 = s[25..32]
+
+    psllw       m2, 1
+    paddw       m4, m5
+    paddw       m2, m4
+    paddw       m2, [pw_2]
+    psrlw       m2, 2                            ; m2 = f[16..23]
+
+    psllw       m1, 1
+    paddw       m0, m3
+    paddw       m1, m0
+    paddw       m1, [pw_2]
+    psrlw       m1, 2                            ; m1 = f[24..31]
+
+    movu        [r1 + 32], m2
+    movu        [r1 + 48], m1
+    mov         [r1 + 32], r2w                   ; f[16] = topLast, overwriting the lane stored just above
+    mov         [r1 + 64], r3w                   ; f[32] = leftLast
+    RET
+
+INIT_XMM sse4
+cglobal intra_filter_16x16, 2,4,6                ; same filter; top part is words 0..31, left part is words 33..63, words 32 and 64 are copied
+    mov         r2w, word [r0 + 64]              ; topLast  = s[32]
+    mov         r3w, word [r0 + 128]             ; leftLast = s[64]
+
+    ; filtering top: produces f[0..31]
+    movu        m0, [r0]                         ; m0 = s[0..7]
+    movu        m1, [r0 + 16]                    ; m1 = s[8..15]
+    movu        m2, [r0 + 32]                    ; m2 = s[16..23]
+
+    pshufb      m4, m0, [intra_filter4_shuf0]    ; [s1 s0 s1 .. s6] (low lane first) = samples[i - 1]; lane 0 takes s[1]
+    palignr     m5, m1, m0, 2                    ; m5 = s[1..8]
+    pinsrw      m5, [r0 + 66], 0                 ; lane 0 = s[33], the other neighbour used for s[0]
+
+    palignr     m3, m1, m0, 14                   ; m3 = s[7..14]
+    psllw       m0, 1
+    paddw       m4, m5
+    paddw       m0, m4
+    paddw       m0, [pw_2]                       ; rounding term; pw_2 is defined outside this patch (presumably eight words of 2)
+    psrlw       m0, 2                            ; m0 = f[0..7]
+
+    palignr     m4, m2, m1, 2                    ; m4 = s[9..16]
+    psllw       m5, m1, 1                        ; m1 itself is kept: it is still needed as a neighbour source below
+    paddw       m4, m3
+    paddw       m5, m4
+    paddw       m5, [pw_2]
+    psrlw       m5, 2                            ; m5 = f[8..15]
+    movu        [r1], m0
+    movu        [r1 + 16], m5
+
+    movu        m0, [r0 + 48]                    ; m0 = s[24..31]
+    movu        m5, [r0 + 64]                    ; m5 = s[32..39]
+
+    palignr     m3, m2, m1, 14                   ; m3 = s[15..22]
+    palignr     m4, m0, m2, 2                    ; m4 = s[17..24]
+
+    psllw       m1, m2, 1
+    paddw       m3, m4
+    paddw       m1, m3
+    paddw       m1, [pw_2]
+    psrlw       m1, 2                            ; m1 = f[16..23]
+
+    palignr     m3, m0, m2, 14                   ; m3 = s[23..30]
+    palignr     m4, m5, m0, 2                    ; m4 = s[25..32]
+
+    psllw       m0, 1
+    paddw       m4, m3
+    paddw       m0, m4
+    paddw       m0, [pw_2]
+    psrlw       m0, 2                            ; m0 = f[24..31]
+    movu        [r1 + 32], m1
+    movu        [r1 + 48], m0
+
+    ; filtering left: produces f[33..63]; the lane for f[32] is replaced by topLast below
+    movu        m1, [r0 + 80]                    ; m1 = s[40..47]
+    movu        m2, [r0 + 96]                    ; m2 = s[48..55]
+
+    palignr     m4, m5, m5, 14                   ; rotate m5 up by one word: [s39 s32 s33 .. s38] (low lane first)
+    pinsrw      m4, [r0], 1                      ; lane 1 = s[0], so s[33] is filtered against s[0] instead of s[32]
+    palignr     m0, m1, m5, 2                    ; m0 = s[33..40]
+
+    psllw       m3, m5, 1
+    paddw       m4, m0
+    paddw       m3, m4
+    paddw       m3, [pw_2]
+    psrlw       m3, 2                            ; m3 = f[32..39]
+
+    palignr     m0, m1, m5, 14                   ; m0 = s[39..46]
+    palignr     m4, m2, m1, 2                    ; m4 = s[41..48]
+
+    psllw       m5, m1, 1
+    paddw       m4, m0
+    paddw       m5, m4
+    paddw       m5, [pw_2]
+    psrlw       m5, 2                            ; m5 = f[40..47]
+    movu        [r1 + 64], m3
+    movu        [r1 + 80], m5
+
+    movu        m5, [r0 + 112]                   ; m5 = s[56..63]
+    movu        m0, [r0 + 128]                   ; m0 = s[64..71]; NOTE(review): only s[64] is consumed, the rest of the load is assumed readable - confirm buffer padding
+
+    palignr     m3, m2, m1, 14                   ; m3 = s[47..54]
+    palignr     m4, m5, m2, 2                    ; m4 = s[49..56]
+
+    psllw       m1, m2, 1
+    paddw       m3, m4
+    paddw       m1, m3
+    paddw       m1, [pw_2]
+    psrlw       m1, 2                            ; m1 = f[48..55]
+
+    palignr     m3, m5, m2, 14                   ; m3 = s[55..62]
+    palignr     m4, m0, m5, 2                    ; m4 = s[57..64]
+
+    psllw       m5, 1
+    paddw       m4, m3
+    paddw       m5, m4
+    paddw       m5, [pw_2]
+    psrlw       m5, 2                            ; m5 = f[56..63]
+    movu        [r1 + 96], m1
+    movu        [r1 + 112], m5
+
+    mov         [r1 + 64], r2w                   ; f[32] = topLast, overwriting the lane stored earlier
+    mov         [r1 + 128], r3w                  ; f[64] = leftLast
+    RET
+
+INIT_XMM sse4
+cglobal intra_filter_32x32, 2,4,6                ; same filter; top part is words 0..63, left part is words 65..127, words 64 and 128 are copied
+    mov         r2w, word [r0 + 128]             ; topLast  = s[64]
+    mov         r3w, word [r0 + 256]             ; leftLast = s[128]
+
+    ; filtering top: produces f[0..63]
+    ; words 0 to 15
+    movu        m0, [r0 + 0]                     ; m0 = s[0..7]
+    movu        m1, [r0 + 16]                    ; m1 = s[8..15]
+    movu        m2, [r0 + 32]                    ; m2 = s[16..23]
+
+    pshufb      m4, m0, [intra_filter4_shuf0]    ; [s1 s0 s1 .. s6] (low lane first) = samples[i - 1]; lane 0 takes s[1]
+    palignr     m5, m1, m0, 2                    ; m5 = s[1..8]
+    pinsrw      m5, [r0 + 130], 0                ; lane 0 = s[65], the other neighbour used for s[0]
+
+    palignr     m3, m1, m0, 14                   ; m3 = s[7..14]
+    psllw       m0, 1
+    paddw       m4, m5
+    paddw       m0, m4
+    paddw       m0, [pw_2]                       ; rounding term; pw_2 is defined outside this patch (presumably eight words of 2)
+    psrlw       m0, 2                            ; m0 = f[0..7]
+
+    palignr     m4, m2, m1, 2                    ; m4 = s[9..16]
+    psllw       m5, m1, 1
+    paddw       m4, m3
+    paddw       m5, m4
+    paddw       m5, [pw_2]
+    psrlw       m5, 2                            ; m5 = f[8..15]
+    movu        [r1], m0
+    movu        [r1 + 16], m5
+
+    ; words 16 to 31
+    movu        m0, [r0 + 48]                    ; m0 = s[24..31]
+    movu        m5, [r0 + 64]                    ; m5 = s[32..39]
+
+    palignr     m3, m2, m1, 14                   ; m3 = s[15..22]
+    palignr     m4, m0, m2, 2                    ; m4 = s[17..24]
+
+    psllw       m1, m2, 1
+    paddw       m3, m4
+    paddw       m1, m3
+    paddw       m1, [pw_2]
+    psrlw       m1, 2                            ; m1 = f[16..23]
+
+    palignr     m3, m0, m2, 14                   ; m3 = s[23..30]
+    palignr     m4, m5, m0, 2                    ; m4 = s[25..32]
+
+    psllw       m2, m0, 1
+    paddw       m4, m3
+    paddw       m2, m4
+    paddw       m2, [pw_2]
+    psrlw       m2, 2                            ; m2 = f[24..31]
+    movu        [r1 + 32], m1
+    movu        [r1 + 48], m2
+
+    ; words 32 to 47
+    movu        m1, [r0 + 80]                    ; m1 = s[40..47]
+    movu        m2, [r0 + 96]                    ; m2 = s[48..55]
+
+    palignr     m3, m5, m0, 14                   ; m3 = s[31..38]
+    palignr     m4, m1, m5, 2                    ; m4 = s[33..40]
+
+    psllw       m0, m5, 1
+    paddw       m3, m4
+    paddw       m0, m3
+    paddw       m0, [pw_2]
+    psrlw       m0, 2                            ; m0 = f[32..39]
+
+    palignr     m3, m1, m5, 14                   ; m3 = s[39..46]
+    palignr     m4, m2, m1, 2                    ; m4 = s[41..48]
+
+    psllw       m5, m1, 1
+    paddw       m4, m3
+    paddw       m5, m4
+    paddw       m5, [pw_2]
+    psrlw       m5, 2                            ; m5 = f[40..47]
+    movu        [r1 + 64], m0
+    movu        [r1 + 80], m5
+
+    ; words 48 to 63
+    movu        m0, [r0 + 112]                   ; m0 = s[56..63]
+    movu        m5, [r0 + 128]                   ; m5 = s[64..71]
+
+    palignr     m3, m2, m1, 14                   ; m3 = s[47..54]
+    palignr     m4, m0, m2, 2                    ; m4 = s[49..56]
+
+    psllw       m1, m2, 1
+    paddw       m3, m4
+    paddw       m1, m3
+    paddw       m1, [pw_2]
+    psrlw       m1, 2                            ; m1 = f[48..55]
+
+    palignr     m3, m0, m2, 14                   ; m3 = s[55..62]
+    palignr     m4, m5, m0, 2                    ; m4 = s[57..64]
+
+    psllw       m0, 1
+    paddw       m4, m3
+    paddw       m0, m4
+    paddw       m0, [pw_2]
+    psrlw       m0, 2                            ; m0 = f[56..63]
+    movu        [r1 + 96], m1
+    movu        [r1 + 112], m0
+
+    ; filtering left: produces f[65..127]; the lane for f[64] is replaced by topLast below
+    ; words 64 to 79
+    movu        m1, [r0 + 144]                   ; m1 = s[72..79]
+    movu        m2, [r0 + 160]                   ; m2 = s[80..87]
+
+    palignr     m4, m5, m5, 14                   ; rotate m5 up by one word: [s71 s64 s65 .. s70] (low lane first)
+    pinsrw      m4, [r0], 1                      ; lane 1 = s[0], so s[65] is filtered against s[0] instead of s[64]
+    palignr     m0, m1, m5, 2                    ; m0 = s[65..72]
+
+    psllw       m3, m5, 1
+    paddw       m4, m0
+    paddw       m3, m4
+    paddw       m3, [pw_2]
+    psrlw       m3, 2                            ; m3 = f[64..71]
+
+    palignr     m0, m1, m5, 14                   ; m0 = s[71..78]
+    palignr     m4, m2, m1, 2                    ; m4 = s[73..80]
+
+    psllw       m5, m1, 1
+    paddw       m4, m0
+    paddw       m5, m4
+    paddw       m5, [pw_2]
+    psrlw       m5, 2                            ; m5 = f[72..79]
+    movu        [r1 + 128], m3
+    movu        [r1 + 144], m5
+
+    ; words 80 to 95
+    movu        m5, [r0 + 176]                   ; m5 = s[88..95]
+    movu        m0, [r0 + 192]                   ; m0 = s[96..103]
+
+    palignr     m3, m2, m1, 14                   ; m3 = s[79..86]
+    palignr     m4, m5, m2, 2                    ; m4 = s[81..88]
+
+    psllw       m1, m2, 1
+    paddw       m3, m4
+    paddw       m1, m3
+    paddw       m1, [pw_2]
+    psrlw       m1, 2                            ; m1 = f[80..87]
+
+    palignr     m3, m5, m2, 14                   ; m3 = s[87..94]
+    palignr     m4, m0, m5, 2                    ; m4 = s[89..96]
+
+    psllw       m2, m5, 1
+    paddw       m4, m3
+    paddw       m2, m4
+    paddw       m2, [pw_2]
+    psrlw       m2, 2                            ; m2 = f[88..95]
+    movu        [r1 + 160], m1
+    movu        [r1 + 176], m2
+
+    ; words 96 to 111
+    movu        m1, [r0 + 208]                   ; m1 = s[104..111]
+    movu        m2, [r0 + 224]                   ; m2 = s[112..119]
+
+    palignr     m3, m0, m5, 14                   ; m3 = s[95..102]
+    palignr     m4, m1, m0, 2                    ; m4 = s[97..104]
+
+    psllw       m5, m0, 1
+    paddw       m3, m4
+    paddw       m5, m3
+    paddw       m5, [pw_2]
+    psrlw       m5, 2                            ; m5 = f[96..103]
+
+    palignr     m3, m1, m0, 14                   ; m3 = s[103..110]
+    palignr     m4, m2, m1, 2                    ; m4 = s[105..112]
+
+    psllw       m0, m1, 1
+    paddw       m4, m3
+    paddw       m0, m4
+    paddw       m0, [pw_2]
+    psrlw       m0, 2                            ; m0 = f[104..111]
+    movu        [r1 + 192], m5
+    movu        [r1 + 208], m0
+
+    ; words 112 to 127
+    movu        m5, [r0 + 240]                   ; m5 = s[120..127]
+    movu        m0, [r0 + 256]                   ; m0 = s[128..135]; NOTE(review): only s[128] is consumed, the rest of the load is assumed readable - confirm buffer padding
+
+    palignr     m3, m2, m1, 14                   ; m3 = s[111..118]
+    palignr     m4, m5, m2, 2                    ; m4 = s[113..120]
+
+    psllw       m1, m2, 1
+    paddw       m3, m4
+    paddw       m1, m3
+    paddw       m1, [pw_2]
+    psrlw       m1, 2                            ; m1 = f[112..119]
+
+    palignr     m3, m5, m2, 14                   ; m3 = s[119..126]
+    palignr     m4, m0, m5, 2                    ; m4 = s[121..128]
+
+    psllw       m5, 1
+    paddw       m4, m3
+    paddw       m5, m4
+    paddw       m5, [pw_2]
+    psrlw       m5, 2                            ; m5 = f[120..127]
+    movu        [r1 + 224], m1
+    movu        [r1 + 240], m5
+
+    mov         [r1 + 128], r2w                  ; f[64]  = topLast, overwriting the lane stored earlier
+    mov         [r1 + 256], r3w                  ; f[128] = leftLast
+    RET
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel