> ;-----------------------------------------------------------------------------
>+; void pixel_sub_ps_c_2x4(int16_t *dest, intptr_t destride, pixel *src0,
>pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
>+;-----------------------------------------------------------------------------
>+INIT_XMM sse4
>+cglobal pixel_sub_ps_2x4, 6, 7, 2, dest, deststride, src0, src1, srcstride0,
>srcstride1
>+
>+add r1, r1
>+
>+movd m0, [r2]
>+movd m1, [r2 + r4]
>+movd m2, [r2 + 2 * r4]

I'm not too worried about small-block performance, but if you use the code below, it is shorter and faster:

movd m0, [r2]
movhps m0, [r2 + r4]

>+
>+movd m3, [r3]
>+movd m4, [r3 + r5]
>+movd m5, [r3 + 2 * r5]
>+
>+lea r2, [r2 + 2 * r4]
>+lea r3, [r3 + 2 * r5]
>+
>+movd m6, [r2 + r4]
>+movd m7, [r3 + r5]
>+
>+pmovzxbw m0, m0
>+pmovzxbw m1, m1
>+pmovzxbw m2, m2
>+pmovzxbw m3, m3
>+pmovzxbw m4, m4
>+pmovzxbw m5, m5
>+pmovzxbw m6, m6
>+pmovzxbw m7, m7
>+
>+psubw m0, m3
>+psubw m1, m4
>+psubw m2, m5
>+psubw m6, m7

With the code above, only half as many pmovzxbw and psubw instructions are needed here.

>+movd [r0], m0
>+movd [r0 + r1], m1
>+movd [r0 + 2* r1], m2
>+
>+lea r0, [r0 + 2 * r1]
>+
>+movd [r0 + r1], m6
>+
>+RET
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel
