> ;-----------------------------------------------------------------------------
>+; void pixel_sub_ps_c_2x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
>+;
>+; Start of the 2x4 kernel: declares the function and loads src0 rows 0-2.
>+; Register roles (x86inc maps the named arguments to r0..r5 in order):
>+;   r0 = dest, r1 = deststride, r2 = src0, r3 = src1,
>+;   r4 = srcstride0, r5 = srcstride1
>+; cglobal arguments: 6 args, 6 GPRs (only r0-r5 are touched), 8 XMM registers.
>+; The routine writes m0-m7; x86inc only saves the Win64 callee-saved
>+; xmm6/xmm7 when the declared XMM count covers them, so the count must be 8.
>+;-----------------------------------------------------------------------------
>+INIT_XMM sse4
>+cglobal pixel_sub_ps_2x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
>+
>+add         r1,    r1                  ; double deststride (dest is int16_t per the prototype: elements -> bytes)
>+
>+movd        m0,    [r2]                ; src0 row 0 (4-byte load)
>+movd        m1,    [r2 + r4]           ; src0 row 1
>+movd        m2,    [r2 + 2 * r4]       ; src0 row 2
I'm not too worried about small-block performance, but if you use the code
below, it is shorter and faster:
movd m0, [r2]
movhps m0, [r2 + r4]
 
>+
>+; Each row: load 4 bytes of src1, zero-extend both operands from bytes to
>+; words, then subtract (src0 - src1). m0-m2 already hold src0 rows 0-2.
>+
>+; row 0 -> m0
>+movd        m3,    [r3]
>+pmovzxbw    m0,    m0
>+pmovzxbw    m3,    m3
>+psubw       m0,    m3
>+
>+; row 1 -> m1
>+movd        m4,    [r3 + r5]
>+pmovzxbw    m1,    m1
>+pmovzxbw    m4,    m4
>+psubw       m1,    m4
>+
>+; row 2 -> m2
>+movd        m5,    [r3 + 2 * r5]
>+pmovzxbw    m2,    m2
>+pmovzxbw    m5,    m5
>+psubw       m2,    m5
>+
>+; advance both source pointers by two rows so row 3 is reachable as base + stride
>+lea         r2,    [r2 + 2 * r4]
>+lea         r3,    [r3 + 2 * r5]
>+
>+; row 3 -> m6
>+movd        m6,    [r2 + r4]
>+movd        m7,    [r3 + r5]
>+pmovzxbw    m6,    m6
>+pmovzxbw    m7,    m7
>+psubw       m6,    m7
With the code above, only half as many pmovzxbw and psubw instructions are needed here.

>+; Store the low 4 bytes (two 16-bit differences) of each row result.
>+movd    [r0],            m0        ; row 0
>+movd    [r0 + r1],       m1        ; row 1
>+
>+lea     r0,              [r0 + 2 * r1]   ; r0 -> row 2 of dest
>+
>+movd    [r0],            m2        ; row 2
>+movd    [r0 + r1],       m6        ; row 3
>+
>+RET
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

Reply via email to