Please ignore this patch.
On Fri, Nov 22, 2013 at 4:02 PM, <[email protected]> wrote: > # HG changeset patch > # User Nabajit Deka > # Date 1385116295 -19800 > # Fri Nov 22 16:01:35 2013 +0530 > # Node ID 18fd67c0d27291012b65bf9c48c675d09b5db1f3 > # Parent 5009254d3d3ac92e90b1551444c5eb32ba2f8d31 > asm : routine for weightUnidirPixel(), for input width in multiples of 16. > > diff -r 5009254d3d3a -r 18fd67c0d272 source/common/x86/pixel-util.asm > --- a/source/common/x86/pixel-util.asm Fri Nov 22 00:17:46 2013 -0600 > +++ b/source/common/x86/pixel-util.asm Fri Nov 22 16:01:35 2013 +0530 > @@ -670,3 +670,81 @@ > movd eax, m7 > > RET > + > > +;------------------------------------------------------------------------------------------------------------------------------------------------------- > +;void weightUnidirPixel(pixel *src, pixel *dst, intptr_t srcStride, > intptr_t dstStride, int width, int height, int w0, int round, int shift, > int offset) > > +;------------------------------------------------------------------------------------------------------------------------------------------------------- > +INIT_XMM sse4 > +cglobal weightUnidirPixel, 6, 7, 6 > + > + mov r6d, r6m > + shl r6d, 6 > + movd m0, r6d ; m0 = [w0<<6] > + > + movd m1, r7m ; m1 = [round] > + punpcklwd m0, m1 ; assuming both (w0<<6) and round are > using maximum of 16 bits each. > + pshufd m0, m0, 0 ; m0 = [w0<<6 round] > + > + movd m1, r8m > + > + movd m2, r9m > + pshufd m2, m2, 0 > + > + mova m5, [tab_c_1] > + > + sub r2d, r4d > + sub r3d, r4d > + > +.loopH > + mov r6d, r4d > + shr r6d, 4 > +.loopW: > + movh m4, [r0] > + pmovzxbw m4, m4 > + > + punpcklwd m3, m4, m5 > + pmaddwd m3, m0 > + psrad m3, m1 > + paddd m3, m2 > + > + punpckhwd m4, m5 > + pmaddwd m4, m0 > + psrad m4, m1 > + paddd m4, m2 > + > + packssdw m3, m4 > + packuswb m3, m3 > + > + movh [r1], m3 > + > + movh m4, [r0 + 8] > + pmovzxbw m4, m4 > + > + punpcklwd m3, m4, m5 > + pmaddwd m3, m0 > + psrad m3, m1 > + paddd m3, m2 > + > + punpckhwd m4, m5 > + pmaddwd m4, m0 > + psrad m4, m1 > + paddd m4, m2 > + > + packssdw m3, m4 > + packuswb m3, m3 > + > + movh [r1 + 8], m3 > + > + add r0, 16 > + add r1, 16 > + > + dec r6d > + jnz .loopW > + > + lea r0, [r0 + r2] > + lea r1, [r1 + r3] > + > + dec r5d > + jnz .loopH > + > + RET >
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
