On 03/16, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <[email protected]>
> # Date 1426539636 25200
> # Node ID 117fb09221983c5f50988741168a216d35e3581a
> # Parent d33fc159951225e42889071ef3d877d23f693197
> asm: improve sad[32x32] 10% by unroll loop
queued for testing

> ---
>  source/common/x86/sad-a.asm | 25 ++++++++++++++++++-------
>  1 files changed, 18 insertions(+), 7 deletions(-)
>
> diff -r d33fc1599512 -r 117fb0922198 source/common/x86/sad-a.asm
> --- a/source/common/x86/sad-a.asm Mon Mar 16 12:00:42 2015 -0700
> +++ b/source/common/x86/sad-a.asm Mon Mar 16 14:00:36 2015 -0700
> @@ -3898,9 +3898,11 @@
>      RET
>
>  INIT_YMM avx2
> -cglobal pixel_sad_32x32, 4,5,5
> +cglobal pixel_sad_32x32, 4,7,5
>      xorps m0, m0
> -    mov r4d, 16
> +    mov r4d, 32/4
> +    lea r5, [r1 * 3]
> +    lea r6, [r3 * 3]
>
>  .loop
>      movu m1, [r0] ; row 0 of pix0
> @@ -3913,11 +3915,21 @@
>      paddd m0, m1
>      paddd m0, m3
>
> -    lea r2, [r2 + 2 * r3]
> -    lea r0, [r0 + 2 * r1]
> -
> -    dec r4d
> -    jnz .loop
> +    movu m1, [r0 + 2 * r1] ; row 2 of pix0
> +    movu m2, [r2 + 2 * r3] ; row 2 of pix1
> +    movu m3, [r0 + r5] ; row 3 of pix0
> +    movu m4, [r2 + r6] ; row 3 of pix1
> +
> +    psadbw m1, m2
> +    psadbw m3, m4
> +    paddd m0, m1
> +    paddd m0, m3
> +
> +    lea r2, [r2 + 4 * r3]
> +    lea r0, [r0 + 4 * r1]
> +
> +    dec r4d
> +    jnz .loop
>
>      vextracti128 xm1, m0, 1
>      paddd xm0, xm1
> @@ -3926,5 +3938,4 @@
>      movd eax, xm0
>      RET
>
> -
>  %endif
>
> _______________________________________________
> x265-devel mailing list
> [email protected]
> https://mailman.videolan.org/listinfo/x265-devel

--
Steve Borho
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel
