On Wed, Feb 5, 2014 at 3:33 AM, <[email protected]> wrote:
> # HG changeset patch
> # User Yuvaraj Venkatesh <[email protected]>
> # Date 1391592757 -19800
> # Wed Feb 05 15:02:37 2014 +0530
> # Node ID b14a8528c478bf3068ed95aeef68c050014785cd
> # Parent 1374f1168c5cbb97a893172e37bd9f5c6ed5690c
> asm: modified satd and sad asm functions in 16bpp to avoid overflow
>

This patch causes 8bpp testbench failures:

steve@zeppelin> ./test/TestBench ~/repos/x265/build/linux
Using random seed 52F28F46 8bpp
Testing primitives: SSE2
Testing primitives: SSE3
Testing primitives: SSSE3
sa8d_inter[12x16]: failed!

>
> diff -r 1374f1168c5c -r b14a8528c478 source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Wed Feb 05 12:59:57 2014 +0530
> +++ b/source/common/x86/pixel-a.asm Wed Feb 05 15:02:37 2014 +0530
> @@ -511,7 +511,7 @@
> %endif
> %endmacro
>
> -%macro SATD_4x8_SSE 3
> +%macro SATD_4x8_SSE 3-4
> %if HIGH_BIT_DEPTH
> movh m0, [r0+0*r1]
> movh m4, [r2+0*r3]
> @@ -577,7 +577,11 @@
> DIFFOP 2, 6, 3, 5, 7
> %endif
> %endif ; HIGH_BIT_DEPTH
> +%if %0 == 4
> + SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4
> +%else
> SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
> +%endif
> %endmacro
>
>
>
> ;-----------------------------------------------------------------------------
> @@ -2391,56 +2395,66 @@
> SATD_START_MMX
> mov r6, r0
> mov r7, r2
> + pxor m7, m7
> %if vertical==0
> mova m7, [hmul_4p]
> %endif
> - SATD_4x8_SSE vertical, 0, swap
> + SATD_4x8_SSE vertical, 0, 4, 5
> lea r0, [r0 + r1*2*SIZEOF_PIXEL]
> lea r2, [r2 + r3*2*SIZEOF_PIXEL]
> - SATD_4x8_SSE vertical, 1, add
> + SATD_4x8_SSE vertical, 1, 4, 5
> lea r0, [r6 + 4*SIZEOF_PIXEL]
> lea r2, [r7 + 4*SIZEOF_PIXEL]
> - SATD_4x8_SSE vertical, 1, add
> + SATD_4x8_SSE vertical, 1, 4, 5
> lea r0, [r0 + r1*2*SIZEOF_PIXEL]
> lea r2, [r2 + r3*2*SIZEOF_PIXEL]
> - SATD_4x8_SSE vertical, 1, add
> + SATD_4x8_SSE vertical, 1, 4, 5
> lea r0, [r6 + 8*SIZEOF_PIXEL]
> lea r2, [r7 + 8*SIZEOF_PIXEL]
> - SATD_4x8_SSE vertical, 1, add
> + SATD_4x8_SSE vertical, 1, 4, 5
> lea r0, [r0 + r1*2*SIZEOF_PIXEL]
> lea r2, [r2 + r3*2*SIZEOF_PIXEL]
> - SATD_4x8_SSE vertical, 1, add
> - HADDW m7, m1
> - movd eax, m7
> + SATD_4x8_SSE vertical, 1, 4, 5
> + pxor m1, m1
> + movhlps m1, m7
> + paddd m7, m1
> + pshufd m1, m7, 1
> + paddd m7, m1
> + movd eax, m7
> RET
> %else
> cglobal pixel_satd_12x16, 4,7,8,0-gprsize
> SATD_START_MMX
> mov r6, r0
> mov [rsp], r2
> + pxor m7, m7
> %if vertical==0
> mova m7, [hmul_4p]
> %endif
> - SATD_4x8_SSE vertical, 0, swap
> + SATD_4x8_SSE vertical, 0, 4, 5
> lea r0, [r0 + r1*2*SIZEOF_PIXEL]
> lea r2, [r2 + r3*2*SIZEOF_PIXEL]
> - SATD_4x8_SSE vertical, 1, add
> + SATD_4x8_SSE vertical, 1, 4, 5
> lea r0, [r6 + 4*SIZEOF_PIXEL]
> mov r2, [rsp]
> add r2, 4*SIZEOF_PIXEL
> - SATD_4x8_SSE vertical, 1, add
> + SATD_4x8_SSE vertical, 1, 4, 5
> lea r0, [r0 + r1*2*SIZEOF_PIXEL]
> lea r2, [r2 + r3*2*SIZEOF_PIXEL]
> - SATD_4x8_SSE vertical, 1, add
> + SATD_4x8_SSE vertical, 1, 4, 5
> lea r0, [r6 + 8*SIZEOF_PIXEL]
> mov r2, [rsp]
> add r2, 8*SIZEOF_PIXEL
> - SATD_4x8_SSE vertical, 1, add
> + SATD_4x8_SSE vertical, 1, 4, 5
> lea r0, [r0 + r1*2*SIZEOF_PIXEL]
> lea r2, [r2 + r3*2*SIZEOF_PIXEL]
> - SATD_4x8_SSE vertical, 1, add
> - HADDW m7, m1
> - movd eax, m7
> + SATD_4x8_SSE vertical, 1, 4, 5
> + pxor m1, m1
> + movhlps m1, m7
> + paddd m7, m1
> + pshufd m1, m7, 1
> + paddd m7, m1
> + movd eax, m7
> RET
> %endif
>
> diff -r 1374f1168c5c -r b14a8528c478 source/common/x86/sad16-a.asm
> --- a/source/common/x86/sad16-a.asm Wed Feb 05 12:59:57 2014 +0530
> +++ b/source/common/x86/sad16-a.asm Wed Feb 05 15:02:37 2014 +0530
> @@ -274,9 +274,10 @@
> lea r0, [r0+4*r1]
> lea r2, [r2+4*r3]
> ABSW2 m3, m4, m3, m4, m7, m5
> - paddd m1, m2
> - paddd m3, m4
> - paddd m0, m1
> + paddw m1, m2
> + paddw m3, m4
> + paddw m3, m1
> + pmaddwd m3, [pw_1]
> paddd m0, m3
> %else
> movu m1, [r2]
> @@ -286,8 +287,9 @@
> ABSW2 m1, m2, m1, m2, m3, m4
> lea r0, [r0+4*r1]
> lea r2, [r2+4*r3]
> - paddw m0, m1
> - paddw m0, m2
> + paddw m2, m1
> + pmaddwd m2, [pw_1]
> + paddd m0, m2
> %endif
> %endmacro
>
> @@ -308,7 +310,7 @@
> jg .loop
> %endif
>
> - HADDW m0, m1
> + HADDD m0, m1
> movd eax, xm0
> RET
> %endmacro
> _______________________________________________
> x265-devel mailing list
> [email protected]
> https://mailman.videolan.org/listinfo/x265-devel
>

--
Steve Borho
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
