On Tue, Oct 29, 2013 at 6:15 AM, <[email protected]> wrote:
> # HG changeset patch
> # User Yuvaraj Venkatesh <[email protected]>
> # Date 1383044811 -19800
> #      Tue Oct 29 16:36:51 2013 +0530
> # Node ID fc35a117efd17270eb15aa56aad7cc90bb7bdd35
> # Parent  e2f512dbd2424d099d9984c72bfc7d0729be25fe
> assembly code for pixel_sad_x3_32xN

When you mark patches as review only, it would be helpful if you described
why you believe the patch needs review or why it is unfinished.

> diff -r e2f512dbd242 -r fc35a117efd1 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Mon Oct 28 16:13:05 2013 +0530
> +++ b/source/common/x86/asm-primitives.cpp Tue Oct 29 16:36:51 2013 +0530
> @@ -280,6 +280,11 @@
>          p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
>          p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ssse3;
>          p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ssse3;
> +        p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ssse3;
> +        p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ssse3;
> +        p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ssse3;
> +        p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ssse3;
> +        p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ssse3;
>      }
>      if (cpuMask & X265_CPU_SSE4)
>      {
> @@ -310,6 +315,11 @@
>          p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx;
>          p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_avx;
>          p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_avx;
> +        p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_avx;
> +        p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_avx;
> +        p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_avx;
> +        p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_avx;
> +        p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_avx;
>      }
>      if (cpuMask & X265_CPU_XOP)
>      {
> diff -r e2f512dbd242 -r fc35a117efd1 source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h Mon Oct 28 16:13:05 2013 +0530
> +++ b/source/common/x86/pixel.h Tue Oct 29 16:36:51 2013 +0530
> @@ -29,6 +29,11 @@
>  #define X265_I386_PIXEL_H
>
>  #define DECL_PIXELS(ret, name, suffix, args) \
> +    ret x265_pixel_ ## name ## _32x64_ ## suffix args; \
> +    ret x265_pixel_ ## name ## _32x32_ ## suffix args; \
> +    ret x265_pixel_ ## name ## _32x24_ ## suffix args; \
> +    ret x265_pixel_ ## name ## _32x16_ ## suffix args; \
> +    ret x265_pixel_ ## name ## _32x8_ ## suffix args; \
>      ret x265_pixel_ ## name ## _16x64_ ## suffix args; \
>      ret x265_pixel_ ## name ## _16x32_ ## suffix args; \
>      ret x265_pixel_ ## name ## _16x16_ ## suffix args; \
> diff -r e2f512dbd242 -r fc35a117efd1 source/common/x86/sad-a.asm
> --- a/source/common/x86/sad-a.asm Mon Oct 28 16:13:05 2013 +0530
> +++ b/source/common/x86/sad-a.asm Tue Oct 29 16:36:51 2013 +0530
> @@ -1007,19 +1007,30 @@
>  ; SAD x3/x4 XMM
>  ;=============================================================================
>
> -%macro SAD_X3_START_1x16P_SSE2 0
> -    mova m2, [r0]
> +%macro SAD_X3_START_1x16P_SSE2 1
> +    mova m3, [r0 + %1]
> +%if %1 == 0
> +    pxor m0, m0
> +    pxor m1, m1
> +    pxor m2, m2
> +%endif
>  %if cpuflag(avx)
> -    psadbw m0, m2, [r1]
> -    psadbw m1, m2, [r2]
> -    psadbw m2, [r3]
> +    psadbw m4, m3, [r1 + %1]
> +    psadbw m5, m3, [r2 + %1]
> +    psadbw m3, [r3 + %1]
> +    paddd m0, m4
> +    paddd m1, m5
> +    paddd m2, m3
>  %else
> -    movu m0, [r1]
> -    movu m1, [r2]
> -    movu m3, [r3]
> -    psadbw m0, m2
> -    psadbw m1, m2
> -    psadbw m2, m3
> +    movu m4, [r1 + %1]
> +    movu m5, [r2 + %1]
> +    movu m6, [r3 + %1]
> +    psadbw m4, m3
> +    psadbw m5, m3
> +    psadbw m6, m3
> +    paddd m0, m4
> +    paddd m1, m5
> +    paddd m2, m6
>  %endif
>  %endmacro
>
> @@ -1051,7 +1062,7 @@
>  %macro SAD_X3_4x16P_SSE2 2
>  %if %1==0
>      lea t0, [r4*3]
> -    SAD_X3_START_1x16P_SSE2
> +    SAD_X3_START_1x16P_SSE2 0
>  %else
>      SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
>  %endif
> @@ -1068,6 +1079,30 @@
>  %endif
>  %endmacro
>
> +%macro SAD_X3_4x32P_SSE2 2
> +%assign y 0
> +%rep 2
> +%if %1==0
> +    lea t0, [r4+r4*2]
> +    SAD_X3_START_1x16P_SSE2 y
> +%else
> +    SAD_X3_1x16P_SSE2 (FENC_STRIDE*(0+(%1&1)*4) + y), (r4*0 + y)
> +%endif
> +    SAD_X3_1x16P_SSE2 (FENC_STRIDE*(1+(%1&1)*4) + y), (r4*1 + y)
> +    SAD_X3_1x16P_SSE2 (FENC_STRIDE*(2+(%1&1)*4) + y), (r4*2 + y)
> +    SAD_X3_1x16P_SSE2 (FENC_STRIDE*(3+(%1&1)*4) + y), (t0 + y)
> +%assign y y+16
> +%endrep
> +%if %1 != %2-1
> +%if (%1&1) != 0
> +    add r0, 8*FENC_STRIDE
> +%endif
> +    lea r1, [r1+4*r4]
> +    lea r2, [r2+4*r4]
> +    lea r3, [r3+4*r4]
> +%endif
> +%endmacro
> +
>  %macro SAD_X3_START_2x8P_SSE2 0
>      movq m3, [r0]
>      movq m0, [r1]
> @@ -1506,7 +1541,7 @@
>      SAD_X%1_4x%2P_SSE2 x, %3/4
>  %assign x x+1
>  %endrep
> -%if %3 == 64
> +%if %3 >= 24
>      SAD_X%1_END_SSE2 1
>  %else
>      SAD_X%1_END_SSE2 0
> @@ -1544,6 +1579,11 @@
>  %endmacro
>
>  INIT_XMM ssse3
> +SAD_X_SSE2 3, 32, 64, 7
> +SAD_X_SSE2 3, 32, 32, 7
> +SAD_X_SSE2 3, 32, 24, 7
> +SAD_X_SSE2 3, 32, 16, 7
> +SAD_X_SSE2 3, 32, 8, 7
>  SAD_X_SSE2 3, 16, 64, 7
>  SAD_X_SSE2 3, 16, 32, 7
>  SAD_X_SSE2 3, 16, 16, 7
> @@ -1562,6 +1602,11 @@
>  SAD_X_SSSE3 4, 8, 4
>
>  INIT_XMM avx
> +SAD_X_SSE2 3, 32, 64, 7
> +SAD_X_SSE2 3, 32, 32, 7
> +SAD_X_SSE2 3, 32, 24, 7
> +SAD_X_SSE2 3, 32, 16, 7
> +SAD_X_SSE2 3, 32, 8, 7
>  SAD_X_SSE2 3, 16, 64, 7
>  SAD_X_SSE2 3, 16, 32, 6
>  SAD_X_SSE2 3, 16, 16, 6
> _______________________________________________
> x265-devel mailing list
> [email protected]
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho
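For context, each sad_x3 primitive scores one encoder block against three
candidate reference blocks in a single call and writes the three SAD costs to
res[0..2]; that is why the assembly above reuses a single [r0 + %1] load across
three psadbw/paddd chains. A rough C sketch of the intended behaviour for the
32xN sizes follows, assuming x265's usual pixelcmp_x3_t convention (fixed
FENC_STRIDE for the source block, one shared stride for the references); the
helper name is made up for illustration and is not part of the patch:

#include <stdint.h>
#include <stdlib.h>

#define FENC_STRIDE 64
typedef uint8_t pixel;

/* Illustrative reference model only: shows what each
 * x265_pixel_sad_x3_32xN_* routine is expected to compute. */
static void sad_x3_32xN_c(int height, const pixel *fenc,
                          const pixel *fref0, const pixel *fref1,
                          const pixel *fref2, intptr_t frefstride,
                          int32_t *res)
{
    res[0] = res[1] = res[2] = 0;
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < 32; x++)
        {
            res[0] += abs(fenc[x] - fref0[x]);
            res[1] += abs(fenc[x] - fref1[x]);
            res[2] += abs(fenc[x] - fref2[x]);
        }
        fenc  += FENC_STRIDE;   /* source block uses the fixed encoder stride */
        fref0 += frefstride;    /* the three references share one stride */
        fref1 += frefstride;
        fref2 += frefstride;
    }
}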
