Please ignore the patch; I'm re-sending it.
On Mon, Nov 25, 2013 at 4:14 PM, Deepthi Nandakumar <deep...@multicorewareinc.com> wrote:

> Does not apply at the tip.
>
>
> On Mon, Nov 25, 2013 at 11:40 AM, <yuva...@multicorewareinc.com> wrote:
>
>> # HG changeset patch
>> # User Yuvaraj Venkatesh <yuva...@multicorewareinc.com>
>> # Date 1385359751 -19800
>> #      Mon Nov 25 11:39:11 2013 +0530
>> # Node ID 90a80def0f1aabdf29e1f08dd0f2263d8e6af805
>> # Parent  c0c862dc71fbd021efd3922de99da4f2f93e81f4
>> cleanup: removed unused code in pixel-a.asm
>>
>> diff -r c0c862dc71fb -r 90a80def0f1a source/common/x86/pixel-a.asm
>> --- a/source/common/x86/pixel-a.asm    Sun Nov 24 17:34:12 2013 +0800
>> +++ b/source/common/x86/pixel-a.asm    Mon Nov 25 11:39:11 2013 +0530
>> @@ -7157,173 +7157,6 @@
>>  %endif ; !ARCH_X86_64
>>  %endmacro ; SA8D
>>
>> -;=============================================================================
>> -; SA8D_SATD
>> -;=============================================================================
>> -
>> -; %1: vertical/horizontal mode
>> -; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
>> -; m10: satd result
>> -; m6, m11-15: tmp regs
>> -%macro SA8D_SATD_8x4 5
>> -%if %1
>> -    LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
>> -    HADAMARD 0, sumsub, %2, %3, 6
>> -    HADAMARD 0, sumsub, %4, %5, 6
>> -    SBUTTERFLY wd, %2, %3, 6
>> -    SBUTTERFLY wd, %4, %5, 6
>> -    HADAMARD2_2D %2, %4, %3, %5, 6, dq
>> -
>> -    mova m12, m%2
>> -    mova m13, m%3
>> -    mova m14, m%4
>> -    mova m15, m%5
>> -    HADAMARD 0, sumsub, %2, %3, 6
>> -    HADAMARD 0, sumsub, %4, %5, 6
>> -    SBUTTERFLY qdq, 12, 13, 6
>> -    HADAMARD 0, amax, 12, 13, 6
>> -    SBUTTERFLY qdq, 14, 15, 6
>> -    paddw m10, m12
>> -    HADAMARD 0, amax, 14, 15, 6
>> -    paddw m10, m14
>> -%else
>> -    LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
>> -    HADAMARD4_V %2, %3, %4, %5, 6
>> -
>> -    pabsw m12, m%2 ; doing the abs first is a slight advantage
>> -    pabsw m14, m%4
>> -    pabsw m13, m%3
>> -    pabsw m15, m%5
>> -    HADAMARD 1, max, 12, 14, 6, 11
>> -    paddw m10, m12
>> -    HADAMARD 1, max, 13, 15, 6, 11
>> -    paddw m10, m13
>> -%endif
>> -%endmacro ; SA8D_SATD_8x4
>> -
>> -; %1: add spilled regs?
>> -; %2: spill regs?
>> -%macro SA8D_SATD_ACCUM 2
>> -%if HIGH_BIT_DEPTH
>> -    pmaddwd m10, [pw_1]
>> -    HADDUWD m0, m1
>> -%if %1
>> -    paddd m10, temp1
>> -    paddd m0, temp0
>> -%endif
>> -%if %2
>> -    mova temp1, m10
>> -    pxor m10, m10
>> -%endif
>> -%elif %1
>> -    paddw m0, temp0
>> -%endif
>> -%if %2
>> -    mova temp0, m0
>> -%endif
>> -%endmacro
>> -
>> -%macro SA8D_SATD 0
>> -%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
>> -cglobal pixel_sa8d_satd_8x8_internal
>> -    SA8D_SATD_8x4 vertical, 0, 1, 2, 3
>> -    SA8D_SATD_8x4 vertical, 4, 5, 8, 9
>> -
>> -%if vertical ; sse2-style
>> -    HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
>> -    HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
>> -%else ; complete sa8d
>> -    SUMSUB_BADC w, 0, 4, 1, 5, 12
>> -    HADAMARD 2, sumsub, 0, 4, 12, 11
>> -    HADAMARD 2, sumsub, 1, 5, 12, 11
>> -    SUMSUB_BADC w, 2, 8, 3, 9, 12
>> -    HADAMARD 2, sumsub, 2, 8, 12, 11
>> -    HADAMARD 2, sumsub, 3, 9, 12, 11
>> -    HADAMARD 1, amax, 0, 4, 12, 11
>> -    HADAMARD 1, amax, 1, 5, 12, 4
>> -    HADAMARD 1, amax, 2, 8, 12, 4
>> -    HADAMARD 1, amax, 3, 9, 12, 4
>> -%endif
>> -
>> -    ; create sa8d sub results
>> -    paddw m1, m2
>> -    paddw m0, m3
>> -    paddw m0, m1
>> -
>> -    SAVE_MM_PERMUTATION
>> -    ret
>> -
>> -;-------------------------------------------------------------------------------
>> -; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t )
>> -;-------------------------------------------------------------------------------
>> -cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize
>> -    %define temp0 [rsp+0*mmsize]
>> -    %define temp1 [rsp+1*mmsize]
>> -    FIX_STRIDES r1, r3
>> -%if vertical==0
>> -    mova m7, [hmul_8p]
>> -%endif
>> -    lea r4, [3*r1]
>> -    lea r5, [3*r3]
>> -    pxor m10, m10
>> -
>> -%if mmsize==32
>> -    call pixel_sa8d_satd_8x8_internal
>> -    SA8D_SATD_ACCUM 0, 1
>> -    call pixel_sa8d_satd_8x8_internal
>> -    SA8D_SATD_ACCUM 1, 0
>> -    vextracti128 xm1, m0, 1
>> -    vextracti128 xm2, m10, 1
>> -    paddw xm0, xm1
>> -    paddw xm10, xm2
>> -%else
>> -    lea r6, [r2+8*SIZEOF_PIXEL]
>> -    lea r7, [r0+8*SIZEOF_PIXEL]
>> -
>> -    call pixel_sa8d_satd_8x8_internal
>> -    SA8D_SATD_ACCUM 0, 1
>> -    call pixel_sa8d_satd_8x8_internal
>> -    SA8D_SATD_ACCUM 1, 1
>> -
>> -    mov r0, r7
>> -    mov r2, r6
>> -
>> -    call pixel_sa8d_satd_8x8_internal
>> -    SA8D_SATD_ACCUM 1, 1
>> -    call pixel_sa8d_satd_8x8_internal
>> -    SA8D_SATD_ACCUM 1, 0
>> -%endif
>> -
>> -; xop already has fast horizontal sums
>> -%if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0
>> -    pmaddwd xm10, [pw_1]
>> -    HADDUWD xm0, xm1
>> -    phaddd xm0, xm10        ; sa8d1 sa8d2 satd1 satd2
>> -    pshufd xm1, xm0, q2301  ; sa8d2 sa8d1 satd2 satd1
>> -    paddd xm0, xm1          ; sa8d  sa8d  satd  satd
>> -    movd r0d, xm0
>> -    pextrd eax, xm0, 2
>> -%else
>> -%if HIGH_BIT_DEPTH
>> -    HADDD xm0, xm1
>> -    HADDD xm10, xm2
>> -%else
>> -    HADDUW xm0, xm1
>> -    HADDW xm10, xm2
>> -%endif
>> -    movd r0d, xm0
>> -    movd eax, xm10
>> -%endif
>> -    add r0d, 1
>> -    shl rax, 32
>> -    shr r0d, 1
>> -    or rax, r0
>> -    RET
>> -%endmacro ; SA8D_SATD
>> -
>> -;=============================================================================
>> -; INTRA SATD
>> -;=============================================================================
>>  %define TRANS TRANS_SSE2
>>  %define DIFFOP DIFF_UNPACK_SSE2
>>  %define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
>> @@ -7334,17 +7167,11 @@
>>  INIT_XMM sse2
>>  SA8D
>>  SATDS_SSE2
>> -%if ARCH_X86_64
>> -SA8D_SATD
>> -%endif
>>
>>  %if HIGH_BIT_DEPTH == 0
>>  INIT_XMM ssse3,atom
>>  SATDS_SSE2
>>  SA8D
>> -%if ARCH_X86_64
>> -SA8D_SATD
>> -%endif
>>  %endif
>>
>>  %define DIFFOP DIFF_SUMSUB_SSSE3
>> @@ -7356,9 +7183,6 @@
>>  INIT_XMM ssse3
>>  SATDS_SSE2
>>  SA8D
>> -%if ARCH_X86_64
>> -SA8D_SATD
>> -%endif
>>  %undef movdqa ; nehalem doesn't like movaps
>>  %undef movdqu ; movups
>>  %undef punpcklqdq ; or movlhps
>> @@ -7368,9 +7192,6 @@
>>  INIT_XMM sse4
>>  SATDS_SSE2
>>  SA8D
>> -%if ARCH_X86_64
>> -SA8D_SATD
>> -%endif
>>
>>  ; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
>>  ; it's effectively free.
>> @@ -7378,26 +7199,17 @@
>>  INIT_XMM avx
>>  SATDS_SSE2
>>  SA8D
>> -%if ARCH_X86_64
>> -SA8D_SATD
>> -%endif
>>
>>  %define TRANS TRANS_XOP
>>  INIT_XMM xop
>>  SATDS_SSE2
>>  SA8D
>> -%if ARCH_X86_64
>> -SA8D_SATD
>> -%endif
>>
>>
>>  %if HIGH_BIT_DEPTH == 0
>>  %define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
>>  %define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
>>  %define TRANS TRANS_SSE4
>> -%if ARCH_X86_64
>> -SA8D_SATD
>> -%endif
>>
>>  %macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul]
>>      movq xm%1, [r0]
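In case it helps review: the removed pixel_sa8d_satd_16x16 computed sa8d and satd in a single pass and returned both packed into one uint64_t, as its signature comment shows. Below is a minimal C sketch of the final packing, read straight off the add/shl/shr/or tail of the removed assembly (pack_sa8d_satd is a hypothetical name, not anything in the tree):

    #include <stdint.h>

    /* Mirrors the removed epilogue: movd r0d, xm0 loads the sa8d sum,
     * movd eax, xm10 loads the satd sum, then add/shl/shr/or combine
     * them. The satd sum occupies the high 32 bits; the sa8d sum is
     * rounded with (x + 1) >> 1 and occupies the low 32 bits. */
    static uint64_t pack_sa8d_satd(uint32_t sa8d_sum, uint32_t satd_sum)
    {
        return ((uint64_t)satd_sum << 32) | ((sa8d_sum + 1) >> 1);
    }

A caller would then recover the two scores as (uint32_t)ret for sa8d and (uint32_t)(ret >> 32) for satd.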
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel