# HG changeset patch
# User Yuvaraj Venkatesh <yuva...@multicorewareinc.com>
# Date 1384430674 -19800
#      Thu Nov 14 17:34:34 2013 +0530
# Node ID cb15dab6333f3ce23083274718754ca588596547
# Parent  125f9c97e57737fbcf0bc616e1337265a5090440
asm: assembly code for pixel_satd_64x64

diff -r 125f9c97e577 -r cb15dab6333f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 14 16:42:05 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Nov 14 17:34:34 2013 +0530
@@ -61,7 +61,7 @@
 #define HEVC_SATD(cpu) \
     p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \
     p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \
-    p.satd[LUMA_64x64] = cmp<64, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
+    p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu; \
     p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \
     p.satd[LUMA_32x64] = cmp<32, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
     p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \
diff -r 125f9c97e577 -r cb15dab6333f source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Nov 14 16:42:05 2013 +0530
+++ b/source/common/x86/pixel-a.asm Thu Nov 14 17:34:34 2013 +0530
@@ -2035,6 +2035,92 @@
     movd eax, m10
     RET
 
+cglobal pixel_satd_64x64, 4,8,11    ;if WIN64 && notcpuflag(avx); 11 XMM regs declared: m9/m10 are used here and xmm6+ are callee-saved on WIN64
+    SATD_START_SSE2 m10, m7         ; m10 is the accumulator reduced at the end; NOTE(review): raise the XMM count above if pixel_satd_16x4_internal2 touches regs beyond m10
+    mov r6, r0                      ; r6 = entry value of r0, base for the column offsets below
+    mov r7, r2                      ; r7 = entry value of r2
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal2  ; column at byte offset 0: 16 helper calls
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 16]               ; restart from the saved bases, 16 bytes to the right
+    lea r2, [r7 + 16]
+    call pixel_satd_16x4_internal2  ; column at byte offset 16
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 32]
+    lea r2, [r7 + 32]
+    call pixel_satd_16x4_internal2  ; column at byte offset 32
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 48]
+    lea r2, [r7 + 48]
+    call pixel_satd_16x4_internal2  ; column at byte offset 48
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+
+    pxor m9, m9                     ; horizontal add of the four dwords held in m10
+    movhlps m9, m10
+    paddd m10, m9
+    pshufd m9, m10, 1
+    paddd m10, m9
+    movd eax, m10                   ; result is returned in eax
+    RET
+
 
 %else
 cglobal pixel_satd_32x8, 4,6,8 ;if !WIN64
@@ -2599,6 +2685,200 @@
     RET
 %endif
 
+%if ARCH_X86_64                     ; any x86-64 ABI: r6/r7 are free to hold the base pointers (was WIN64 only, which sent UNIX64 to the r0mp path where the reload is a no-op)
+cglobal pixel_satd_64x64, 4,8,9    ; x86-64 variant of this section (m8 is the extra scratch register for the final reduction)
+    SATD_START_SSE2 m6, m7          ; m6 is the accumulator reduced at the end
+    mov r6, r0                      ; r6 = entry value of r0, base for the column offsets below
+    mov r7, r2                      ; r7 = entry value of r2
+    call pixel_satd_8x8_internal2   ; column at byte offset 0: 8 helper calls
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 8]                ; restart from the saved bases, 8 bytes to the right
+    lea r2, [r7 + 8]
+    call pixel_satd_8x8_internal2   ; column at byte offset 8
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 16]
+    lea r2, [r7 + 16]
+    call pixel_satd_8x8_internal2   ; column at byte offset 16
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 24]
+    lea r2, [r7 + 24]
+    call pixel_satd_8x8_internal2   ; column at byte offset 24
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 32]
+    lea r2, [r7 + 32]
+    call pixel_satd_8x8_internal2   ; column at byte offset 32
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 40]
+    lea r2, [r7 + 40]
+    call pixel_satd_8x8_internal2   ; column at byte offset 40
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 48]
+    lea r2, [r7 + 48]
+    call pixel_satd_8x8_internal2   ; column at byte offset 48
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 56]
+    lea r2, [r7 + 56]
+    call pixel_satd_8x8_internal2   ; column at byte offset 56
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor m8, m8                     ; horizontal add of the four dwords held in m6
+    movhlps m8, m6
+    paddd m6, m8
+    pshufd m8, m6, 1
+    paddd m6, m8
+    movd eax, m6                    ; result is returned in eax
+    RET
+%else
+cglobal pixel_satd_64x64, 4,6,8    ; x86-32 variant: no spare GPRs, so the base pointers are reloaded from the r0mp/r2mp argument slots
+    SATD_START_SSE2 m6, m7          ; m6 is the accumulator reduced at the end
+    call pixel_satd_8x8_internal2   ; column at byte offset 0: 8 helper calls
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp                    ; reload the original pointer arguments ...
+    mov r2, r2mp
+    add r0, 8                       ; ... and step 8 bytes to the right
+    add r2, 8
+    call pixel_satd_8x8_internal2   ; column at byte offset 8
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 16
+    add r2, 16
+    call pixel_satd_8x8_internal2   ; column at byte offset 16
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 24
+    add r2, 24
+    call pixel_satd_8x8_internal2   ; column at byte offset 24
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 32
+    add r2, 32
+    call pixel_satd_8x8_internal2   ; column at byte offset 32
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 40
+    add r2, 40
+    call pixel_satd_8x8_internal2   ; column at byte offset 40
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 48
+    add r2, 48
+    call pixel_satd_8x8_internal2   ; column at byte offset 48
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    mov r0, r0mp
+    mov r2, r2mp
+    add r0, 56
+    add r2, 56
+    call pixel_satd_8x8_internal2   ; column at byte offset 56
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor m7, m7                     ; horizontal add of the four dwords held in m6
+    movhlps m7, m6
+    paddd m6, m7
+    pshufd m7, m6, 1
+    paddd m6, m7
+    movd eax, m6                    ; result is returned in eax
+    RET
+%endif
+
 cglobal pixel_satd_16x4, 4,6,8
     SATD_START_SSE2 m6, m7
     BACKUP_POINTERS
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel