# HG changeset patch
# User Dnyaneshwar Gorade <dnyanesh...@multicorewareinc.com>
# Date 1383218218 -19800
#      Thu Oct 31 16:46:58 2013 +0530
# Node ID 515b0af5eb805407d40ead87fd29a8c32118d3a2
# Parent  86ff1a3ec89720a73325148e8ac01ec1dbdab3c2
asm: Optimized sad_48x64: +5x and sad_24x32: +2x asm routines

diff -r 86ff1a3ec897 -r 515b0af5eb80 source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Thu Oct 31 16:21:35 2013 +0530 +++ b/source/common/x86/sad-a.asm Thu Oct 31 16:46:58 2013 +0530 @@ -175,39 +175,37 @@ %macro PROCESS_SAD_24x4 0 movu m1, [r2] movq m2, [r2 + 16] - lea r2, [r2 + r3] - movu m3, [r2] - movq m4, [r2 + 16] + movu m3, [r2 + r3] + movq m4, [r2 + r3 + 16] psadbw m1, [r0] psadbw m3, [r0 + r1] paddd m0, m1 paddd m0, m3 movq m1, [r0 + 16] - lea r0, [r0 + r1] - movq m3, [r0 + 16] + movq m3, [r0 + r1 + 16] punpcklqdq m2, m4 punpcklqdq m1, m3 psadbw m2, m1 paddd m0, m2 - lea r2, [r2 + r3] - lea r0, [r0 + r1] + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] movu m1, [r2] movq m2, [r2 + 16] - lea r2, [r2 + r3] - movu m3, [r2] - movq m4, [r2 + 16] + movu m3, [r2 + r3] + movq m4, [r2 + r3 + 16] psadbw m1, [r0] psadbw m3, [r0 + r1] paddd m0, m1 paddd m0, m3 movq m1, [r0 + 16] - lea r0, [r0 + r1] - movq m3, [r0 + 16] + movq m3, [r0 + r1 + 16] punpcklqdq m2, m4 punpcklqdq m1, m3 psadbw m2, m1 paddd m0, m2 + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] %endmacro %macro PROCESS_SAD_32x4 0 @@ -255,8 +253,18 @@ paddd m1, m2 paddd m0, m1 paddd m0, m3 - lea r2, [r2 + r3] - lea r0, [r0 + r1] + + movu m1, [r2 + r3] + movu m2, [r2 + r3 + 16] + movu m3, [r2 + r3 + 32] + psadbw m1, [r0 + r1] + psadbw m2, [r0 + r1 + 16] + psadbw m3, [r0 + r1 + 32] + paddd m1, m2 + paddd m0, m1 + paddd m0, m3 + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] movu m1, [r2] movu m2, [r2 + 16] @@ -267,30 +275,18 @@ paddd m1, m2 paddd m0, m1 paddd m0, m3 - lea r2, [r2 + r3] - lea r0, [r0 + r1] - movu m1, [r2] - movu m2, [r2 + 16] - movu m3, [r2 + 32] - psadbw m1, [r0] - psadbw m2, [r0 + 16] - psadbw m3, [r0 + 32] + movu m1, [r2 + r3] + movu m2, [r2 + r3 + 16] + movu m3, [r2 + r3 + 32] + psadbw m1, [r0 + r1] + psadbw m2, [r0 + r1 + 16] + psadbw m3, [r0 + r1 + 32] paddd m1, m2 paddd m0, m1 paddd m0, m3 - lea r2, [r2 + r3] - lea r0, [r0 + r1] - - movu m1, [r2] - movu m2, [r2 + 16] - movu m3, 
[r2 + 32] - psadbw m1, [r0] - psadbw m2, [r0 + 16] - psadbw m3, [r0 + 32] - paddd m1, m2 - paddd m0, m1 - paddd m0, m3 + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] %endmacro %macro PROCESS_SAD_8x4 0 @@ -725,27 +721,17 @@ ;----------------------------------------------------------------------------- ; int pixel_sad_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- -cglobal pixel_sad_48x64, 4,4,5 +cglobal pixel_sad_48x64, 4,5,5 pxor m0, m0 - mov r4, 64 + mov r4d, 4 .loop PROCESS_SAD_48x4 - lea r2, [r2 + r3] - lea r0, [r0 + r1] - PROCESS_SAD_48x4 - lea r2, [r2 + r3] - lea r0, [r0 + r1] - - sub r4, 8 - cmp r4, 8 - -jnz .loop PROCESS_SAD_48x4 - lea r2, [r2 + r3] - lea r0, [r0 + r1] PROCESS_SAD_48x4 + dec r4d + jnz .loop movhlps m1, m0 paddd m0, m1 @@ -755,24 +741,17 @@ ;----------------------------------------------------------------------------- ; int pixel_sad_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- -cglobal pixel_sad_24x32, 4,4,4 +cglobal pixel_sad_24x32, 4,5,4 pxor m0, m0 - mov r4, 32 + mov r4d, 2 .loop PROCESS_SAD_24x4 - lea r2, [r2 + r3] - lea r0, [r0 + r1] PROCESS_SAD_24x4 - lea r2, [r2 + r3] - lea r0, [r0 + r1] - sub r4, 8 - cmp r4, 8 + PROCESS_SAD_24x4 + PROCESS_SAD_24x4 + dec r4d jnz .loop - PROCESS_SAD_24x4 - lea r2, [r2 + r3] - lea r0, [r0 + r1] - PROCESS_SAD_24x4 movhlps m1, m0 paddd m0, m1 _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel