# HG changeset patch
# User Nabajit Deka
# Date 1385373715 -19800
#      Mon Nov 25 15:31:55 2013 +0530
# Node ID 365f90b3b78cd3c91d6f0985b0d467da4a91d95a
# Parent  10f605bd053009c8c981c7529322fecd1e54af7b
asm : routine for weight_pp(), for input width in multiples of 16
diff -r 10f605bd0530 -r 365f90b3b78c source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm	Fri Nov 22 14:59:34 2013 -0600
+++ b/source/common/x86/pixel-util.asm	Mon Nov 25 15:31:55 2013 +0530
@@ -2,6 +2,7 @@
 ;* Copyright (C) 2013 x265 project
 ;*
 ;* Authors: Min Chen <[email protected]> <[email protected]>
+;*          Nabajit Deka <[email protected]>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -29,6 +30,8 @@
 c_d_4:          dd 4, 4, 4, 4
 c_d_1234:       dd 1, 2, 3, 4
 
+tab_c_1:        times 8 dw 1
+
 SECTION .text
 
 
@@ -670,3 +673,93 @@
     movd        eax, m7
     RET
 
+
+;-----------------------------------------------------------------------------------------------------------------------------------------------
+;void weight_pp(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+;
+; Weights a block of 8-bit pixels. For every pixel:
+;     dst[x] = clip_u8((((w0 << 6) * src[x] + round) >> shift) + offset)
+;
+; Registers: r0 = src, r1 = dst, r2 = srcStride - width, r3 = dstStride - width,
+;            r4 = width, r5 = rows left, r6 = 16-pixel groups left in the current row,
+;            m0 = [w0<<6, round] word pairs, m1 = shift, m2 = offset dwords, m5 = words of 1
+;
+; Requirements: width must be a non-zero multiple of 16 and height must be non-zero,
+;               because both loops test their counters only after the first pass.
+;               (w0 << 6) and round must each fit in a signed 16-bit word (pmaddwd operands).
+;-----------------------------------------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal weight_pp, 6, 7, 6
+
+    mov         r6d, r6m
+    shl         r6d, 6
+    movd        m0, r6d         ; m0 = [w0<<6]
+
+    movd        m1, r7m         ; m1 = [round]
+    punpcklwd   m0, m1          ; assuming both (w0<<6) and round are using maximum of 16 bits each.
+    pshufd      m0, m0, 0       ; m0 = [w0<<6 round]
+
+    movd        m1, r8m         ; m1 = shift (count operand for psrad)
+
+    movd        m2, r9m
+    pshufd      m2, m2, 0       ; m2 = offset in every dword
+
+    mova        m5, [tab_c_1]   ; m5 = 1 in every word, pairs each pixel with the round term
+
+    movsxd      r4, r4d         ; width is a 32-bit int: sign-extend it before pointer-width math
+    sub         r2, r4          ; r2 = srcStride - width, full-width so negative steps survive
+    sub         r3, r4          ; r3 = dstStride - width
+
+.loopH:
+    mov         r6d, r4d
+    shr         r6d, 4          ; r6 = number of 16-pixel groups in this row
+.loopW:
+    movh        m4, [r0]        ; load source pixels 0..7
+    pmovzxbw    m4, m4          ; zero-extend bytes to words
+
+    punpcklwd   m3, m4, m5      ; m3 = [p0 1 p1 1 p2 1 p3 1]
+    pmaddwd     m3, m0          ; p * (w0<<6) + 1 * round, one dword per pixel
+    psrad       m3, m1          ; >> shift
+    paddd       m3, m2          ; + offset
+
+    punpckhwd   m4, m5          ; same steps for pixels 4..7
+    pmaddwd     m4, m0
+    psrad       m4, m1
+    paddd       m4, m2
+
+    packssdw    m3, m4          ; dwords -> words, signed saturation
+    packuswb    m3, m3          ; words -> bytes, unsigned saturation
+
+    movh        [r1], m3        ; store result pixels 0..7
+
+    movh        m4, [r0 + 8]    ; load source pixels 8..15
+    pmovzxbw    m4, m4
+
+    punpcklwd   m3, m4, m5
+    pmaddwd     m3, m0
+    psrad       m3, m1
+    paddd       m3, m2
+
+    punpckhwd   m4, m5
+    pmaddwd     m4, m0
+    psrad       m4, m1
+    paddd       m4, m2
+
+    packssdw    m3, m4
+    packuswb    m3, m3
+
+    movh        [r1 + 8], m3    ; store result pixels 8..15
+
+    add         r0, 16
+    add         r1, 16
+
+    dec         r6d
+    jnz         .loopW
+
+    lea         r0, [r0 + r2]   ; step src to the start of the next row
+    lea         r1, [r1 + r3]   ; step dst to the start of the next row
+
+    dec         r5d
+    jnz         .loopH
+
+    RET
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel
