# HG changeset patch # User praveen Tiwari # Date 1379594773 -19800 # Node ID 4fd7a03bc60da3138a07ea232f4e30a45af0bd4c # Parent 26d6f155f8df69147f40f4945d99c29a52988c56 Assembly routine for filterHorizontal_p_p() for 4 tap filter
diff -r 26d6f155f8df -r 4fd7a03bc60d source/common/x86/ipfilter8.asm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/source/common/x86/ipfilter8.asm Thu Sep 19 18:16:13 2013 +0530 @@ -0,0 +1,139 @@ +;***************************************************************************** +;* Copyright (C) 2013 x265 project +;* +;* Authors: Min Chen <[email protected]> +;* Nabajit Deka <[email protected]> +;* Praveen Kumar Tiwari <[email protected]> +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at [email protected]. 
;*****************************************************************************/

%include "x86inc.asm"
%include "x86util.asm"

; Everything below is assembled for 32-bit x86 builds only.
%if ARCH_X86_64 == 0

SECTION_RODATA 32

; Byte-select masks for the partial group at the end of a row.
; Entry k (k = width & 7) has its k lowest bytes set to 0xFF, so that a
; byte blend driven by it replaces only the first k bytes of an 8-byte group.
tab_leftmask:   dq 0x0000000000000000
                dq 0x00000000000000FF
                dq 0x000000000000FFFF
                dq 0x0000000000FFFFFF
                dq 0x00000000FFFFFFFF
                dq 0x000000FFFFFFFFFF
                dq 0x0000FFFFFFFFFFFF
                dq 0x00FFFFFFFFFFFFFF

; pshufb control bytes that gather sliding 4-byte windows out of a 16-byte load.
; First row : windows starting at bytes 0, 1, 2, 3 (outputs 0..3).
; Second row: windows starting at bytes 4, 5, 6, 7 (outputs 4..7).
tab_Tm:         db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
                db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10


SECTION .text
INIT_XMM sse4

;-----------------------------------------------------------------------------
; FILTER_H4 work, out
;   Filters 8 horizontally adjacent outputs with the 4-tap filter in coef2.
;   %1 (work): scratch register; receives the 16 source bytes and is clobbered.
;   %2 (out) : on exit its low 8 bytes hold the 8 filtered, rounded and
;              0..255-clamped results; its high 8 bytes are not meaningful.
;   Relies on the symbols defined by the enclosing function:
;   src, col, Tm4, Tm5, coef2, sumOffset, headRoom.
;   Reads 16 bytes at [src + col].
;-----------------------------------------------------------------------------
%macro FILTER_H4 2
    movu        %1, [src + col]         ; 16 source bytes
    pshufb      %2, %1, Tm4             ; 4-byte tap windows for outputs 0..3
    pmaddubsw   %2, coef2               ; unsigned pixel * signed coeff, pairs summed
    pshufb      %1, %1, Tm5             ; 4-byte tap windows for outputs 4..7
    pmaddubsw   %1, coef2
    phaddw      %2, %1                  ; add the two pair-sums -> one word per output
    paddw       %2, sumOffset           ; rounding offset
    psraw       %2, headRoom            ; arithmetic shift keeps the sign for the clamp
    packuswb    %2, %1                  ; clamp to 0..255; results land in the low 8 bytes
%endmacro

;-----------------------------------------------------------------------------
; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
;
; Horizontal 4-tap filter over a width x height block of 8-bit pixels.
; For every output i of a row: dst[i] = clamp((sum(src[i+k] * coeff[k]) + 32) >> 6).
;
; All arguments are fetched from the stack through the x86inc rNm accessors
; (cglobal is asked to auto-load none of them):
;   r0m = src, r1m = srcStride, r2m = dst, r3m = dstStride,
;   r4m = width, r5m = height, r6m = coeff
;
; Rows are processed in groups of 8 outputs. A trailing group of
; (width & 7) outputs is handled by an 8-byte read-modify-write of dst, so the
; bytes beyond width are read and written back unchanged.
;
; coeff must hold four 16-bit values that each fit in a signed byte.
;
; NOTE(review): the tap window for output i is src[i .. i+3]; confirm that the
; caller passes src already positioned on the first tap.
;-----------------------------------------------------------------------------
cglobal filterHorizontal_p_p_4, 0, 7, 8
%define headRoom    14-8
%define offset      (1 << (headRoom - 1))
%define offset2     (offset | (offset << 16))
%define src         r0
%define dst         r1
%define row         r2
%define col         r3
%define width       r4
%define widthleft   r5
%define coef2       m7
%define sumOffset   m6
%define Tm5         m5
%define Tm4         m4
%define x2          m3
%define x1          m2
%define x0          m1
%define leftmask    m0
%define tmp         r0                  ; shares r0 with src; only used before src is loaded

    ; Narrow the four 16-bit coefficients to signed bytes and broadcast them.
    ; Signed saturation is required: pmaddubsw reads this operand as signed
    ; bytes, and an unsigned pack would turn every negative tap into 0.
    mov         tmp, r6m
    movq        coef2, [tmp]            ; only the 4 coefficients (8 bytes) are read
    packsswb    coef2, coef2
    pshufd      coef2, coef2, 0

    ; Rounding offset replicated into every 16-bit lane.
    mov         tmp, offset2
    movd        sumOffset, tmp
    pshufd      sumOffset, sumOffset, 0

    ; width     = outputs covered by full groups of 8
    ; widthleft = outputs in the trailing partial group (0..7)
    mov         width, r4m
    mov         widthleft, width
    and         width, ~7
    and         widthleft, 7

    ; leftmask lives in m0, the register the SSE4.1 encoding of pblendvb
    ; uses as its implicit mask operand.
    movq        leftmask, [tab_leftmask + widthleft * 8]
    mova        Tm4, [tab_Tm]
    mova        Tm5, [tab_Tm + 16]

    mov         src, r0m
    mov         dst, r2m
    mov         row, r5m                ; rows remaining

.loop_row:
    test        row, row
    jz          .end_row

    xor         col, col
.loop_col:
    cmp         col, width
    jge         .end_col

    FILTER_H4   x0, x1
    movq        [dst + col], x1         ; store 8 finished outputs

    add         col, 8
    jmp         .loop_col

.end_col:
    test        widthleft, widthleft
    jz          .next_row

    ; Partial group: keep the existing dst bytes beyond the row's width.
    movq        x2, [dst + col]
    FILTER_H4   x0, x1
    pblendvb    x2, x2, x1, leftmask    ; take filtered bytes only where the mask is set
    movq        [dst + col], x2

.next_row:
    add         src, r1m                ; advance by srcStride
    add         dst, r3m                ; advance by dstStride
    dec         row
    jmp         .loop_row

.end_row:

    RET

%endif ; ARCH_X86_64 == 0

; _______________________________________________
; x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
