On Thu, Sep 26, 2013 at 9:19 AM, Praveen Tiwari < [email protected]> wrote:
> > > ---------- Forwarded message ---------- > From: <[email protected]> > Date: Thu, Sep 26, 2013 at 7:40 PM > Subject: [PATCH] asm code for filterHorizontal_p_p 4 tap filter > To: [email protected] > > > # HG changeset patch > # User praveen Tiwari > # Date 1380204623 -19800 > # Node ID a31b81b707066aaf0ed42d5a2b453b5c86b9f797 > # Parent 0dbfb0bbca1a1b714aa48db7eaae3f2f9ab713ec > asm code for filterHorizontal_p_p 4 tap filter. > > diff -r 0dbfb0bbca1a -r a31b81b70706 source/common/x86/CMakeLists.txt > --- a/source/common/x86/CMakeLists.txt Wed Sep 25 14:34:49 2013 +0530 > +++ b/source/common/x86/CMakeLists.txt Thu Sep 26 19:40:23 2013 +0530 > @@ -5,7 +5,7 @@ > add_definitions(-DHAVE_ALIGNED_STACK=0) > endif() > > -set(ASMS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a2.asm) > +set(ASMS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a2.asm > ipfilter8.asm) > if (X64) > add_definitions(-DARCH_X86_64=1) > else() > diff -r 0dbfb0bbca1a -r a31b81b70706 source/common/x86/asm-primitives.cpp > --- a/source/common/x86/asm-primitives.cpp Wed Sep 25 14:34:49 2013 > +0530 > +++ b/source/common/x86/asm-primitives.cpp Thu Sep 26 19:40:23 2013 > +0530 > @@ -37,6 +37,9 @@ > LOWRES(ssse3) > LOWRES(avx) > LOWRES(xop) > + > +extern "C" void x265_filterHorizontal_p_p_4_sse4(pixel *src, intptr_t > srcStride, pixel *dst, intptr_t dstStride, int width, int height, short > const *coeff); > + > } > > bool hasXOP(void); // instr_detect.cpp > @@ -615,6 +618,7 @@ > p.sa8d_inter[PARTITION_64x4] = p.satd[PARTITION_64x4]; > p.sa8d_inter[PARTITION_64x12] = p.satd[PARTITION_64x12]; > } > + p.ipfilter_pp[FILTER_H_P_P_4] = > x265_filterHorizontal_p_p_4_sse4; > } > > } > diff -r 0dbfb0bbca1a -r a31b81b70706 source/common/x86/ipfilter8.asm > --- /dev/null Thu Jan 01 00:00:00 1970 +0000 > +++ b/source/common/x86/ipfilter8.asm Thu Sep 26 19:40:23 2013 +0530 > @@ -0,0 +1,134 @@ > > +;***************************************************************************** > +;* Copyright (C) 2013 x265 project 
> +;* > +;* Authors: Min Chen <[email protected]> > +;* Nabajit Deka <[email protected]> > +;* Praveen Kumar Tiwari <[email protected]> > +;* > +;* This program is free software; you can redistribute it and/or modify > +;* it under the terms of the GNU General Public License as published by > +;* the Free Software Foundation; either version 2 of the License, or > +;* (at your option) any later version. > +;* > +;* This program is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > +;* GNU General Public License for more details. > +;* > +;* You should have received a copy of the GNU General Public License > +;* along with this program; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, > USA. > +;* > +;* This program is also available under a commercial proprietary license. > +;* For more information, contact us at [email protected]. 
> > +;*****************************************************************************/ > + > + > +%include "x86inc.asm" > +%include "x86util.asm" > + > +%if ARCH_X86_64 == 0 > + > +INIT_XMM sse4 > + > +SECTION_RODATA 32 > +tab_leftmask: db -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 > + > +tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 > + db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 > + > +tab_c_512: times 8 dw 512 > + > +SECTION .text > + > +%macro FILTER_H4 3 > + movu %1, [src + col - 1] > + pshufb %2, %1, Tm4 > + pmaddubsw %2, coef2 > + pshufb %1, %1, Tm5 > + pmaddubsw %1, coef2 > + phaddw %2, %1 > + pmulhrsw %2, %3 > + packuswb %2, %2 > +%endmacro > + > > +;----------------------------------------------------------------------------- > +; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, > intptr_t dstStride, int width, int height, short const *coeff) > > +;----------------------------------------------------------------------------- > +cglobal filterHorizontal_p_p_4, 0, 7, 8 > +%define src r0 > +%define dst r1 > +%define row r2 > +%define col r3 > +%define width r4 > +%define widthleft r5 > +%define mask_offset r6 > +%define coef2 m7 > +%define x3 m6 > +%define Tm5 m5 > +%define Tm4 m4 > +%define x2 m3 > +%define x1 m2 > +%define x0 m1 > +%define leftmask m0 > +%define tmp r0 > +%define tmp1 r1 > > x86 inc based parameters are not used as of now, they are giving build > errors. 
> > > + mov tmp, r6m > + movu coef2, [tmp] > + packsswb coef2, coef2 > + pshufd coef2, coef2, 0 > + > + mova x3, [tab_c_512] > + > + mov width, r4m > + mov widthleft, width > + and width, ~7 > + and widthleft, 7 > + mov mask_offset, widthleft > + neg mask_offset > There are tab-stops here; these need to go. > + > + movq leftmask, [tab_leftmask + (7 + mask_offset)] > + mova Tm4, [tab_Tm] > + mova Tm5, [tab_Tm + 16] > + > + mov src, r0m > + mov dst, r2m > + mov row, r5m > + > +_loop_row: > + test row, row > + jz _end_row > > + > + xor col, col > + > +_loop_col: > + cmp col, width > + jge _end_col > + > + FILTER_H4 x0, x1, x3 > + movh [dst + col], x1 > + add col, 8 > + > + jmp _loop_col > + > +_end_col: > + test widthleft, widthleft > + jz _next_row > + > + movq x2, [dst + col] > + FILTER_H4 x0, x1, x3 > + pblendvb x2, x2, x1, leftmask > + movh [dst + col], x2 > + > +_next_row: > + add src, r1m > + add dst, r3m > + dec row > + > + jmp _loop_row > > Loop conditions are used at the start to satisfy the test bench > requirements. > That is entirely bass-ackwards. We shouldn't be keeping weird code in the primitive to meet test bench requirements. The test bench should be fixed if it is exercising this primitive in a way the encoder cannot. -- Steve Borho
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
