On Tue, Oct 8, 2013 at 2:33 AM, <[email protected]> wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1381217602 -19800
> # Node ID 4f728eeab74a089c86068663baf522c40a136981
> # Parent 2b2fc4a46c7dcf8720b1b9872c0f3b86c048ffcd
> filterHorizontal_p_p_4, 48x48 asm code

For luma, the only width-48 block used in the encoder is 48x64.

At width 64 there are only 64x16, 64x32, 64x48, and 64x64 (1/4, 1/2, 3/4, 4/4). The same applies to width 32 (8, 16, 24, 32) and 16 (4, 8, 12, 16). (Width 24 only has height 32; width 12 only has height 16.) Width 8 only has 8x4 and 8x8.

So to minimize your work effort you should be writing 8-tap luma macros that interpolate:

* 64x16
* 32x8
* 16x4
* 8x4

The 48x64, 24x32, and 12x16 blocks are rarely used (AMP) and could be built from 16x4 or 4x4.

These 4-tap filters are only used for 4:2:0 chroma and they will have different block-size requirements, but you need to figure out exactly which chroma blocks are needed before writing 4-tap block intrinsics.

> diff -r 2b2fc4a46c7d -r 4f728eeab74a source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm   Tue Oct 08 12:53:44 2013 +0530
> +++ b/source/common/x86/ipfilter8.asm   Tue Oct 08 13:03:22 2013 +0530
> @@ -530,3 +530,101 @@
>      FILTER_H4_w32   x0, x1, x2, x3
>      movu        [dstq + 16],      x1
>      RET
> +
> + SECTION_RODATA 32
> +tab_Tm:    db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
> +           db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
> +
> +tab_c_512: times 8 dw 512
> +
> +SECTION .text
> +
> +%macro FILTER_H4_w48 4
> +    movu        %1, [srcq - 1]
> +    pshufb      %2, %1, Tm0
> +    pmaddubsw   %2, coef2
> +    pshufb      %1, %1, Tm1
> +    pmaddubsw   %1, coef2
> +    phaddw      %2, %1
> +    movu        %1, [srcq - 1 + 8]
> +    pshufb      %4, %1, Tm0
> +    pmaddubsw   %4, coef2
> +    pshufb      %1, %1, Tm1
> +    pmaddubsw   %1, coef2
> +    phaddw      %4, %1
> +    pmulhrsw    %2, %3
> +    pmulhrsw    %4, %3
> +    packuswb    %2, %4
> +    movu        [dstq], %2
> +    movu        %1, [srcq - 1 + 16]
> +    pshufb      %2, %1, Tm0
> +    pmaddubsw   %2, coef2
> +    pshufb      %1, %1, Tm1
> +    pmaddubsw   %1, coef2
> +    phaddw      %2, %1
> +    movu        %1, [srcq - 1 + 24]
> +    pshufb      %4, %1, Tm0
> +    pmaddubsw   %4, coef2
> +    pshufb      %1, %1, Tm1
> +    pmaddubsw   %1, coef2
> +    phaddw      %4, %1
> +    pmulhrsw    %2, %3
> +    pmulhrsw    %4, %3
> +    packuswb    %2, %4
> +    movu        [dstq + 16], x1
> +    movu        %1, [srcq - 1 + 32]
> +    pshufb      %2, %1, Tm0
> +    pmaddubsw   %2, coef2
> +    pshufb      %1, %1, Tm1
> +    pmaddubsw   %1, coef2
> +    phaddw      %2, %1
> +    movu        %1, [srcq - 1 + 40]
> +    pshufb      %4, %1, Tm0
> +    pmaddubsw   %4, coef2
> +    pshufb      %1, %1, Tm1
> +    pmaddubsw   %1, coef2
> +    phaddw      %4, %1
> +    pmulhrsw    %2, %3
> +    pmulhrsw    %4, %3
> +    packuswb    %2, %4
> +%endmacro
> +
> +%macro FILTER_H4_w48_CALL 0
> +    FILTER_H4_w48   x0, x1, x2, x3
> +
> +    movu        [dstq + 32],      x1
> +
> +    add         srcq,        srcstrideq
> +    add         dstq,        dststrideq
> +%endmacro
> +
> +;-----------------------------------------------------------------------------
> +; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
> +;-----------------------------------------------------------------------------
> +INIT_XMM sse4
> +cglobal filterHorizontal_p_p_4, 4, 5, 6, src, srcstride, dst, dststride
> +%define coef2       m6
> +%define Tm0         m5
> +%define Tm1         m4
> +%define x3          m3
> +%define x2          m2
> +%define x1          m1
> +%define x0          m0
> +
> +    mov         r4,         r6m
> +    movu        coef2,      [r4]
> +    packsswb    coef2,      coef2
> +    pshufd      coef2,      coef2,      0
> +
> +    mova        x2,         [tab_c_512]
> +
> +    mova        Tm0,        [tab_Tm]
> +    mova        Tm1,        [tab_Tm + 16]
> +
> +    %rep 47
> +        FILTER_H4_w48_CALL
> +    %endrep
> +
> +    FILTER_H4_w48   x0, x1, x2, x3
> +    movu        [dstq + 32],    x1
> +    RET
> _______________________________________________
> x265-devel mailing list
> [email protected]
> https://mailman.videolan.org/listinfo/x265-devel

--
Steve Borho
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
