Thanks, queued On Fri, Mar 13, 2015 at 9:16 AM, Min Chen <[email protected]> wrote:
> # HG changeset patch > # User Min Chen <[email protected]> > # Date 1426218406 25200 > # Node ID 91da3d8069fdc0d937097ff3d9d6ae91e25b852c > # Parent 1f125d14f656cfd253bd36c29a111764f007a349 > asm: improve ~5% on AVX2 interp_8tap_horiz_ps_4xN > --- > source/common/x86/ipfilter8.asm | 49 > +++++++++++++++++--------------------- > 2 files changed, 38 insertions(+), 27 deletions(-) > > diff -r 1f125d14f656 -r 91da3d8069fd source/common/x86/ipfilter8.asm > --- a/source/common/x86/ipfilter8.asm Thu Mar 12 13:06:38 2015 -0500 > +++ b/source/common/x86/ipfilter8.asm Thu Mar 12 20:46:46 2015 -0700 > @@ -1749,10 +1749,10 @@ > ;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* > dst, intptr_t dstStride, int coeffIdx, int isRowExt) > > > ;----------------------------------------------------------------------------------------------------------------------------- > > -%macro IPFILTER_LUMA_PS_4x_AVX2 2 > +%macro IPFILTER_LUMA_PS_4xN_AVX2 1 > INIT_YMM avx2 > %if ARCH_X86_64 == 1 > -cglobal interp_8tap_horiz_ps_%1x%2, 6, 11, 6 > +cglobal interp_8tap_horiz_ps_4x%1, 6,7,6 > mov r5d, r5m > mov r4d, r4m > %ifdef PIC > @@ -1762,7 +1762,6 @@ > vpbroadcastq m0, [tab_LumaCoeff + r4 * > 8] > %endif > mova m1, [tab_Lm] > - mov r9d, %2 > ;height > add r3d, r3d > vbroadcasti128 m2, [pw_2000] > > @@ -1771,17 +1770,17 @@ > ; m1 - shuffle order table > ; m2 - pw_2000 > > - xor r10, r10 > ; loop count variable > sub r0, 3 > test r5d, r5d > - jz .label > - lea r8, [r1 * 3] > ; r8 = (N / 2 - 1) * srcStride > - sub r0, r8 > ; r0(src)-r8 > - add r9, 4 > ; blkheight += N - 1 (7 - 3 = 4 ; since the last three rows not in > loop) > - > -.label > - add r10, 4 > - > + mov r5d, %1 > ; loop count variable - height > + jz .preloop > + lea r6, [r1 * 3] > ; r8 = (N / 2 - 1) * srcStride > + sub r0, r6 > ; r0(src) - 3 * srcStride > + add r5d, 7 > ; need extra 7 rows, just set a specially flag here, blkheight += N - > 1 (7 - 3 = 4 ; since the last three rows not in loop) > + > +.preloop: > + lea r6, [r3 * 3] > +.loop > ; Row 0-1 > vbroadcasti128 m3, [r0] > ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] > pshufb m3, m1 > ; shuffled based on the col order tab_Lm > @@ -1807,18 +1806,17 @@ > psubw m3, m2 > > vextracti128 xm4, m3, 1 > - lea r7, [r3 * 3] > movq [r2], xm3 > ;row 0 > movhps [r2 + r3], xm3 > ;row 1 > movq [r2 + r3 * 2], xm4 > ;row 2 > - movhps [r2 + r7], xm4 > ;row 3 > + movhps [r2 + r6], xm4 > ;row 3 > > lea r0, [r0 + r1 * 2] > ; first loop src ->5th row(i.e 4) > lea r2, [r2 + r3 * 4] > ; first loop dst ->5th row(i.e 4) > - cmp r10, r9 > - jnz .label > - test r5d, r5d > - jz .end > + sub r5d, 4 > + jz .end > + cmp r5d, 4 > + jge .loop > > ; Row 8-9 > vbroadcasti128 m3, [r0] > ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] > @@ -1830,15 +1828,13 @@ > phaddw m3, m4 > ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] > > ; Row 10 > - lea r0, [r0 + r1 * 2] > - vbroadcasti128 m4, [r0] > ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] > + vbroadcasti128 m4, [r0 + r1 * 2] > ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] > pshufb m4, m1 > pmaddubsw m4, m0 > phaddw m4, m4 > ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] > phaddw m3, m4 > > - mova m4, [interp8_hps_shuf] > - vpermd m3, m4, m3 > + vpermd m3, m5, m3 > ; m5 don't broken in above > psubw m3, m2 > > vextracti128 xm4, m3, 1 > @@ -1846,14 +1842,13 @@ > movhps [r2 + r3], xm3 > movq [r2 + r3 * 2], xm4 > .end > -RET > -%endif > -%endmacro > - > - > - IPFILTER_LUMA_PS_4x_AVX2 4 , 4 > - IPFILTER_LUMA_PS_4x_AVX2 4 , 8 > - IPFILTER_LUMA_PS_4x_AVX2 4 , 16 > + RET > +%endif > +%endmacro > + > + IPFILTER_LUMA_PS_4xN_AVX2 4 > + IPFILTER_LUMA_PS_4xN_AVX2 8 > + IPFILTER_LUMA_PS_4xN_AVX2 16 > > %macro IPFILTER_LUMA_PS_8xN_AVX2 1 > ; TODO: verify and enable on X86 mode > > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel >
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
