it right, but we can improve
At 2015-03-02 16:47:23, [email protected] wrote:
># HG changeset patch ># User Sumalatha Polureddy<[email protected]> ># Date 1425286035 -19800 ># Node ID 1be088c8bc675752ebfebc4fda3bad41659269a4 ># Parent a9ad4d8202796dfb78e9d180f5fdb7cc0996ea66 >asm: avx2 code for add_ps[8x8] for 10bpp -- 24.9x > >add_ps[ 8x8] 24.97x 275.68 6882.88 > >diff -r a9ad4d820279 -r 1be088c8bc67 source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Mon Mar 02 14:10:07 2015 +0530 >+++ b/source/common/x86/asm-primitives.cpp Mon Mar 02 14:17:15 2015 +0530 >@@ -1069,6 +1069,8 @@ > } > if (cpuMask & X265_CPU_AVX2) > { >+ p.cu[BLOCK_8x8].add_ps = x265_pixel_add_ps_8x8_avx2; >+ > p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2; > p.cu[BLOCK_16x16].sse_ss = x265_pixel_ssd_ss_16x16_avx2; > >diff -r a9ad4d820279 -r 1be088c8bc67 source/common/x86/pixeladd8.asm >--- a/source/common/x86/pixeladd8.asm Mon Mar 02 14:10:07 2015 +0530 >+++ b/source/common/x86/pixeladd8.asm Mon Mar 02 14:17:15 2015 +0530 >@@ -229,6 +229,53 @@ > > jnz .loop > RET >+ >+INIT_YMM avx2 >+cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, >srcStride1 >+ mova m5, [pw_pixel_max] >+ pxor m4, m4 >+ mov r6d, %2/4 >+ add r4, r4 >+ add r5, r5 >+ add r1, r1 >+.loop: >+ movu xm0, [r2] ; row 0 of src0 >+ movu xm1, [r2 + r4] ; row 1 of src0 >+ vinserti128 m0, m0, xm1, 1 >+ >+ movu xm1, [r3] ; row 0 of src1 >+ movu xm2, [r3 + r5] ; row 1 of src1 >+ vinserti128 m1, m1, xm2, 1 >+ lea r2, [r2 + r4 * 2] >+ lea r3, [r3 + r5 * 2] >+ >+ paddw m0, m1
Here we could replace vinserti128 + vinserti128 + paddw with paddw + paddw + vinserti128 (add each pair of rows in xmm registers first, then merge the two sums with a single vinserti128). vinserti128 uses port 5, which is the bottleneck on Haswell.
>+ CLIPW m0, m4, m5 >+ movu [r0], xm0 ; row 0 of dst >+ vextracti128 xm3, m0, 1 >+ movu [r0 + r1], xm3 ; row 1 of dst >+ lea r0, [r0 + r1 * 2] >+ >+ movu xm0, [r2] ; row 2 of src0 >+ movu xm1, [r2 + r4] ; row 3 of src0 >+ vinserti128 m0, m0, xm1, 1 >+ >+ movu xm1, [r3] ; row 2 of src1 >+ movu xm2, [r3 + r5] ; row 3 of src1 >+ vinserti128 m1, m1, xm2, 1 >+ lea r2, [r2 + r4 * 2] >+ lea r3, [r3 + r5 * 2] >+ >+ paddw m0, m1 >+ CLIPW m0, m4, m5 >+ movu [r0], xm0 ; row 2 of dst >+ vextracti128 xm3, m0, 1 >+ movu [r0 + r1], xm3 ; row 3 of dst >+ lea r0, [r0 + r1 * 2] >+ >+ dec r6d >+ jnz .loop >+ RET > %else > INIT_XMM sse4 > cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, > srcStride1 >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
