At 2015-03-02 16:40:22,[email protected] wrote: ># HG changeset patch ># User Sumalatha Polureddy<[email protected]> ># Date 1425285607 -19800 ># Node ID a9ad4d8202796dfb78e9d180f5fdb7cc0996ea66 ># Parent 018e8bbaa854b1a4bd82b3a2e23f7775a77da5cc >asm: avx2 code for add_ps[8x8] for 8bpp -- 22x > >add_ps[ 8x8] 22.34x 320.31 7154.80 > >diff -r 018e8bbaa854 -r a9ad4d820279 source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Fri Feb 27 11:46:09 2015 +0530 >+++ b/source/common/x86/asm-primitives.cpp Mon Mar 02 14:10:07 2015 +0530 >@@ -1426,6 +1426,7 @@ > } > if (cpuMask & X265_CPU_AVX2) > { >+ p.cu[BLOCK_8x8].add_ps = x265_pixel_add_ps_8x8_avx2; > p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2; > p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2; > p.cu[BLOCK_64x64].add_ps = x265_pixel_add_ps_64x64_avx2; >diff -r 018e8bbaa854 -r a9ad4d820279 source/common/x86/pixel.h >--- a/source/common/x86/pixel.h Fri Feb 27 11:46:09 2015 +0530 >+++ b/source/common/x86/pixel.h Mon Mar 02 14:10:07 2015 +0530 >@@ -251,6 +251,7 @@ > void x265_pixel_avg_64x32_avx2(pixel* dst, intptr_t dstride, const pixel* > src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); > void x265_pixel_avg_64x16_avx2(pixel* dst, intptr_t dstride, const pixel* > src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); > >+void x265_pixel_add_ps_8x8_avx2(pixel* a, intptr_t dstride, const pixel* b0, >const int16_t* b1, intptr_t sstride0, intptr_t sstride1); > void x265_pixel_add_ps_16x16_avx2(pixel* a, intptr_t dstride, const pixel* > b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); > void x265_pixel_add_ps_32x32_avx2(pixel* a, intptr_t dstride, const pixel* > b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); > void x265_pixel_add_ps_64x64_avx2(pixel* a, intptr_t dstride, const pixel* > b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); >diff -r 018e8bbaa854 -r a9ad4d820279 source/common/x86/pixeladd8.asm >--- 
a/source/common/x86/pixeladd8.asm Fri Feb 27 11:46:09 2015 +0530 >+++ b/source/common/x86/pixeladd8.asm Mon Mar 02 14:10:07 2015 +0530 >@@ -267,6 +267,56 @@ > > jnz .loop > RET >+ >+INIT_YMM avx2 >+cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, >srcStride1 >+ >+ mov r6d, %2/4 >+ add r5, r5 >+.loop: >+ >+ movq xm2, [r2] ; row 0 of src0 >+ movhps xm2, [r2 + r4] ; row 1 of src0 >+ pmovzxbw m0, xm2 >+ >+ >+ movu xm1, [r3] ; row 0 of src1 >+ movu xm2, [r3 + r5] ; row 1 of src1 >+ vinserti128 m1, m1, xm2, 1 >+ >+ lea r2, [r2 + r4 * 2] >+ lea r3, [r3 + r5 * 2] >+ >+ movq xm3, [r2] ; row 2 of src0 >+ movhps xm3, [r2 + r4] ; row 3 of src0 >+ pmovzxbw m2, xm3 >+ >+ movu xm3, [r3] ; row 2 of src1 >+ movu xm4, [r3 + r5] ; row 3 of src1 >+ vinserti128 m3, m3, xm4, 1 >+ >+ lea r2, [r2 + r4 * 2] >+ lea r3, [r3 + r5 * 2] >+ >+ paddw m0, m1 >+ paddw m2, m3 >+ packuswb m0, m0 >+ packuswb m2, m2 Share a single packuswb, or move the m0 operations to the front; this lets us free some registers and break the dependency chain. >+ >+ movq [r0], xm0 ; row 0 of dst >+ vextracti128 xm3, m0, 1 >+ movq [r0 + r1], xm3 ; row 1 of dst >+ lea r0, [r0 + r1 * 2] >+ movq [r0], xm2 ; row 2 of dst >+ vextracti128 xm3, m2, 1 >+ movq [r0 + r1], xm3 ; row 3 of dst >+ >+ lea r0, [r0 + r1 * 2] >+ >+ dec r6d >+ jnz .loop >+ RET >+ > %endif > %endmacro > >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
