On 06/26, [email protected] wrote: > # HG changeset patch > # User Rajesh Paulraj<[email protected]> > # Date 1435311677 -19800 > # Fri Jun 26 15:11:17 2015 +0530 > # Node ID 818b70b015513a01993af0c48e4714cf4fd8dc84 > # Parent 956401f1a679f1e71181b704d64e4acdb6f1a93f > asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80) > > avx2: > planecopy_cp 19.36x 5685.80 110052.08 > > sse4: > planecopy_cp 9.65x 10660.20 102850.27
FYI: this primitive has introduced a SIGBUS on some of the main10 smoke tests on Mac. Min is working on a rewrite of the primitive, which should hopefully resolve this problem. > diff -r 956401f1a679 -r 818b70b01551 source/common/x86/asm-primitives.cpp > --- a/source/common/x86/asm-primitives.cpp Fri Jun 26 15:01:16 2015 +0530 > +++ b/source/common/x86/asm-primitives.cpp Fri Jun 26 15:11:17 2015 +0530 > @@ -1522,6 +1522,7 @@ > p.scale2D_64to32 = PFX(scale2D_64to32_avx2); > p.weight_pp = PFX(weight_pp_avx2); > p.sign = PFX(calSign_avx2); > + p.planecopy_cp = PFX(upShift_8_avx2); > > p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2); > p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2); > diff -r 956401f1a679 -r 818b70b01551 source/common/x86/pixel-a.asm > --- a/source/common/x86/pixel-a.asm Fri Jun 26 15:01:16 2015 +0530 > +++ b/source/common/x86/pixel-a.asm Fri Jun 26 15:11:17 2015 +0530 > @@ -7388,6 +7388,96 @@ > .end: > RET > > +;--------------------------------------------------------------------------------------------------------------------- > +;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t > dstStride, int width, int height, int shift) > +;--------------------------------------------------------------------------------------------------------------------- > +%if ARCH_X86_64 > +INIT_YMM avx2 > +cglobal upShift_8, 7,8,3 > + movd xm2, r6d > + add r3, r3 > + > +.loopH: > + xor r7, r7 > + mov r6d, r4d > +.loopW: > + pmovzxbw m0,[r0 + r7] > + pmovzxbw m1,[r0 + r7 + 16] > + psllw m0, xm2 > + psllw m1, xm2 > + movu [r2 + r7 * 2], m0 > + movu [r2 + r7 * 2 + 32], m1 > + > + add r7d, 32 > + sub r6d, 32 > + jg .loopW > + > + ; move to next row > + add r0, r1 > + add r2, r3 > + dec r5d > + jnz .loopH > + > +;processing last row of every frame [To handle width which not a multiple of > 16] > + > +.loop16: > + pmovzxbw m0,[r0] > + psllw m0, xm2 > + movu [r2], m0 > + > + add r0, mmsize > + add r2, 2 * mmsize > + sub r4d, 16 > + jg 
.loop16 > + jz .end > + > + cmp r4d, 8 > + jl .process4 > + pmovzxbw m0,[r0] > + psllw m0, xm2 > + movu [r2], m0 > + > + add r0, 8 > + add r2, mmsize > + sub r4d, 8 > + jz .end > + > +.process4: > + cmp r4d, 4 > + jl .process2 > + movq xm0,[r0] > + pmovzxbw m0,xm0 > + psllw xm0, xm2 > + movq [r2], xm0 > + > + add r0, 4 > + add r2, 8 > + sub r4d, 4 > + jz .end > + > +.process2: > + cmp r4d, 2 > + jl .process1 > + movzx r3d, byte [r0] > + shl r3d, 2 > + mov [r2], r3w > + movzx r3d, byte [r0 + 1] > + shl r3d, 2 > + mov [r2 + 2], r3w > + > + add r0, 2 > + add r2, 4 > + sub r4d, 2 > + jz .end > + > +.process1: > + movzx r3d, byte [r0] > + shl r3d, 2 > + mov [r2], r3w > +.end: > + RET > +%endif > + > %macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp > %if cpuflag(ssse3) > pabsd %1, %3 > diff -r 956401f1a679 -r 818b70b01551 source/common/x86/pixel.h > --- a/source/common/x86/pixel.h Fri Jun 26 15:01:16 2015 +0530 > +++ b/source/common/x86/pixel.h Fri Jun 26 15:11:17 2015 +0530 > @@ -31,6 +31,7 @@ > void PFX(downShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* > dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); > void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* > dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); > void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, > intptr_t dstStride, int width, int height, int shift); > +void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, > intptr_t dstStride, int width, int height, int shift); > > #define DECL_PIXELS(cpu) \ > FUNCDEF_PU(int, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, > intptr_t); \ > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel -- Steve Borho _______________________________________________ x265-devel mailing list [email protected] 
https://mailman.videolan.org/listinfo/x265-devel
