Min, the testbench reports a failure here:
** testbench failure reported for vc11_64_main12:: Testing primitives: AVX2 Testing primitives: BMI2 planecopy_sp_shl failed x265: asm primitive has failed. Go and fix that Right Now! return code -1 On Thu, Dec 31, 2015 at 5:33 AM, Min Chen <chenm...@163.com> wrote: > # HG changeset patch > # User Min Chen <chenm...@163.com> > # Date 1451520182 21600 > # Node ID 717cb31ed9931513bb0851f0e6c68af868b5ad45 > # Parent 75d1c62d8f0c517dda37ac89f401faa308d60f24 > asm: rewrite 16bpp partial pixels process code on upShift and downShift > (Issue #223) > --- > source/common/x86/pixel-a.asm | 327 > ++++++++++------------------------------- > source/test/pixelharness.cpp | 25 +++- > 2 files changed, 103 insertions(+), 249 deletions(-) > > diff -r 75d1c62d8f0c -r 717cb31ed993 source/common/x86/pixel-a.asm > --- a/source/common/x86/pixel-a.asm Thu Dec 24 13:58:32 2015 +0530 > +++ b/source/common/x86/pixel-a.asm Wed Dec 30 18:03:02 2015 -0600 > @@ -8154,92 +8154,57 @@ > ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, > intptr_t dstStride, int width, int height, int shift, uint16_t mask) > > > ;------------------------------------------------------------------------------------------------------------------------ > INIT_XMM sse2 > -cglobal downShift_16, 7,7,3 > - movd m0, r6d ; m0 = shift > +cglobal downShift_16, 4,7,3 > + mov r4d, r4m > + mov r5d, r5m > + movd m0, r6m ; m0 = shift > add r1, r1 > + > dec r5d > .loopH: > xor r6, r6 > + > .loopW: > movu m1, [r0 + r6 * 2] > - movu m2, [r0 + r6 * 2 + 16] > + movu m2, [r0 + r6 * 2 + mmsize] > psrlw m1, m0 > psrlw m2, m0 > packuswb m1, m2 > movu [r2 + r6], m1 > > - add r6, 16 > + add r6, mmsize > cmp r6d, r4d > - jl .loopW > + jl .loopW > > ; move to next row > add r0, r1 > add r2, r3 > dec r5d > - jnz .loopH > - > -;processing last row of every frame [To handle width which not a multiple > of 16] > - > + jnz .loopH > + > + ;processing last row of every frame [To handle width which not a > multiple of 16] > + ; r4d must 
be more than or equal to 16(mmsize) > .loop16: > + movu m1, [r0 + (r4 - mmsize) * 2] > + movu m2, [r0 + (r4 - mmsize) * 2 + mmsize] > + psrlw m1, m0 > + psrlw m2, m0 > + packuswb m1, m2 > + movu [r2 + r4 - mmsize], m1 > + > + sub r4d, mmsize > + jz .end > + cmp r4d, mmsize > + jge .loop16 > + > + ; process partial pixels > movu m1, [r0] > - movu m2, [r0 + 16] > + movu m2, [r0 + mmsize] > psrlw m1, m0 > psrlw m2, m0 > packuswb m1, m2 > movu [r2], m1 > > - add r0, 2 * mmsize > - add r2, mmsize > - sub r4d, 16 > - jz .end > - cmp r4d, 15 > - jg .loop16 > - > - cmp r4d, 8 > - jl .process4 > - movu m1, [r0] > - psrlw m1, m0 > - packuswb m1, m1 > - movh [r2], m1 > - > - add r0, mmsize > - add r2, 8 > - sub r4d, 8 > - jz .end > - > -.process4: > - cmp r4d, 4 > - jl .process2 > - movh m1,[r0] > - psrlw m1, m0 > - packuswb m1, m1 > - movd [r2], m1 > - > - add r0, 8 > - add r2, 4 > - sub r4d, 4 > - jz .end > - > -.process2: > - cmp r4d, 2 > - jl .process1 > - movd m1, [r0] > - psrlw m1, m0 > - packuswb m1, m1 > - movd r6, m1 > - mov [r2], r6w > - > - add r0, 4 > - add r2, 2 > - sub r4d, 2 > - jz .end > - > -.process1: > - movd m1, [r0] > - psrlw m1, m0 > - packuswb m1, m1 > - movd r3, m1 > - mov [r2], r3b > .end: > RET > > @@ -8248,12 +8213,16 @@ > ;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, > intptr_t dstStride, int width, int height, int shift, uint16_t mask) > > > ;------------------------------------------------------------------------------------------------------------------------------------- > INIT_YMM avx2 > -cglobal downShift_16, 6,7,3 > +cglobal downShift_16, 4,7,3 > + mov r4d, r4m > + mov r5d, r5m > movd xm0, r6m ; m0 = shift > add r1d, r1d > + > dec r5d > .loopH: > xor r6, r6 > + > .loopW: > movu m1, [r0 + r6 * 2 + 0] > movu m2, [r0 + r6 * 2 + 32] > @@ -8265,92 +8234,39 @@ > > add r6d, mmsize > cmp r6d, r4d > - jl .loopW > + jl .loopW > > ; move to next row > add r0, r1 > add r2, r3 > dec r5d > - jnz .loopH > - > -; processing last row of 
every frame [To handle width which not a > multiple of 32] > - mov r6d, r4d > - and r4d, 31 > - shr r6d, 5 > + jnz .loopH > + > + ; processing last row of every frame [To handle width which not a > multiple of 32] > > .loop32: > - movu m1, [r0] > - movu m2, [r0 + 32] > + movu m1, [r0 + (r4 - mmsize) * 2] > + movu m2, [r0 + (r4 - mmsize) * 2 + mmsize] > psrlw m1, xm0 > psrlw m2, xm0 > packuswb m1, m2 > - vpermq m1, m1, 11011000b > + vpermq m1, m1, q3120 > + movu [r2 + r4 - mmsize], m1 > + > + sub r4d, mmsize > + jz .end > + cmp r4d, mmsize > + jge .loop32 > + > + ; process partial pixels > + movu m1, [r0] > + movu m2, [r0 + mmsize] > + psrlw m1, xm0 > + psrlw m2, xm0 > + packuswb m1, m2 > + vpermq m1, m1, q3120 > movu [r2], m1 > > - add r0, 2*mmsize > - add r2, mmsize > - dec r6d > - jnz .loop32 > - > - cmp r4d, 16 > - jl .process8 > - movu m1, [r0] > - psrlw m1, xm0 > - packuswb m1, m1 > - vpermq m1, m1, 10001000b > - movu [r2], xm1 > - > - add r0, mmsize > - add r2, 16 > - sub r4d, 16 > - jz .end > - > -.process8: > - cmp r4d, 8 > - jl .process4 > - movu m1, [r0] > - psrlw m1, xm0 > - packuswb m1, m1 > - movq [r2], xm1 > - > - add r0, 16 > - add r2, 8 > - sub r4d, 8 > - jz .end > - > -.process4: > - cmp r4d, 4 > - jl .process2 > - movq xm1,[r0] > - psrlw m1, xm0 > - packuswb m1, m1 > - movd [r2], xm1 > - > - add r0, 8 > - add r2, 4 > - sub r4d, 4 > - jz .end > - > -.process2: > - cmp r4d, 2 > - jl .process1 > - movd xm1, [r0] > - psrlw m1, xm0 > - packuswb m1, m1 > - movd r6d, xm1 > - mov [r2], r6w > - > - add r0, 4 > - add r2, 2 > - sub r4d, 2 > - jz .end > - > -.process1: > - movd xm1, [r0] > - psrlw m1, xm0 > - packuswb m1, m1 > - movd r3d, xm1 > - mov [r2], r3b > .end: > RET > > @@ -8487,7 +8403,9 @@ > ;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, > intptr_t dstStride, int width, int height, int shift, uint16_t mask) > > > 
;------------------------------------------------------------------------------------------------------------------------ > INIT_XMM sse2 > -cglobal upShift_16, 6,7,4 > +cglobal upShift_16, 4,7,4 > + mov r4d, r4m > + mov r5d, r5m > movd m0, r6m ; m0 = shift > mova m3, [pw_pixel_max] > FIX_STRIDES r1d, r3d > @@ -8515,9 +8433,25 @@ > dec r5d > jnz .loopH > > -;processing last row of every frame [To handle width which not a multiple > of 16] > - > + ;processing last row of every frame [To handle width which not a > multiple of 16] > + > + ; WARNING: width(r4d) MUST BE more than or equal to 16(mmsize) in here > .loop16: > + movu m1, [r0 + (r4 - mmsize) * 2] > + movu m2, [r0 + (r4 - mmsize) * 2 + mmsize] > + psllw m1, m0 > + psllw m2, m0 > + pand m1, m3 > + pand m2, m3 > + movu [r2 + (r4 - mmsize) * 2], m1 > + movu [r2 + (r4 - mmsize) * 2 + mmsize], m2 > + > + sub r4d, mmsize > + jz .end > + cmp r4d, mmsize > + jge .loop16 > + > + ; process partial pixels > movu m1, [r0] > movu m2, [r0 + mmsize] > psllw m1, m0 > @@ -8527,56 +8461,6 @@ > movu [r2], m1 > movu [r2 + mmsize], m2 > > - add r0, 2 * mmsize > - add r2, 2 * mmsize > - sub r4d, 16 > - jz .end > - jg .loop16 > - > - cmp r4d, 8 > - jl .process4 > - movu m1, [r0] > - psrlw m1, m0 > - pand m1, m3 > - movu [r2], m1 > - > - add r0, mmsize > - add r2, mmsize > - sub r4d, 8 > - jz .end > - > -.process4: > - cmp r4d, 4 > - jl .process2 > - movh m1,[r0] > - psllw m1, m0 > - pand m1, m3 > - movh [r2], m1 > - > - add r0, 8 > - add r2, 8 > - sub r4d, 4 > - jz .end > - > -.process2: > - cmp r4d, 2 > - jl .process1 > - movd m1, [r0] > - psllw m1, m0 > - pand m1, m3 > - movd [r2], m1 > - > - add r0, 4 > - add r2, 4 > - sub r4d, 2 > - jz .end > - > -.process1: > - movd m1, [r0] > - psllw m1, m0 > - pand m1, m3 > - movd r3, m1 > - mov [r2], r3w > .end: > RET > > @@ -8584,9 +8468,10 @@ > > > ;------------------------------------------------------------------------------------------------------------------------------------- > ;void 
planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, > intptr_t dstStride, int width, int height, int shift, uint16_t mask) > > > ;------------------------------------------------------------------------------------------------------------------------------------- > -; TODO: NO TEST CODE! > INIT_YMM avx2 > -cglobal upShift_16, 6,7,4 > +cglobal upShift_16, 4,7,4 > + mov r4d, r4m > + mov r5d, r5m > movd xm0, r6m ; m0 = shift > vbroadcasti128 m3, [pw_pixel_max] > FIX_STRIDES r1d, r3d > @@ -8613,83 +8498,33 @@ > dec r5d > jnz .loopH > > -; processing last row of every frame [To handle width which not a > multiple of 32] > - mov r6d, r4d > - and r4d, 31 > - shr r6d, 5 > + ; processing last row of every frame [To handle width which not a > multiple of 32] > > .loop32: > + movu m1, [r0 + (r4 - mmsize) * 2] > + movu m2, [r0 + (r4 - mmsize) * 2 + mmsize] > + psllw m1, xm0 > + psllw m2, xm0 > + pand m1, m3 > + pand m2, m3 > + movu [r2 + (r4 - mmsize) * 2], m1 > + movu [r2 + (r4 - mmsize) * 2 + mmsize], m2 > + > + sub r4d, mmsize > + jz .end > + cmp r4d, mmsize > + jge .loop32 > + > + ; process partial pixels > movu m1, [r0] > - movu m2, [r0 + mmsize] > + movu m2, [r0] > psllw m1, xm0 > psllw m2, xm0 > pand m1, m3 > pand m2, m3 > movu [r2], m1 > - movu [r2 + mmsize], m2 > - > - add r0, 2*mmsize > - add r2, 2*mmsize > - dec r6d > - jnz .loop32 > - > - cmp r4d, 16 > - jl .process8 > - movu m1, [r0] > - psllw m1, xm0 > - pand m1, m3 > - movu [r2], m1 > - > - add r0, mmsize > - add r2, mmsize > - sub r4d, 16 > - jz .end > - > -.process8: > - cmp r4d, 8 > - jl .process4 > - movu xm1, [r0] > - psllw xm1, xm0 > - pand xm1, xm3 > - movu [r2], xm1 > - > - add r0, 16 > - add r2, 16 > - sub r4d, 8 > - jz .end > - > -.process4: > - cmp r4d, 4 > - jl .process2 > - movq xm1,[r0] > - psllw xm1, xm0 > - pand xm1, xm3 > - movq [r2], xm1 > - > - add r0, 8 > - add r2, 8 > - sub r4d, 4 > - jz .end > - > -.process2: > - cmp r4d, 2 > - jl .process1 > - movd xm1, [r0] > - psllw xm1, xm0 
> - pand xm1, xm3 > - movd [r2], xm1 > - > - add r0, 4 > - add r2, 4 > - sub r4d, 2 > - jz .end > - > -.process1: > - movd xm1, [r0] > - psllw xm1, xm0 > - pand xm1, xm3 > - movd r3d, xm1 > - mov [r2], r3w > + movu [r2], m2 > + > .end: > RET > > diff -r 75d1c62d8f0c -r 717cb31ed993 source/test/pixelharness.cpp > --- a/source/test/pixelharness.cpp Thu Dec 24 13:58:32 2015 +0530 > +++ b/source/test/pixelharness.cpp Wed Dec 30 18:03:02 2015 -0600 > @@ -1299,8 +1299,8 @@ > > memset(ref_dest, 0xCD, sizeof(ref_dest)); > memset(opt_dest, 0xCD, sizeof(opt_dest)); > - int width = 32 + rand() % 32; > - int height = 32 + rand() % 32; > + int width = 32 + (rand() % 32); > + int height = 32 + (rand() % 32); > intptr_t srcStride = 64; > intptr_t dstStride = width; > int j = 0; > @@ -1308,11 +1308,23 @@ > for (int i = 0; i < ITERS; i++) > { > int index = i % TEST_CASES; > + > checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, > dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1)); > ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, > width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1)); > > - if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel))) > + if (memcmp(ref_dest, opt_dest, dstStride * height * > sizeof(pixel))) > + { > + memcpy(opt_dest, ref_dest, sizeof(ref_dest)); > + opt(ushort_test_buff[index] + j, srcStride, opt_dest, > dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1)); > return false; > + } > + > + // check tail memory area > + for(int x = width; x < dstStride; x++) > + { > + if (opt_dest[(height - 1 * dstStride) + x] != 0xCD) > + return false; > + } > > reportfail(); > j += INCR; > @@ -1344,6 +1356,13 @@ > if (memcmp(ref_dest, opt_dest, sizeof(ref_dest))) > return false; > > + // check tail memory area > + for(int x = width; x < dstStride; x++) > + { > + if (opt_dest[(height - 1 * dstStride) + x] != 0xCD) > + return false; > + } > + > reportfail(); > j += INCR; > } > > 
_______________________________________________ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > -- Deepthi Nandakumar Engineering Manager, x265 Multicoreware, Inc
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel