# HG changeset patch
# User Min Chen <chenm...@163.com>
# Date 1451520182 21600
# Node ID 717cb31ed9931513bb0851f0e6c68af868b5ad45
# Parent  75d1c62d8f0c517dda37ac89f401faa308d60f24
asm: rewrite 16bpp partial pixels process code on upShift and downShift (Issue #223)

The last row of the plane is no longer finished with a ladder of
.process8/.process4/.process2/.process1 partial stores.  Instead it is walked
backwards from the end of the row in full-vector steps (mmsize pixels per
step), and whatever remains at the start of the row is covered by one more
full-vector access at offset 0, overlapping pixels that were already written.
This requires width >= mmsize pixels.

width and height are now loaded explicitly from r4m/r5m instead of being
declared as register arguments in cglobal.

The test harness additionally checks that the bytes between width and
dstStride in the last destination row keep their 0xCD fill pattern.

---
 source/common/x86/pixel-a.asm |  327 ++++++++++-------------------------------
 source/test/pixelharness.cpp  |   25 +++-
 2 files changed, 103 insertions(+), 249 deletions(-)

diff -r 75d1c62d8f0c -r 717cb31ed993 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Thu Dec 24 13:58:32 2015 +0530
+++ b/source/common/x86/pixel-a.asm	Wed Dec 30 18:03:02 2015 -0600
@@ -8154,92 +8154,57 @@
 ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
 ;------------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal downShift_16, 7,7,3
-    movd        m0, r6d        ; m0 = shift
+cglobal downShift_16, 4,7,3
+    mov         r4d, r4m
+    mov         r5d, r5m
+    movd        m0, r6m        ; m0 = shift
     add         r1, r1
+
     dec         r5d
 .loopH:
     xor         r6, r6
+
 .loopW:
     movu        m1, [r0 + r6 * 2]
-    movu        m2, [r0 + r6 * 2 + 16]
+    movu        m2, [r0 + r6 * 2 + mmsize]
     psrlw       m1, m0
     psrlw       m2, m0
     packuswb    m1, m2
     movu        [r2 + r6], m1
 
-    add         r6, 16
+    add         r6, mmsize
     cmp         r6d, r4d
-    jl         .loopW
+    jl          .loopW
 
     ; move to next row
     add         r0, r1
     add         r2, r3
     dec         r5d
-    jnz        .loopH
-
-;processing last row of every frame [To handle width which not a multiple of 16]
-
+    jnz         .loopH
+
+    ;processing last row of every frame [To handle width which is not a multiple of 16]
+    ; r4d must be more than or equal to 16(mmsize)
 .loop16:
+    movu        m1, [r0 + (r4 - mmsize) * 2]
+    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
+    psrlw       m1, m0
+    psrlw       m2, m0
+    packuswb    m1, m2
+    movu        [r2 + r4 - mmsize], m1
+
+    sub         r4d, mmsize
+    jz          .end
+    cmp         r4d, mmsize
+    jge         .loop16
+
+    ; process partial pixels
     movu        m1, [r0]
-    movu        m2, [r0 + 16]
+    movu        m2, [r0 + mmsize]
     psrlw       m1, m0
     psrlw       m2, m0
     packuswb    m1, m2
     movu        [r2], m1
 
-    add         r0, 2 * mmsize
-    add         r2, mmsize
-    sub         r4d, 16
-    jz          .end
-    cmp         r4d, 15
-    jg          .loop16
-
-    cmp         r4d, 8
-    jl          .process4
-    movu        m1, [r0]
-    psrlw       m1, m0
-    packuswb    m1, m1
-    movh        [r2], m1
-
-    add         r0, mmsize
-    add         r2, 8
-    sub         r4d, 8
-    jz          .end
-
-.process4:
-    cmp         r4d, 4
-    jl          .process2
-    movh        m1,[r0]
-    psrlw       m1, m0
-    packuswb    m1, m1
-    movd        [r2], m1
-
-    add         r0, 8
-    add         r2, 4
-    sub         r4d, 4
-    jz          .end
-
-.process2:
-    cmp         r4d, 2
-    jl          .process1
-    movd        m1, [r0]
-    psrlw       m1, m0
-    packuswb    m1, m1
-    movd        r6, m1
-    mov         [r2], r6w
-
-    add         r0, 4
-    add         r2, 2
-    sub         r4d, 2
-    jz          .end
-
-.process1:
-    movd        m1, [r0]
-    psrlw       m1, m0
-    packuswb    m1, m1
-    movd        r3, m1
-    mov         [r2], r3b
 .end:
     RET
 
@@ -8248,12 +8213,16 @@
 ;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
 ;-------------------------------------------------------------------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal downShift_16, 6,7,3
+cglobal downShift_16, 4,7,3
+    mov         r4d, r4m
+    mov         r5d, r5m
     movd        xm0, r6m        ; m0 = shift
     add         r1d, r1d
+
     dec         r5d
 .loopH:
     xor         r6, r6
+
 .loopW:
     movu        m1, [r0 + r6 * 2 +  0]
     movu        m2, [r0 + r6 * 2 + 32]
@@ -8265,92 +8234,39 @@
 
     add         r6d, mmsize
     cmp         r6d, r4d
-    jl         .loopW
+    jl          .loopW
 
     ; move to next row
     add         r0, r1
     add         r2, r3
     dec         r5d
-    jnz        .loopH
-
-; processing last row of every frame [To handle width which not a multiple of 32]
-    mov         r6d, r4d
-    and         r4d, 31
-    shr         r6d, 5
+    jnz         .loopH
+
+    ; processing last row of every frame [To handle width which is not a multiple of 32]
 
 .loop32:
-    movu        m1, [r0]
-    movu        m2, [r0 + 32]
+    movu        m1, [r0 + (r4 - mmsize) * 2]
+    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
     psrlw       m1, xm0
     psrlw       m2, xm0
     packuswb    m1, m2
-    vpermq      m1, m1, 11011000b
+    vpermq      m1, m1, q3120
+    movu        [r2 + r4 - mmsize], m1
+
+    sub         r4d, mmsize
+    jz          .end
+    cmp         r4d, mmsize
+    jge         .loop32
+
+    ; process partial pixels
+    movu        m1, [r0]
+    movu        m2, [r0 + mmsize]
+    psrlw       m1, xm0
+    psrlw       m2, xm0
+    packuswb    m1, m2
+    vpermq      m1, m1, q3120
     movu        [r2], m1
 
-    add         r0, 2*mmsize
-    add         r2, mmsize
-    dec         r6d
-    jnz         .loop32
-
-    cmp         r4d, 16
-    jl          .process8
-    movu        m1, [r0]
-    psrlw       m1, xm0
-    packuswb    m1, m1
-    vpermq      m1, m1, 10001000b
-    movu        [r2], xm1
-
-    add         r0, mmsize
-    add         r2, 16
-    sub         r4d, 16
-    jz          .end
-
-.process8:
-    cmp         r4d, 8
-    jl          .process4
-    movu        m1, [r0]
-    psrlw       m1, xm0
-    packuswb    m1, m1
-    movq        [r2], xm1
-
-    add         r0, 16
-    add         r2, 8
-    sub         r4d, 8
-    jz          .end
-
-.process4:
-    cmp         r4d, 4
-    jl          .process2
-    movq        xm1,[r0]
-    psrlw       m1, xm0
-    packuswb    m1, m1
-    movd        [r2], xm1
-
-    add         r0, 8
-    add         r2, 4
-    sub         r4d, 4
-    jz          .end
-
-.process2:
-    cmp         r4d, 2
-    jl          .process1
-    movd        xm1, [r0]
-    psrlw       m1, xm0
-    packuswb    m1, m1
-    movd        r6d, xm1
-    mov         [r2], r6w
-
-    add         r0, 4
-    add         r2, 2
-    sub         r4d, 2
-    jz          .end
-
-.process1:
-    movd        xm1, [r0]
-    psrlw       m1, xm0
-    packuswb    m1, m1
-    movd        r3d, xm1
-    mov         [r2], r3b
 .end:
     RET
 
@@ -8487,7 +8403,9 @@
 ;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
 ;------------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal upShift_16, 6,7,4
+cglobal upShift_16, 4,7,4
+    mov         r4d, r4m
+    mov         r5d, r5m
     movd        m0, r6m        ; m0 = shift
     mova        m3, [pw_pixel_max]
     FIX_STRIDES r1d, r3d
@@ -8515,9 +8433,25 @@
     dec         r5d
     jnz         .loopH
 
-;processing last row of every frame [To handle width which not a multiple of 16]
-
+    ;processing last row of every frame [To handle width which is not a multiple of 16]
+
+    ; WARNING: width(r4d) MUST BE more than or equal to 16(mmsize) in here
 .loop16:
+    movu        m1, [r0 + (r4 - mmsize) * 2]
+    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
+    psllw       m1, m0
+    psllw       m2, m0
+    pand        m1, m3
+    pand        m2, m3
+    movu        [r2 + (r4 - mmsize) * 2], m1
+    movu        [r2 + (r4 - mmsize) * 2 + mmsize], m2
+
+    sub         r4d, mmsize
+    jz          .end
+    cmp         r4d, mmsize
+    jge         .loop16
+
+    ; process partial pixels
     movu        m1, [r0]
     movu        m2, [r0 + mmsize]
     psllw       m1, m0
@@ -8527,56 +8461,6 @@
     movu        [r2], m1
     movu        [r2 + mmsize], m2
 
-    add         r0, 2 * mmsize
-    add         r2, 2 * mmsize
-    sub         r4d, 16
-    jz          .end
-    jg          .loop16
-
-    cmp         r4d, 8
-    jl          .process4
-    movu        m1, [r0]
-    psrlw       m1, m0
-    pand        m1, m3
-    movu        [r2], m1
-
-    add         r0, mmsize
-    add         r2, mmsize
-    sub         r4d, 8
-    jz          .end
-
-.process4:
-    cmp         r4d, 4
-    jl          .process2
-    movh        m1,[r0]
-    psllw       m1, m0
-    pand        m1, m3
-    movh        [r2], m1
-
-    add         r0, 8
-    add         r2, 8
-    sub         r4d, 4
-    jz          .end
-
-.process2:
-    cmp         r4d, 2
-    jl          .process1
-    movd        m1, [r0]
-    psllw       m1, m0
-    pand        m1, m3
-    movd        [r2], m1
-
-    add         r0, 4
-    add         r2, 4
-    sub         r4d, 2
-    jz          .end
-
-.process1:
-    movd        m1, [r0]
-    psllw       m1, m0
-    pand        m1, m3
-    movd        r3, m1
-    mov         [r2], r3w
 .end:
     RET
 
@@ -8584,9 +8468,10 @@
 ;-------------------------------------------------------------------------------------------------------------------------------------
 ;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
 ;-------------------------------------------------------------------------------------------------------------------------------------
-; TODO: NO TEST CODE!
 INIT_YMM avx2
-cglobal upShift_16, 6,7,4
+cglobal upShift_16, 4,7,4
+    mov         r4d, r4m
+    mov         r5d, r5m
     movd        xm0, r6m        ; m0 = shift
     vbroadcasti128 m3, [pw_pixel_max]
     FIX_STRIDES r1d, r3d
@@ -8613,83 +8498,33 @@
     dec         r5d
     jnz         .loopH
 
-; processing last row of every frame [To handle width which not a multiple of 32]
-    mov         r6d, r4d
-    and         r4d, 31
-    shr         r6d, 5
+    ; processing last row of every frame [To handle width which is not a multiple of 32]
 
 .loop32:
+    movu        m1, [r0 + (r4 - mmsize) * 2]
+    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
+    psllw       m1, xm0
+    psllw       m2, xm0
+    pand        m1, m3
+    pand        m2, m3
+    movu        [r2 + (r4 - mmsize) * 2], m1
+    movu        [r2 + (r4 - mmsize) * 2 + mmsize], m2
+
+    sub         r4d, mmsize
+    jz          .end
+    cmp         r4d, mmsize
+    jge         .loop32
+
+    ; process partial pixels
     movu        m1, [r0]
     movu        m2, [r0 + mmsize]
     psllw       m1, xm0
     psllw       m2, xm0
     pand        m1, m3
     pand        m2, m3
     movu        [r2], m1
     movu        [r2 + mmsize], m2
-
-    add         r0, 2*mmsize
-    add         r2, 2*mmsize
-    dec         r6d
-    jnz         .loop32
-
-    cmp         r4d, 16
-    jl          .process8
-    movu        m1, [r0]
-    psllw       m1, xm0
-    pand        m1, m3
-    movu        [r2], m1
-
-    add         r0, mmsize
-    add         r2, mmsize
-    sub         r4d, 16
-    jz          .end
-
-.process8:
-    cmp         r4d, 8
-    jl          .process4
-    movu        xm1, [r0]
-    psllw       xm1, xm0
-    pand        xm1, xm3
-    movu        [r2], xm1
-
-    add         r0, 16
-    add         r2, 16
-    sub         r4d, 8
-    jz          .end
-
-.process4:
-    cmp         r4d, 4
-    jl          .process2
-    movq        xm1,[r0]
-    psllw       xm1, xm0
-    pand        xm1, xm3
-    movq        [r2], xm1
-
-    add         r0, 8
-    add         r2, 8
-    sub         r4d, 4
-    jz          .end
-
-.process2:
-    cmp         r4d, 2
-    jl          .process1
-    movd        xm1, [r0]
-    psllw       xm1, xm0
-    pand        xm1, xm3
-    movd        [r2], xm1
-
-    add         r0, 4
-    add         r2, 4
-    sub         r4d, 2
-    jz          .end
-
-.process1:
-    movd        xm1, [r0]
-    psllw       xm1, xm0
-    pand        xm1, xm3
-    movd        r3d, xm1
-    mov         [r2], r3w
+
 .end:
     RET
 
diff -r 75d1c62d8f0c -r 717cb31ed993 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Thu Dec 24 13:58:32 2015 +0530
+++ b/source/test/pixelharness.cpp	Wed Dec 30 18:03:02 2015 -0600
@@ -1299,8 +1299,8 @@
     memset(ref_dest, 0xCD, sizeof(ref_dest));
     memset(opt_dest, 0xCD, sizeof(opt_dest));
 
-    int width = 32 + rand() % 32;
-    int height = 32 + rand() % 32;
+    int width = 32 + (rand() % 32);
+    int height = 32 + (rand() % 32);
     intptr_t srcStride = 64;
     intptr_t dstStride = width;
     int j = 0;
@@ -1308,11 +1308,23 @@
     for (int i = 0; i < ITERS; i++)
     {
         int index = i % TEST_CASES;
+
         checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
         ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
 
-        if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel)))
+        if (memcmp(ref_dest, opt_dest, dstStride * height * sizeof(pixel)))
+        {
+            memcpy(opt_dest, ref_dest, sizeof(ref_dest));
+            opt(ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
             return false;
+        }
+
+        // check tail memory area of the last row (loop is empty while dstStride == width)
+        for(int x = width; x < dstStride; x++)
+        {
+            if (opt_dest[(height - 1) * dstStride + x] != 0xCD)
+                return false;
+        }
 
         reportfail();
         j += INCR;
@@ -1344,6 +1356,13 @@
         if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
             return false;
 
+        // check tail memory area of the last row
+        for(int x = width; x < dstStride; x++)
+        {
+            if (opt_dest[(height - 1) * dstStride + x] != 0xCD)
+                return false;
+        }
+
         reportfail();
         j += INCR;
     }
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel