This version looks good, thanks.
At 2016-03-04 17:29:40,"Ramya Sriraman" <[email protected]> wrote: Thanks for the improvements min. Pls find the modified patch below. # HG changeset patch # User Ramya Sriraman<[email protected]> # Date 1456985538 -19800 # Thu Mar 03 11:42:18 2016 +0530 # Node ID 75a3948f28b6bd8f2b3536cf18e17cc8573be444 # Parent 9cc9920bf82be1b43efd2a3628e28a3a78ab3b2f arm: Implement planecopy_cp NEON diff -r 9cc9920bf82b -r 75a3948f28b6 source/common/arm/asm-primitives.cpp --- a/source/common/arm/asm-primitives.cpp Wed Mar 02 17:26:11 2016 +0530 +++ b/source/common/arm/asm-primitives.cpp Thu Mar 03 11:42:18 2016 +0530 @@ -33,6 +33,7 @@ #include "blockcopy8.h" #include "pixel.h" #include "pixel-util.h" +#include "ipfilter8.h" } namespace X265_NS { @@ -142,6 +143,9 @@ p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon); p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon); + // planecopy + p.planecopy_cp = PFX(pixel_planecopy_cp_neon); + // sad p.pu[LUMA_8x4].sad = PFX(pixel_sad_8x4_neon); p.pu[LUMA_8x8].sad = PFX(pixel_sad_8x8_neon); diff -r 9cc9920bf82b -r 75a3948f28b6 source/common/arm/pixel-util.S --- a/source/common/arm/pixel-util.S Wed Mar 02 17:26:11 2016 +0530 +++ b/source/common/arm/pixel-util.S Thu Mar 03 11:42:18 2016 +0530 @@ -626,3 +626,55 @@ pop {r4, r5} bx lr endfunc + +function x265_pixel_planecopy_cp_neon + push {r4, r5, r6, r7} + ldr r4, [sp, #4 * 4] + ldr r5, [sp, #4 * 4 + 4] + ldr r12, [sp, #4 * 4 + 8] + vdup.8 q2, r12 + sub r5, #1 + +.loop_h: + mov r6, r0 + mov r12, r2 + eor r7, r7 +.loop_w: + vld1.u8 {q0}, [r6]! + vshl.u8 q0, q0, q2 + vst1.u8 {q0}, [r12]! + + add r7, #16 + cmp r7, r4 + blt .loop_w + + add r0, r1 + add r2, r3 + + subs r5, #1 + bgt .loop_h + +// handle last row + mov r5, r4 + lsr r5, #3 + +.loopW8: + vld1.u8 d0, [r0]! + vshl.u8 d0, d0, d4 + vst1.u8 d0, [r2]! 
+ subs r4, r4, #8 + subs r5, #1 + bgt .loopW8 + + mov r5,#8 + sub r5, r4 + sub r0, r5 + sub r2, r5 + vld1.u8 d0, [r0] + vshl.u8 d0, d0, d4 + vst1.u8 d0, [r2] + + pop {r4, r5, r6, r7} + bx lr +endfunc + diff -r 9cc9920bf82b -r 75a3948f28b6 source/common/arm/pixel.h --- a/source/common/arm/pixel.h Wed Mar 02 17:26:11 2016 +0530 +++ b/source/common/arm/pixel.h Thu Mar 03 11:42:18 2016 +0530 @@ -163,4 +163,6 @@ void x265_pixel_add_ps_16x16_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); void x265_pixel_add_ps_32x32_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); void x265_pixel_add_ps_64x64_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); + +void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift); #endif // ifndef X265_I386_PIXEL_ARM_H Thank you Regards Ramya On Fri, Mar 4, 2016 at 2:18 PM, Ramya Sriraman <[email protected]> wrote: # HG changeset patch # User Ramya Sriraman<[email protected]> # Date 1456985538 -19800 # Thu Mar 03 11:42:18 2016 +0530 # Node ID 299caedec2f38b9d9b658aace5c74ace36b6b324 # Parent 9cc9920bf82be1b43efd2a3628e28a3a78ab3b2f arm: Implement planecopy_cp NEON diff -r 9cc9920bf82b -r 299caedec2f3 source/common/arm/asm-primitives.cpp --- a/source/common/arm/asm-primitives.cpp Wed Mar 02 17:26:11 2016 +0530 +++ b/source/common/arm/asm-primitives.cpp Thu Mar 03 11:42:18 2016 +0530 @@ -33,6 +33,7 @@ #include "blockcopy8.h" #include "pixel.h" #include "pixel-util.h" +#include "ipfilter8.h" } namespace X265_NS { @@ -142,6 +143,9 @@ p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon); p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon); + // planecopy + p.planecopy_cp = PFX(pixel_planecopy_cp_neon); + // sad p.pu[LUMA_8x4].sad = PFX(pixel_sad_8x4_neon); p.pu[LUMA_8x8].sad = 
PFX(pixel_sad_8x8_neon); diff -r 9cc9920bf82b -r 299caedec2f3 source/common/arm/pixel-util.S --- a/source/common/arm/pixel-util.S Wed Mar 02 17:26:11 2016 +0530 +++ b/source/common/arm/pixel-util.S Thu Mar 03 11:42:18 2016 +0530 @@ -626,3 +626,57 @@ pop {r4, r5} bx lr endfunc + +function x265_pixel_planecopy_cp_neon + push {r4, r5, r6, r7} + ldr r4, [sp, #4 * 4] + ldr r5, [sp, #4 * 4 + 4] + ldr r12, [sp, #4 * 4 + 8] + vdup.8 q2, r12 + sub r5, #1 + +.loop_h: + mov r6, r0 + mov r12, r2 + eor r7, r7 +.loop_w: + vld1.u8 {q0}, [r6] + vshl.u8 q0, q0, q2 + vst1.u8 {q0}, [r12] + + add r12, #16 + add r6, #16 + add r7, #16 + cmp r7, r4 + blt .loop_w + + add r0, r1 + add r2, r3 + + subs r5, #1 + bgt .loop_h + +// handle last row + mov r5, r4 + lsr r5, #3 + +.loopW8: + vld1.u8 d0, [r0]! + vshl.u8 d0, d0, d4 + vst1.u8 d0, [r2]! + subs r4, r4, #8 + subs r5, #1 + bgt .loopW8 + + mov r5,#8 + sub r5, r4 + sub r0, r5 + sub r2, r5 + vld1.u8 d0, [r0] + vshl.u8 d0, d0, d4 + vst1.u8 d0, [r2] + + pop {r4, r5, r6, r7} + bx lr +endfunc + diff -r 9cc9920bf82b -r 299caedec2f3 source/common/arm/pixel.h --- a/source/common/arm/pixel.h Wed Mar 02 17:26:11 2016 +0530 +++ b/source/common/arm/pixel.h Thu Mar 03 11:42:18 2016 +0530 @@ -163,4 +163,6 @@ void x265_pixel_add_ps_16x16_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); void x265_pixel_add_ps_32x32_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); void x265_pixel_add_ps_64x64_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); + +void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift); #endif // ifndef X265_I386_PIXEL_ARM_H Thank you Regards Ramya On Fri, Mar 4, 2016 at 11:42 AM, Ramya Sriraman <[email protected]> wrote: Hi min, I made the #12 -> #4*3 correction. 
R0 is kept constant because, if I keep adding the number of bytes loaded by combining the increment with vld1.u8, then at the end of the loop, when I add r1, it will be r0+number_of_bytes+r1 and not the intended r0+r1. Also, this is basically an upShift primitive, so it might be useful for the 8-bit build as well. I will mail the patch with the modification to the mailing list based on your response. Thank you Regards Ramya On Fri, Mar 4, 2016 at 11:41 AM, Min Chen <[email protected]> wrote:
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
