Just a small piece of advice. Register allocation rule:
1. R0-R3 may be used freely.
2. If you need one more register, R12 is free.
3. If you need yet another one, R14 (LR) may be used after pushing it to the stack.
4. Only then use R4-R11, with a stack save/restore.
Your patch uses more registers than necessary because it does not apply the rule above. At 2016-04-20 19:15:36,[email protected] wrote: ># HG changeset patch ># User Radhakrishnan VR <[email protected]> ># Date 1461145693 -19800 ># Wed Apr 20 15:18:13 2016 +0530 ># Node ID eed7e06770463bb86c28dade1f0e965215028064 ># Parent a28ba6131b58829d04ffc04b9ac2c67bf850eee4 >arm: Implement sub_ps chroma ARM NEON > >diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/asm-primitives.cpp >--- a/source/common/arm/asm-primitives.cpp Tue Apr 19 12:12:00 2016 +0530 >+++ b/source/common/arm/asm-primitives.cpp Wed Apr 20 15:18:13 2016 +0530 >@@ -446,6 +446,16 @@ > p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon); > p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_neon); > >+ // chroma sub_ps >+ p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sub_ps = >PFX(pixel_sub_ps_4x4_neon); >+ p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sub_ps = >PFX(pixel_sub_ps_8x8_neon); >+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sub_ps = >PFX(pixel_sub_ps_16x16_neon); >+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = >PFX(pixel_sub_ps_32x32_neon); >+ p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sub_ps = >PFX(pixel_sub_ps_4x8_neon); >+ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps = >PFX(pixel_sub_ps_8x16_neon); >+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = >PFX(pixel_sub_ps_16x32_neon); >+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = >PFX(pixel_sub_ps_32x64_neon); >+ > // calc_Residual > p.cu[BLOCK_4x4].calcresidual = PFX(getResidual4_neon); > p.cu[BLOCK_8x8].calcresidual = PFX(getResidual8_neon); >diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/pixel-util.S >--- a/source/common/arm/pixel-util.S Tue Apr 19 12:12:00 2016 +0530 >+++ b/source/common/arm/pixel-util.S Wed Apr 20 15:18:13 2016 +0530 >@@ -439,6 +439,94 @@ > bx lr > endfunc > >+// chroma sub_ps >+function x265_pixel_sub_ps_4x8_neon >+ push {r4} >+ lsl r1, r1, #1 >+ ldr r4, [sp, #4] >+ ldr r12, [sp, #8] >+.rept 4 >+ vld1.u8 {d0}, [r2], 
r4 >+ vld1.u8 {d1}, [r3], r12 >+ vld1.u8 {d2}, [r2], r4 >+ vld1.u8 {d3}, [r3], r12 >+ vsubl.u8 q2, d0, d1 >+ vsubl.u8 q3, d2, d3 >+ vst1.s16 {d4}, [r0], r1 >+ vst1.s16 {d6}, [r0], r1 >+.endr >+ pop {r4} >+ bx lr >+endfunc >+ >+function x265_pixel_sub_ps_8x16_neon >+ push {r4} >+ lsl r1, r1, #1 >+ ldr r4, [sp, #4] >+ ldr r12, [sp, #8] >+.rept 8 >+ vld1.u8 {d0}, [r2], r4 >+ vld1.u8 {d1}, [r3], r12 >+ vld1.u8 {d2}, [r2], r4 >+ vld1.u8 {d3}, [r3], r12 >+ vsubl.u8 q2, d0, d1 >+ vsubl.u8 q3, d2, d3 >+ vst1.s16 {q2}, [r0], r1 >+ vst1.s16 {q3}, [r0], r1 >+.endr >+ pop {r4} >+ bx lr >+endfunc >+ >+function x265_pixel_sub_ps_16x32_neon >+ push {r4, r5} >+ lsl r1, r1, #1 >+ ldr r4, [sp, #8] >+ ldr r12, [sp, #12] >+ mov r5, #4 >+loop_sub_16x32: >+ subs r5, r5, #1 >+.rept 4 >+ vld1.u8 {q0}, [r2], r4 >+ vld1.u8 {q1}, [r3], r12 >+ vld1.u8 {q2}, [r2], r4 >+ vld1.u8 {q3}, [r3], r12 >+ vsubl.u8 q8, d0, d2 >+ vsubl.u8 q9, d1, d3 >+ vsubl.u8 q10, d4, d6 >+ vsubl.u8 q11, d5, d7 >+ vst1.s16 {q8, q9}, [r0], r1 >+ vst1.s16 {q10, q11}, [r0], r1 >+.endr >+ bne loop_sub_16x32 >+ pop {r4, r5} >+ bx lr >+endfunc >+ >+function x265_pixel_sub_ps_32x64_neon >+ push {r4, r5} >+ lsl r1, r1, #1 >+ ldr r4, [sp, #8] >+ ldr r12, [sp, #12] >+ sub r1, #32 >+ mov r5, #16 >+loop_sub_32x64: >+ subs r5, r5, #1 >+.rept 4 >+ vld1.u8 {q0, q1}, [r2], r4 >+ vld1.u8 {q2, q3}, [r3], r12 >+ vsubl.u8 q8, d0, d4 >+ vsubl.u8 q9, d1, d5 >+ vsubl.u8 q10, d2, d6 >+ vsubl.u8 q11, d3, d7 >+ vst1.s16 {q8, q9}, [r0]! 
>+ vst1.s16 {q10, q11}, [r0], r1 >+.endr >+ bne loop_sub_32x64 >+ pop {r4, r5} >+ bx lr >+endfunc >+ > // void x265_pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, > const int16_t* b1, intptr_t sstride0, intptr_t sstride1); > function x265_pixel_add_ps_4x4_neon > push {r4} >diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/pixel.h >--- a/source/common/arm/pixel.h Tue Apr 19 12:12:00 2016 +0530 >+++ b/source/common/arm/pixel.h Wed Apr 20 15:18:13 2016 +0530 >@@ -157,6 +157,10 @@ > void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* > b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); > void x265_pixel_sub_ps_32x32_neon(int16_t* a, intptr_t dstride, const pixel* > b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); > void x265_pixel_sub_ps_64x64_neon(int16_t* a, intptr_t dstride, const pixel* > b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); >+void x265_pixel_sub_ps_4x8_neon(int16_t* a, intptr_t dstride, const pixel* >b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); >+void x265_pixel_sub_ps_8x16_neon(int16_t* a, intptr_t dstride, const pixel* >b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); >+void x265_pixel_sub_ps_16x32_neon(int16_t* a, intptr_t dstride, const pixel* >b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); >+void x265_pixel_sub_ps_32x64_neon(int16_t* a, intptr_t dstride, const pixel* >b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); > > void x265_pixel_add_ps_4x4_neon(pixel* a, intptr_t dstride, const pixel* b0, > const int16_t* b1, intptr_t sstride0, intptr_t sstride1); > void x265_pixel_add_ps_8x8_neon(pixel* a, intptr_t dstride, const pixel* b0, > const int16_t* b1, intptr_t sstride0, intptr_t sstride1); >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
