Please put this patch on hold. The smoke test has not yet been set up on our new ARM board. I will reply on this thread as soon as I have finished running the smoke test for this patch.
On Tue, Mar 15, 2016 at 5:42 PM, <[email protected]> wrote: > # HG changeset patch > # User Radhakrishnan VR <[email protected]> > # Date 1458043815 -19800 > # Tue Mar 15 17:40:15 2016 +0530 > # Node ID e5859c0bbdd9a5b12ce3a523b3857641bda457ea > # Parent 4a2f94a592511afabd434fc6cf02a469b6d65091 > arm: Implement count_nonzero ARM NEON > > diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/asm-primitives.cpp > --- a/source/common/arm/asm-primitives.cpp Wed Mar 09 14:34:06 2016 > +0530 > +++ b/source/common/arm/asm-primitives.cpp Tue Mar 15 17:40:15 2016 > +0530 > @@ -43,6 +43,12 @@ > { > if (cpuMask & X265_CPU_NEON) > { > + // count nonzero > + p.cu[BLOCK_4x4].count_nonzero = PFX(count_nonzero_4_neon); > + p.cu[BLOCK_8x8].count_nonzero = PFX(count_nonzero_8_neon); > + p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16_neon); > + p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32_neon); > + > //scale2D_64to32 > p.scale2D_64to32 = PFX(scale2D_64to32_neon); > > diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.S > --- a/source/common/arm/blockcopy8.S Wed Mar 09 14:34:06 2016 +0530 > +++ b/source/common/arm/blockcopy8.S Tue Mar 15 17:40:15 2016 +0530 > @@ -457,3 +457,92 @@ > rsb r0, r12, #1024 > bx lr > endfunc > + > +// int count_nonzero_c(const int16_t* quantCoeff) > +function x265_count_nonzero_4_neon > + veor d4, d4 > +.rept 2 > + vld1.s16 {d0}, [r0]! > + vld1.s16 {d1}, [r0]! > + vclz.i16 d2, d0 > + vclz.i16 d3, d1 > + vshr.u16 q1, #4 > + vadd.u16 d2, d3 > + vadd.u16 d4, d2 > +.endr > + vpadd.u16 d4, d4 > + vpadd.u16 d4, d4 > + vmov.u16 r12, d4[0] > + rsb r0, r12, #16 > + bx lr > +endfunc > + > +function x265_count_nonzero_8_neon > + veor q8, q8 > +.rept 4 > + vld1.s16 {q0}, [r0]! > + vld1.s16 {q1}, [r0]! 
> + vclz.i16 q2, q0 > + vclz.i16 q3, q1 > + vshr.u16 q2, #4 > + vshr.u16 q3, #4 > + vadd.u16 q2, q3 > + vadd.u16 q8, q2 > +.endr > + vadd.u16 d16, d17 > + vpadd.u16 d16, d16 > + vpadd.u16 d16, d16 > + vmov.u16 r12, d16[0] > + rsb r0, r12, #64 > + bx lr > +endfunc > + > +function x265_count_nonzero_16_neon > + veor q2, q2 > +.rept 16 > + vld1.s16 {q0, q1}, [r0]! > + vclz.i16 q8, q0 > + vclz.i16 q9, q1 > + vshr.u16 q8, #4 > + vshr.u16 q9, #4 > + vadd.u16 q8, q9 > + vadd.u16 q2, q8 > +.endr > + vadd.u16 d4, d5 > + vpadd.u16 d4, d4 > + vpadd.u16 d4, d4 > + > + vmov.u16 r12, d4[0] > + rsb r0, r12, #256 > + bx lr > +endfunc > + > +function x265_count_nonzero_32_neon > + veor q12, q12 > +.rept 32 > + vld1.s16 {q0, q1}, [r0]! > + vld1.s16 {q2, q3}, [r0]! > + > + vclz.i16 q8, q0 > + vclz.i16 q9, q1 > + vclz.i16 q10, q2 > + vclz.i16 q11, q3 > + > + vshr.u16 q8, #4 > + vshr.u16 q9, #4 > + vshr.u16 q10, #4 > + vshr.u16 q11, #4 > + > + vadd.u16 q8, q9 > + vadd.u16 q10, q11 > + vadd.u16 q8, q10 > + vadd.u16 q12, q8 > +.endr > + vadd.u16 d24, d25 > + vpadd.u16 d24, d24 > + vpadd.u16 d24, d24 > + > + vmov.u16 r12, d24[0] > + rsb r0, r12, #1024 > + bx lr > +endfunc > diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.h > --- a/source/common/arm/blockcopy8.h Wed Mar 09 14:34:06 2016 +0530 > +++ b/source/common/arm/blockcopy8.h Tue Mar 15 17:40:15 2016 +0530 > @@ -84,4 +84,9 @@ > uint32_t x265_copy_cnt_8_neon(int16_t* coeff, const int16_t* residual, > intptr_t resiStride); > uint32_t x265_copy_cnt_16_neon(int16_t* coeff, const int16_t* residual, > intptr_t resiStride); > uint32_t x265_copy_cnt_32_neon(int16_t* coeff, const int16_t* residual, > intptr_t resiStride); > + > +int x265_count_nonzero_4_neon(const int16_t* quantCoeff); > +int x265_count_nonzero_8_neon(const int16_t* quantCoeff); > +int x265_count_nonzero_16_neon(const int16_t* quantCoeff); > +int x265_count_nonzero_32_neon(const int16_t* quantCoeff); > #endif // ifndef X265_I386_PIXEL_ARM_H >
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
