# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1456136894 -19800 # Mon Feb 22 15:58:14 2016 +0530 # Node ID ed3dd1a26cb5801e306db8f1d4a52cd1f4d6620b # Parent 4a1b8f3c0c7385ff19fd61133e0af4464510e9aa arm: Implement pixel_ssd_s ARM NEON asm
diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Thu Feb 25 12:15:51 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Mon Feb 22 15:58:14 2016 +0530
@@ -42,6 +42,12 @@
 {
     if (cpuMask & X265_CPU_NEON)
     {
+        // ssd_s
+        p.cu[BLOCK_4x4].ssd_s   = PFX(pixel_ssd_s_4x4_neon);   // install the NEON ssd_s routine for each CU size, only when the NEON CPU flag is set
+        p.cu[BLOCK_8x8].ssd_s   = PFX(pixel_ssd_s_8x8_neon);
+        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16x16_neon);
+        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32x32_neon); // NOTE(review): no 64x64 ssd_s entry is assigned here although pixel.h declares one - confirm intended
+
         // sse_ss
         p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_neon);
         p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_neon);
diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/pixel.h
--- a/source/common/arm/pixel.h Thu Feb 25 12:15:51 2016 +0530
+++ b/source/common/arm/pixel.h Mon Feb 22 15:58:14 2016 +0530
@@ -123,6 +123,12 @@
 sse_t x265_pixel_sse_ss_32x32_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
 sse_t x265_pixel_sse_ss_64x64_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_ssd_s_4x4_neon(const int16_t* a, intptr_t dstride);   // ssd_s family: one int16 block pointer plus its stride; bodies are in ssd-a.S
+sse_t x265_pixel_ssd_s_8x8_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_16x16_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_32x32_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_64x64_neon(const int16_t* a, intptr_t dstride); // NOTE(review): no definition of this symbol appears in this patch - confirm it exists elsewhere
+
 void x265_pixel_sub_ps_4x4_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
 void x265_pixel_sub_ps_8x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
 void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/ssd-a.S
--- a/source/common/arm/ssd-a.S Thu Feb 25 12:15:51 2016 +0530
+++ b/source/common/arm/ssd-a.S
Mon Feb 22 15:58:14 2016 +0530
@@ -371,4 +371,99 @@
     bx lr
 endfunc
+function x265_pixel_ssd_s_4x4_neon          // sum of squares of a 4x4 int16 block; per the pixel.h prototype r0 = a, r1 = dstride; result in r0
+    add             r1, r1                  // double dstride (presumably int16 elements -> bytes; confirm against callers)
+    vld1.s16        {d4}, [r0], r1          // load rows 0..3, four int16 each; r0 post-increments by r1
+    vld1.s16        {d5}, [r0], r1
+    vld1.s16        {d6}, [r0], r1
+    vld1.s16        {d7}, [r0]              // last row: no pointer update needed
+    vmull.s16       q0, d4, d4              // q0 = row0 squared, widened to 32-bit lanes
+    vmull.s16       q1, d5, d5              // q1 = row1 squared (second accumulator)
+    vmlal.s16       q0, d6, d6              // q0 += row2 squared
+    vmlal.s16       q1, d7, d7              // q1 += row3 squared
+    vadd.s32        q0, q1                  // merge the two accumulators
+    vadd.s32        d0, d0, d1              // fold four lanes into two
+    vpadd.s32       d0, d0, d0              // fold two lanes into one
+    vmov.32         r0, d0[0]               // move the 32-bit total into the return register
+    bx              lr
+endfunc
+function x265_pixel_ssd_s_8x8_neon          // sum of squares of an 8x8 int16 block; r0 = a, r1 = dstride; result in r0
+    add             r1, r1                  // double dstride (presumably elements -> bytes)
+    vld1.s16        {q8}, [r0], r1          // one row is 8 int16 = one q register
+    vld1.s16        {q9}, [r0], r1
+    vmull.s16       q0, d16, d16            // first two rows use vmull, which initialises q0/q1 without a separate clear
+    vmull.s16       q1, d17, d17
+    vmlal.s16       q0, d18, d18
+    vmlal.s16       q1, d19, d19
+.rept 3                                     // remaining six rows, two rows per repetition
+    vld1.s16        {q8}, [r0], r1
+    vld1.s16        {q9}, [r0], r1
+    vmlal.s16       q0, d16, d16            // low halves accumulate into q0, high halves into q1
+    vmlal.s16       q1, d17, d17
+    vmlal.s16       q0, d18, d18
+    vmlal.s16       q1, d19, d19
+.endr
+    vadd.s32        q0, q1                  // merge accumulators, then reduce the lanes to a single 32-bit sum
+    vadd.s32        d0, d0, d1
+    vpadd.s32       d0, d0, d0
+    vmov.32         r0, d0[0]               // return value
+    bx              lr
+endfunc
+
+function x265_pixel_ssd_s_16x16_neon        // sum of squares of a 16x16 int16 block; r0 = a, r1 = dstride; result in r0
+    add             r1, r1                  // double dstride (presumably elements -> bytes)
+    mov             r12, #4                 // loop counter: 4 iterations x 4 rows = 16 rows
+    veor.u8         q0, q0                  // clear both 32-bit accumulators
+    veor.u8         q1, q1
+
+.loop_ssd_s_16:
+    subs            r12, #1                 // sets the flags tested by bne below; the NEON loads and multiplies in between do not alter them
+.rept 2                                     // two rows per repetition
+    vld1.s16        {q8-q9}, [r0], r1       // one full row: 16 int16 in q8-q9
+    vld1.s16        {q10-q11}, [r0], r1     // next row in q10-q11
+    vmlal.s16       q0, d16, d16            // alternate q0/q1 so consecutive multiply-accumulates target different registers
+    vmlal.s16       q1, d17, d17
+    vmlal.s16       q0, d18, d18
+    vmlal.s16       q1, d19, d19
+    vmlal.s16       q0, d20, d20
+    vmlal.s16       q1, d21, d21
+    vmlal.s16       q0, d22, d22
+    vmlal.s16       q1, d23, d23
+.endr
+    bne             .loop_ssd_s_16
+    vadd.s32        q0, q1                  // merge accumulators, then reduce the lanes to a single 32-bit sum
+    vadd.s32        d0, d0, d1
+    vpadd.s32       d0, d0, d0
+    vmov.32         r0, d0[0]               // return value
+    bx              lr
+endfunc
+
+function x265_pixel_ssd_s_32x32_neon        // sum of squares of a 32x32 int16 block; r0 = a, r1 = dstride; result in r0
+    add             r1, r1                  // double dstride (presumably elements -> bytes)
+    sub             r1, #32                 // the first load of each row advances r0 by 32 bytes, so take that out of the row step
+    mov             r12, #8                 // loop counter: 8 iterations x 4 rows = 32 rows
+    veor.u8         q0, q0                  // clear both 32-bit accumulators
+    veor.u8         q1, q1
+
+.loop_ssd_s_32:
+    subs            r12, #1                 // sets the flags tested by bne below; the NEON instructions in between do not alter them
+.rept 4                                     // one 32-element row per repetition
+    vld1.s16        {q8-q9}, [r0]!          // first 16 int16 of the row, with writeback (+32 bytes)
+    vld1.s16        {q10-q11}, [r0], r1     // last 16 int16, then step to the start of the next row
+    vmlal.s16       q0, d16, d16
+    vmlal.s16       q1, d17, d17
+    vmlal.s16       q0, d18, d18
+    vmlal.s16       q1, d19, d19
+    vmlal.s16       q0, d20, d20
+    vmlal.s16       q1, d21, d21
+    vmlal.s16       q0, d22, d22
+    vmlal.s16       q1, d23, d23
+.endr
+    bne             .loop_ssd_s_32
+    vadd.s32        q0, q1                  // NOTE(review): all sums are kept in 32-bit lanes; whether that can wrap depends on the input range, which is not visible here - confirm
+    vadd.s32        d0, d0, d1
+    vpadd.s32       d0, d0, d0
+    vmov.32         r0, d0[0]               // return value
+    bx              lr
+endfunc
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel