# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1456382751 -19800
#      Thu Feb 25 12:15:51 2016 +0530
# Node ID 4a1b8f3c0c7385ff19fd61133e0af4464510e9aa
# Parent  45c0dbd43dec24608199362a86bfba6ef91cacca
arm: Implement pixel_sse_ss ARM NEON asm
diff -r 45c0dbd43dec -r 4a1b8f3c0c73 source/common/arm/asm-primitives.cpp --- a/source/common/arm/asm-primitives.cpp Mon Feb 22 18:22:37 2016 +0530 +++ b/source/common/arm/asm-primitives.cpp Thu Feb 25 12:15:51 2016 +0530 @@ -42,6 +42,13 @@ { if (cpuMask & X265_CPU_NEON) { + // sse_ss + p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_neon); + p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_neon); + p.cu[BLOCK_16x16].sse_ss = PFX(pixel_sse_ss_16x16_neon); + p.cu[BLOCK_32x32].sse_ss = PFX(pixel_sse_ss_32x32_neon); + p.cu[BLOCK_64x64].sse_ss = PFX(pixel_sse_ss_64x64_neon); + // pixel_sub_ps p.cu[BLOCK_4x4].sub_ps = PFX(pixel_sub_ps_4x4_neon); p.cu[BLOCK_8x8].sub_ps = PFX(pixel_sub_ps_8x8_neon); diff -r 45c0dbd43dec -r 4a1b8f3c0c73 source/common/arm/pixel.h --- a/source/common/arm/pixel.h Mon Feb 22 18:22:37 2016 +0530 +++ b/source/common/arm/pixel.h Thu Feb 25 12:15:51 2016 +0530 @@ -117,6 +117,12 @@ sse_t x265_pixel_sse_pp_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); sse_t x265_pixel_sse_pp_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_ss_4x4_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_ss_8x8_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_ss_16x16_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_ss_32x32_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); +sse_t x265_pixel_sse_ss_64x64_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); + void x265_pixel_sub_ps_4x4_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); void x265_pixel_sub_ps_8x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t 
sstride0, intptr_t sstride1); void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); diff -r 45c0dbd43dec -r 4a1b8f3c0c73 source/common/arm/ssd-a.S --- a/source/common/arm/ssd-a.S Mon Feb 22 18:22:37 2016 +0530 +++ b/source/common/arm/ssd-a.S Thu Feb 25 12:15:51 2016 +0530 @@ -194,3 +194,181 @@ vmov.32 r0, d0[0] bx lr endfunc + +function x265_pixel_sse_ss_4x4_neon + add r1, r1 + add r3, r3 + + vld1.s16 {d16}, [r0], r1 + vld1.s16 {d18}, [r2], r3 + vsub.s16 q2, q8, q9 + vld1.s16 {d16}, [r0], r1 + vmull.s16 q0, d4, d4 + vld1.s16 {d18}, [r2], r3 + + vsub.s16 q2, q8, q9 + vld1.s16 {d16}, [r0], r1 + vmlal.s16 q0, d4, d4 + vld1.s16 {d18}, [r2], r3 + + vsub.s16 q2, q8, q9 + vld1.s16 {d16}, [r0], r1 + vmlal.s16 q0, d4, d4 + vld1.s16 {d18}, [r2], r3 + + vsub.s16 q2, q8, q9 + vmlal.s16 q0, d4, d4 + + vadd.s32 d0, d0, d1 + vpadd.s32 d0, d0, d0 + vmov.32 r0, d0[0] + bx lr +endfunc + +function x265_pixel_sse_ss_8x8_neon + add r1, r1 + add r3, r3 + + vld1.s16 {q8}, [r0], r1 + vld1.s16 {q9}, [r2], r3 + vsub.s16 q8, q9 + vmull.s16 q0, d16, d16 + vmull.s16 q1, d17, d17 + +.rept 7 + vld1.s16 {q8}, [r0], r1 + vld1.s16 {q9}, [r2], r3 + vsub.s16 q8, q9 + vmlal.s16 q0, d16, d16 + vmlal.s16 q1, d17, d17 +.endr + vadd.s32 q0, q1 + vadd.s32 d0, d0, d1 + vpadd.s32 d0, d0, d0 + vmov.32 r0, d0[0] + bx lr +endfunc + +function x265_pixel_sse_ss_16x16_neon + add r1, r1 + add r3, r3 + + mov r12, #4 + veor.u8 q0, q0 + veor.u8 q1, q1 + +.loop_sse_ss_16: + subs r12, #1 +.rept 4 + vld1.s16 {q8-q9}, [r0], r1 + vld1.s16 {q10-q11}, [r2], r3 + vsub.s16 q8, q10 + vsub.s16 q9, q11 + vmlal.s16 q0, d16, d16 + vmlal.s16 q1, d17, d17 + vmlal.s16 q0, d18, d18 + vmlal.s16 q1, d19, d19 +.endr + bne .loop_sse_ss_16 + vadd.s32 q0, q1 + vadd.s32 d0, d0, d1 + vpadd.s32 d0, d0, d0 + vmov.32 r0, d0[0] + bx lr +endfunc + +function x265_pixel_sse_ss_32x32_neon + add r1, r1 + add r3, r3 + sub r1, #32 + sub r3, #32 + mov r12, #8 + veor.u8 q0, 
q0 + veor.u8 q1, q1 + +.loop_sse_ss_32: + subs r12, #1 +.rept 4 + vld1.s16 {q8-q9}, [r0]! + vld1.s16 {q10-q11}, [r2]! + vsub.s16 q8, q10 + vsub.s16 q9, q11 + vmlal.s16 q0, d16, d16 + vmlal.s16 q1, d17, d17 + vmlal.s16 q0, d18, d18 + vmlal.s16 q1, d19, d19 + + vld1.s16 {q8-q9}, [r0], r1 + vld1.s16 {q10-q11}, [r2], r3 + vsub.s16 q8, q10 + vsub.s16 q9, q11 + vmlal.s16 q0, d16, d16 + vmlal.s16 q1, d17, d17 + vmlal.s16 q0, d18, d18 + vmlal.s16 q1, d19, d19 +.endr + bne .loop_sse_ss_32 + vadd.s32 q0, q1 + vadd.s32 d0, d0, d1 + vpadd.s32 d0, d0, d0 + vmov.32 r0, d0[0] + bx lr +endfunc + +function x265_pixel_sse_ss_64x64_neon + add r1, r1 + add r3, r3 + sub r1, #96 + sub r3, #96 + mov r12, #32 + veor.u8 q0, q0 + veor.u8 q1, q1 + +.loop_sse_ss_64: + subs r12, #1 +.rept 2 + vld1.s16 {q8-q9}, [r0]! + vld1.s16 {q10-q11}, [r2]! + vsub.s16 q8, q10 + vsub.s16 q9, q11 + vmlal.s16 q0, d16, d16 + vmlal.s16 q1, d17, d17 + vmlal.s16 q0, d18, d18 + vmlal.s16 q1, d19, d19 + + vld1.s16 {q8-q9}, [r0]! + vld1.s16 {q10-q11}, [r2]! + vsub.s16 q8, q10 + vsub.s16 q9, q11 + vmlal.s16 q0, d16, d16 + vmlal.s16 q1, d17, d17 + vmlal.s16 q0, d18, d18 + vmlal.s16 q1, d19, d19 + + vld1.s16 {q8-q9}, [r0]! + vld1.s16 {q10-q11}, [r2]! + vsub.s16 q8, q10 + vsub.s16 q9, q11 + vmlal.s16 q0, d16, d16 + vmlal.s16 q1, d17, d17 + vmlal.s16 q0, d18, d18 + vmlal.s16 q1, d19, d19 + + vld1.s16 {q8-q9}, [r0], r1 + vld1.s16 {q10-q11}, [r2], r3 + vsub.s16 q8, q10 + vsub.s16 q9, q11 + vmlal.s16 q0, d16, d16 + vmlal.s16 q1, d17, d17 + vmlal.s16 q0, d18, d18 + vmlal.s16 q1, d19, d19 +.endr + bne .loop_sse_ss_64 + vadd.s32 q0, q1 + vadd.s32 d0, d0, d1 + vpadd.s32 d0, d0, d0 + vmov.32 r0, d0[0] + bx lr +endfunc + + _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel