# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1455794242 -19800
#      Thu Feb 18 16:47:22 2016 +0530
# Node ID 5e4593ef30cc4bccc5eec2a0109b8dff397e5c93
# Parent  b31fa1a4ef43697e163d17dda0f4650de45d6ff9
arm: Implement pixel_sse_pp ARM NEON asm
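Note (not part of the patch): sse_pp computes the sum of squared differences
between an encoded block and a reference block. A minimal scalar sketch of the
behaviour the NEON kernels below are expected to match is given here; it
assumes x265's pixel/sse_t typedefs are in scope, and the helper name
sse_pp_ref and its size parameter are illustrative only, not part of x265's API.

    #include <stdint.h>

    /* Scalar reference for the sse_pp primitive; 'size' is the block
     * width/height (4, 8, 16, 32 or 64). Illustrative sketch only. */
    static sse_t sse_pp_ref(const pixel* pix1, intptr_t stride_pix1,
                            const pixel* pix2, intptr_t stride_pix2, int size)
    {
        sse_t sum = 0;
        for (int y = 0; y < size; y++)
        {
            for (int x = 0; x < size; x++)
            {
                int d = pix1[x] - pix2[x];   /* per-pixel difference */
                sum += d * d;                /* accumulate squared difference */
            }
            pix1 += stride_pix1;
            pix2 += stride_pix2;
        }
        return sum;
    }

The NEON versions follow the same data flow: widen the byte difference
(vsubl.u8), square-accumulate into 32-bit lanes (vmull/vmlal.s16), then reduce
the vector accumulator with vadd/vpadd before returning the scalar in r0.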
diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Thu Feb 18 16:37:01 2016 +0530
+++ b/source/common/CMakeLists.txt Thu Feb 18 16:47:22 2016 +0530
@@ -89,7 +89,7 @@
     set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
 
     # add ARM assembly/intrinsic files here
-    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S)
+    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S)
     set(VEC_PRIMITIVES)
 
     set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Thu Feb 18 16:37:01 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Thu Feb 18 16:47:22 2016 +0530
@@ -42,6 +42,13 @@
 {
     if (cpuMask & X265_CPU_NEON)
     {
+        // sse_pp
+        p.cu[BLOCK_4x4].sse_pp = PFX(pixel_sse_pp_4x4_neon);
+        p.cu[BLOCK_8x8].sse_pp = PFX(pixel_sse_pp_8x8_neon);
+        p.cu[BLOCK_16x16].sse_pp = PFX(pixel_sse_pp_16x16_neon);
+        p.cu[BLOCK_32x32].sse_pp = PFX(pixel_sse_pp_32x32_neon);
+        p.cu[BLOCK_64x64].sse_pp = PFX(pixel_sse_pp_64x64_neon);
+
         // pixel_var
         p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_neon);
         p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon);
diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/arm/pixel.h
--- a/source/common/arm/pixel.h Thu Feb 18 16:37:01 2016 +0530
+++ b/source/common/arm/pixel.h Thu Feb 18 16:47:22 2016 +0530
@@ -111,4 +111,10 @@
 void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
 void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
 
+sse_t x265_pixel_sse_pp_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+
 #endif // ifndef X265_I386_PIXEL_ARM_H
diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/arm/ssd-a.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/arm/ssd-a.S Thu Feb 18 16:47:22 2016 +0530
@@ -0,0 +1,196 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+
+.text
+
+
+function x265_pixel_sse_pp_4x4_neon
+    vld1.32 {d16[]}, [r0], r1
+    vld1.32 {d17[]}, [r2], r3
+    vsubl.u8 q2, d16, d17
+    vld1.32 {d16[]}, [r0], r1
+    vmull.s16 q0, d4, d4
+    vld1.32 {d17[]}, [r2], r3
+
+    vsubl.u8 q2, d16, d17
+    vld1.32 {d16[]}, [r0], r1
+    vmlal.s16 q0, d4, d4
+    vld1.32 {d17[]}, [r2], r3
+
+    vsubl.u8 q2, d16, d17
+    vld1.32 {d16[]}, [r0], r1
+    vmlal.s16 q0, d4, d4
+    vld1.32 {d17[]}, [r2], r3
+
+    vsubl.u8 q2, d16, d17
+    vmlal.s16 q0, d4, d4
+    vadd.s32 d0, d0, d1
+    vpadd.s32 d0, d0, d0
+    vmov.32 r0, d0[0]
+    bx lr
+endfunc
+
+function x265_pixel_sse_pp_8x8_neon
+    vld1.64 {d16}, [r0], r1
+    vld1.64 {d17}, [r2], r3
+    vsubl.u8 q2, d16, d17
+    vld1.64 {d16}, [r0], r1
+    vmull.s16 q0, d4, d4
+    vmlal.s16 q0, d5, d5
+    vld1.64 {d17}, [r2], r3
+
+.rept 6
+    vsubl.u8 q2, d16, d17
+    vld1.64 {d16}, [r0], r1
+    vmlal.s16 q0, d4, d4
+    vmlal.s16 q0, d5, d5
+    vld1.64 {d17}, [r2], r3
+.endr
+    vsubl.u8 q2, d16, d17
+    vmlal.s16 q0, d4, d4
+    vmlal.s16 q0, d5, d5
+    vadd.s32 d0, d0, d1
+    vpadd.s32 d0, d0, d0
+    vmov.32 r0, d0[0]
+    bx lr
+endfunc
+
+function x265_pixel_sse_pp_16x16_neon
+    vld1.64 {d16-d17}, [r0], r1
+    vld1.64 {d18-d19}, [r2], r3
+    vsubl.u8 q2, d16, d18
+    vsubl.u8 q3, d17, d19
+    vld1.64 {d16-d17}, [r0], r1
+    vmull.s16 q0, d4, d4
+    vmlal.s16 q0, d5, d5
+    vld1.64 {d18-d19}, [r2], r3
+    vmlal.s16 q0, d6, d6
+    vmlal.s16 q0, d7, d7
+
+.rept 14
+    vsubl.u8 q2, d16, d18
+    vsubl.u8 q3, d17, d19
+    vld1.64 {d16-d17}, [r0], r1
+    vmlal.s16 q0, d4, d4
+    vmlal.s16 q0, d5, d5
+    vld1.64 {d18-d19}, [r2], r3
+    vmlal.s16 q0, d6, d6
+    vmlal.s16 q0, d7, d7
+.endr
+    vsubl.u8 q2, d16, d18
+    vsubl.u8 q3, d17, d19
+    vmlal.s16 q0, d4, d4
+    vmlal.s16 q0, d5, d5
+    vmlal.s16 q0, d6, d6
+    vmlal.s16 q0, d7, d7
+    vadd.s32 d0, d0, d1
+    vpadd.s32 d0, d0, d0
+    vmov.32 r0, d0[0]
+    bx lr
+endfunc
+
+function x265_pixel_sse_pp_32x32_neon
+    mov r12, #8
+    veor.u8 q0, q0
+    veor.u8 q1, q1
+
+.loop_sse_pp_32:
+    subs r12, #1
+.rept 4
+    vld1.64 {q8-q9}, [r0], r1
+    vld1.64 {q10-q11}, [r2], r3
+    vsubl.u8 q2, d16, d20
+    vsubl.u8 q3, d17, d21
+    vsubl.u8 q12, d18, d22
+    vsubl.u8 q13, d19, d23
+    vmlal.s16 q0, d4, d4
+    vmlal.s16 q1, d5, d5
+    vmlal.s16 q0, d6, d6
+    vmlal.s16 q1, d7, d7
+    vmlal.s16 q0, d24, d24
+    vmlal.s16 q1, d25, d25
+    vmlal.s16 q0, d26, d26
+    vmlal.s16 q1, d27, d27
+.endr
+    bne .loop_sse_pp_32
+    vadd.s32 q0, q1
+    vadd.s32 d0, d0, d1
+    vpadd.s32 d0, d0, d0
+    vmov.32 r0, d0[0]
+    bx lr
+endfunc
+
+function x265_pixel_sse_pp_64x64_neon
+    sub r1, #32
+    sub r3, #32
+    mov r12, #16
+    veor.u8 q0, q0
+    veor.u8 q1, q1
+
+.loop_sse_pp_64:
+    subs r12, #1
+.rept 4
+    vld1.64 {q8-q9}, [r0]!
+    vld1.64 {q10-q11}, [r2]!
+    vsubl.u8 q2, d16, d20
+    vsubl.u8 q3, d17, d21
+    vsubl.u8 q12, d18, d22
+    vsubl.u8 q13, d19, d23
+    vmlal.s16 q0, d4, d4
+    vmlal.s16 q1, d5, d5
+    vmlal.s16 q0, d6, d6
+    vmlal.s16 q1, d7, d7
+    vmlal.s16 q0, d24, d24
+    vmlal.s16 q1, d25, d25
+    vmlal.s16 q0, d26, d26
+    vmlal.s16 q1, d27, d27
+
+    vld1.64 {q8-q9}, [r0], r1
+    vld1.64 {q10-q11}, [r2], r3
+    vsubl.u8 q2, d16, d20
+    vsubl.u8 q3, d17, d21
+    vsubl.u8 q12, d18, d22
+    vsubl.u8 q13, d19, d23
+    vmlal.s16 q0, d4, d4
+    vmlal.s16 q1, d5, d5
+    vmlal.s16 q0, d6, d6
+    vmlal.s16 q1, d7, d7
+    vmlal.s16 q0, d24, d24
+    vmlal.s16 q1, d25, d25
+    vmlal.s16 q0, d26, d26
+    vmlal.s16 q1, d27, d27
+.endr
+    bne .loop_sse_pp_64
+    vadd.s32 q0, q1
+    vadd.s32 d0, d0, d1
+    vpadd.s32 d0, d0, d0
+    vmov.32 r0, d0[0]
+    bx lr
+endfunc

_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel