# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1456831820 -19800
#      Tue Mar 01 17:00:20 2016 +0530
# Node ID 61e51faf9e7ee1c8056ac2f66cf51da104bfa106
# Parent  79c00b9bc2b81afef2e41526fc3c390528f3174c
arm: Implement filterPixelToShort ARM NEON asm

diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Tue Mar 01 12:18:18 2016 +0530
+++ b/source/common/CMakeLists.txt	Tue Mar 01 17:00:20 2016 +0530
@@ -89,7 +89,7 @@
     set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
 
     # add ARM assembly/intrinsic files here
-    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S)
+    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S)
     set(VEC_PRIMITIVES)
 
     set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp	Tue Mar 01 12:18:18 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp	Tue Mar 01 17:00:20 2016 +0530
@@ -33,6 +33,7 @@
 #include "blockcopy8.h"
 #include "pixel.h"
 #include "pixel-util.h"
+#include "ipfilter8.h"
 }
 
 namespace X265_NS {
@@ -42,6 +43,33 @@
 {
     if (cpuMask & X265_CPU_NEON)
     {
+        // filterPixelToShort
+        p.pu[LUMA_4x4].convert_p2s = PFX(filterPixelToShort_4x4_neon);
+        p.pu[LUMA_4x8].convert_p2s = PFX(filterPixelToShort_4x8_neon);
+        p.pu[LUMA_4x16].convert_p2s = PFX(filterPixelToShort_4x16_neon);
+        p.pu[LUMA_8x4].convert_p2s = PFX(filterPixelToShort_8x4_neon);
+        p.pu[LUMA_8x8].convert_p2s = PFX(filterPixelToShort_8x8_neon);
+        p.pu[LUMA_8x16].convert_p2s = PFX(filterPixelToShort_8x16_neon);
+        p.pu[LUMA_8x32].convert_p2s = PFX(filterPixelToShort_8x32_neon);
+        p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon);
+        p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_neon);
+        p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_neon);
+        p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon);
+        p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon);
+        p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon);
+        p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon);
+        p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon);
+        p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_neon);
+        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon);
+        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon);
+        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon);
+        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon);
+        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon);
+        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon);
+        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon);
+        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon);
+        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon);
+
         // Block_fill
         p.cu[BLOCK_4x4].blockfill_s = PFX(blockfill_s_4x4_neon);
         p.cu[BLOCK_8x8].blockfill_s = PFX(blockfill_s_8x8_neon);
diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/ipfilter8.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/arm/ipfilter8.S	Tue Mar 01 17:00:20 2016 +0530
@@ -0,0 +1,239 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+// void filterPixelToShort_WxH(const pixel* src, intptr_t srcStride,
+//                             int16_t* dst, intptr_t dstStride)
+//
+// For every pixel of a WxH block: dst[x] = src[x] * 64 - 8192 (int16_t).
+// On entry r0 = src, r1 = srcStride in bytes, r2 = dst and r3 = dstStride
+// in int16_t elements (P2S_INIT doubles it to bytes).  The kernels modify
+// only r0-r3, r12, q0-q3 and q8-q11 and do not touch the stack.
+
+// Common prologue: dstStride to bytes, q0 = 64 (scale), q1 = -8192 (offset).
+.macro P2S_INIT
+    add         r3, r3
+    vmov.u16    q0, #64
+    vmov.u16    q1, #8192
+    vneg.s16    q1, q1
+.endm
+
+// q2 = q1 + a * q0 and q3 = q1 + b * q0, i.e. sixteen converted samples.
+.macro P2S_SCALE16 a, b
+    vmov        q2, q1
+    vmov        q3, q1
+    vmla.s16    q2, \a, q0
+    vmla.s16    q3, \b, q0
+.endm
+
+// Convert the 32 pixels held in q9-q10: the first 16 results are stored with
+// writeback, the last 16 are left in q2-q3 for the caller to store.
+.macro P2S_CONV32
+    vmovl.u8    q8, d18
+    vmovl.u8    q9, d19
+    vmovl.u8    q11, d20
+    vmovl.u8    q10, d21
+    P2S_SCALE16 q8, q9
+    vst1.16     {q2-q3}, [r2]!
+    P2S_SCALE16 q11, q10
+.endm
+
+// 4xH: two rows are gathered into one d register with 32-bit lane loads so
+// that exactly 4 bytes are read per row (no read past the block width).
+.macro P2S_4xN h
+function x265_filterPixelToShort_4x\h\()_neon
+    P2S_INIT
+.rept \h / 2
+    vld1.32     {d16[0]}, [r0], r1
+    vld1.32     {d16[1]}, [r0], r1
+    vmovl.u8    q8, d16
+    vmov        q2, q1
+    vmla.s16    q2, q8, q0
+    vst1.16     {d4}, [r2], r3
+    vst1.16     {d5}, [r2], r3
+.endr
+    bx          lr
+endfunc
+.endm
+
+P2S_4xN 4
+P2S_4xN 8
+P2S_4xN 16
+
+// 8xH: two rows per unrolled step, one 8-byte load and one q store per row.
+.macro P2S_8xN h
+function x265_filterPixelToShort_8x\h\()_neon
+    P2S_INIT
+.rept \h / 2
+    vld1.u8     {d16}, [r0], r1
+    vld1.u8     {d18}, [r0], r1
+    vmovl.u8    q8, d16
+    vmovl.u8    q9, d18
+    P2S_SCALE16 q8, q9
+    vst1.16     {q2}, [r2], r3
+    vst1.16     {q3}, [r2], r3
+.endr
+    bx          lr
+endfunc
+.endm
+
+P2S_8xN 4
+P2S_8xN 8
+P2S_8xN 16
+P2S_8xN 32
+
+// 12x16: 8 + 4 bytes are loaded per row; only 12 results (d4-d6) are stored.
+function x265_filterPixelToShort_12x16_neon
+    P2S_INIT
+    sub         r1, #8
+.rept 16
+    vld1.u8     {d18}, [r0]!
+    vld1.32     {d19[0]}, [r0], r1
+    vmovl.u8    q8, d18
+    vmovl.u8    q9, d19
+    P2S_SCALE16 q8, q9
+    vst1.16     {d4, d5, d6}, [r2], r3
+.endr
+    bx          lr
+endfunc
+
+// 16xH: one row per unrolled step.
+.macro P2S_16xN h
+function x265_filterPixelToShort_16x\h\()_neon
+    P2S_INIT
+.rept \h
+    vld1.u8     {q9}, [r0], r1
+    vmovl.u8    q8, d18
+    vmovl.u8    q9, d19
+    P2S_SCALE16 q8, q9
+    vst1.16     {q2-q3}, [r2], r3
+.endr
+    bx          lr
+endfunc
+.endm
+
+P2S_16xN 4
+P2S_16xN 8
+P2S_16xN 12
+P2S_16xN 16
+P2S_16xN 32
+P2S_16xN 64
+
+// 24x32: r3 is reduced by the 32 bytes the first store of a row adds to r2.
+function x265_filterPixelToShort_24x32_neon
+    P2S_INIT
+    sub         r3, #32
+.rept 32
+    vld1.u8     {d18, d19, d20}, [r0], r1
+    vmovl.u8    q8, d18
+    vmovl.u8    q9, d19
+    vmovl.u8    q11, d20
+    P2S_SCALE16 q8, q9
+    vst1.16     {q2-q3}, [r2]!
+    vmov        q2, q1
+    vmla.s16    q2, q11, q0
+    vst1.16     {q2}, [r2], r3
+.endr
+    bx          lr
+endfunc
+
+// 32xH: r12 counts pairs of rows; NEON instructions leave the flags alone.
+.macro P2S_32xN h
+function x265_filterPixelToShort_32x\h\()_neon
+    P2S_INIT
+    sub         r3, #32
+    mov         r12, #(\h / 2)
+.loop_filterP2S_32x\h\():
+    subs        r12, #1
+.rept 2
+    vld1.u8     {q9-q10}, [r0], r1
+    P2S_CONV32
+    vst1.16     {q2-q3}, [r2], r3
+.endr
+    bgt         .loop_filterP2S_32x\h
+    bx          lr
+endfunc
+.endm
+
+P2S_32xN 8
+P2S_32xN 16
+P2S_32xN 24
+P2S_32xN 32
+P2S_32xN 64
+
+// 64xH: each row is two 32-pixel halves; r1 and r3 are reduced by the bytes
+// that the writeback accesses have already added to r0 and r2.
+.macro P2S_64xN h
+function x265_filterPixelToShort_64x\h\()_neon
+    P2S_INIT
+    sub         r1, #32
+    sub         r3, #96
+    mov         r12, #(\h / 2)
+.loop_filterP2S_64x\h\():
+    subs        r12, #1
+.rept 2
+    vld1.u8     {q9-q10}, [r0]!
+    P2S_CONV32
+    vst1.16     {q2-q3}, [r2]!
+    vld1.u8     {q9-q10}, [r0], r1
+    P2S_CONV32
+    vst1.16     {q2-q3}, [r2], r3
+.endr
+    bgt         .loop_filterP2S_64x\h
+    bx          lr
+endfunc
+.endm
+
+P2S_64xN 16
+P2S_64xN 32
+P2S_64xN 48
+P2S_64xN 64
+
+// 48x64: a 32-pixel half followed by a 16-pixel remainder on every row.
+function x265_filterPixelToShort_48x64_neon
+    P2S_INIT
+    sub         r1, #32
+    sub         r3, #64
+    mov         r12, #32
+.loop_filterP2S_48x64:
+    subs        r12, #1
+.rept 2
+    vld1.u8     {q9-q10}, [r0]!
+    P2S_CONV32
+    vst1.16     {q2-q3}, [r2]!
+    vld1.u8     {q9}, [r0], r1
+    vmovl.u8    q8, d18
+    vmovl.u8    q9, d19
+    P2S_SCALE16 q8, q9
+    vst1.16     {q2-q3}, [r2], r3
+.endr
+    bgt         .loop_filterP2S_48x64
+    bx          lr
+endfunc
diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/ipfilter8.h
--- a/source/common/arm/ipfilter8.h	Tue Mar 01 12:18:18 2016 +0530
+++ b/source/common/arm/ipfilter8.h	Tue Mar 01 17:00:20 2016 +0530
@@ -25,4 +25,30 @@
 #ifndef X265_IPFILTER8_ARM_H
 #define X265_IPFILTER8_ARM_H
 
+void x265_filterPixelToShort_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+
 #endif // ifndef X265_IPFILTER8_ARM_H
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel