This patch doesn't apply.
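For context, convert_p2s / filterPixelToShort is the pixel-to-intermediate-precision conversion. A minimal C sketch of what these NEON routines compute, assuming an 8-bit build so the shift is 6 and the offset is 8192 (matching the #64 multiplier and the negated #8192 bias in the assembly below); the names here are illustrative, not x265's actual C primitive:

    #include <stdint.h>

    typedef uint8_t pixel; /* 8-bit build assumed */

    /* Scale each pixel to 14-bit intermediate precision and subtract the
     * DC offset: dst[x] = (src[x] << 6) - 8192, i.e. src[x] * 64 - 8192. */
    static void filterPixelToShort_c(const pixel* src, intptr_t srcStride,
                                     int16_t* dst, intptr_t dstStride,
                                     int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (int16_t)((src[x] << 6) - 8192);

            src += srcStride;
            dst += dstStride;
        }
    }

Note that each routine doubles r3 on entry (add r3, r3) because dstStride is passed in int16_t elements while the post-incremented stores advance in bytes.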
On Tue, Mar 1, 2016 at 5:46 PM, <[email protected]> wrote:

> # HG changeset patch
> # User Dnyaneshwar G <[email protected]>
> # Date 1456831820 -19800
> #      Tue Mar 01 17:00:20 2016 +0530
> # Node ID 61e51faf9e7ee1c8056ac2f66cf51da104bfa106
> # Parent  79c00b9bc2b81afef2e41526fc3c390528f3174c
> arm: Implement filterPixelToShort ARM NEON asm
>
> diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/CMakeLists.txt
> --- a/source/common/CMakeLists.txt	Tue Mar 01 12:18:18 2016 +0530
> +++ b/source/common/CMakeLists.txt	Tue Mar 01 17:00:20 2016 +0530
> @@ -89,7 +89,7 @@
>      set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
>
>      # add ARM assembly/intrinsic files here
> -    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S)
> +    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S)
>      set(VEC_PRIMITIVES)
>
>      set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
> diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/asm-primitives.cpp
> --- a/source/common/arm/asm-primitives.cpp	Tue Mar 01 12:18:18 2016 +0530
> +++ b/source/common/arm/asm-primitives.cpp	Tue Mar 01 17:00:20 2016 +0530
> @@ -33,6 +33,7 @@
>  #include "blockcopy8.h"
>  #include "pixel.h"
>  #include "pixel-util.h"
> +#include "ipfilter8.h"
>  }
>
>  namespace X265_NS {
> @@ -42,6 +43,33 @@
>  {
>      if (cpuMask & X265_CPU_NEON)
>      {
> +        // filterPixelToShort
> +        p.pu[LUMA_4x4].convert_p2s   = PFX(filterPixelToShort_4x4_neon);
> +        p.pu[LUMA_4x8].convert_p2s   = PFX(filterPixelToShort_4x8_neon);
> +        p.pu[LUMA_4x16].convert_p2s  = PFX(filterPixelToShort_4x16_neon);
> +        p.pu[LUMA_8x4].convert_p2s   = PFX(filterPixelToShort_8x4_neon);
> +        p.pu[LUMA_8x8].convert_p2s   = PFX(filterPixelToShort_8x8_neon);
> +        p.pu[LUMA_8x16].convert_p2s  = PFX(filterPixelToShort_8x16_neon);
> +        p.pu[LUMA_8x32].convert_p2s  = PFX(filterPixelToShort_8x32_neon);
> +        p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon);
> +        p.pu[LUMA_16x4].convert_p2s  = PFX(filterPixelToShort_16x4_neon);
> +        p.pu[LUMA_16x8].convert_p2s  = PFX(filterPixelToShort_16x8_neon);
> +        p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon);
> +        p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon);
> +        p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon);
> +        p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon);
> +        p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon);
> +        p.pu[LUMA_32x8].convert_p2s  = PFX(filterPixelToShort_32x8_neon);
> +        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon);
> +        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon);
> +        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon);
> +        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon);
> +        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon);
> +        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon);
> +        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon);
> +        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon);
> +        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon);
> +
>          // Block_fill
>          p.cu[BLOCK_4x4].blockfill_s = PFX(blockfill_s_4x4_neon);
>          p.cu[BLOCK_8x8].blockfill_s = PFX(blockfill_s_8x8_neon);
> diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/ipfilter8.S
> --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
> +++ b/source/common/arm/ipfilter8.S	Tue Mar 01 17:00:20 2016 +0530
> @@ -0,0 +1,694 @@
> +/*****************************************************************************
> + * Copyright (C) 2016 x265 project
> + *
> + * Authors: Dnyaneshwar G <[email protected]>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> + *****************************************************************************/
> +
> +#include "asm.S"
> +
> +.section .rodata
> +
> +.align 4
> +
> +.text
> +
> +// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
> +function x265_filterPixelToShort_4x4_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 2
> +    vld1.u8     {d0}, [r0], r1
> +    vld1.u8     {d2}, [r0], r1
> +    vmovl.u8    q0, d0
> +    vmovl.u8    q1, d2
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {d4}, [r2], r3
> +    vst1.16     {d6}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_4x8_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 4
> +    vld1.u8     {d0}, [r0], r1
> +    vld1.u8     {d2}, [r0], r1
> +    vmovl.u8    q0, d0
> +    vmovl.u8    q1, d2
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {d4}, [r2], r3
> +    vst1.16     {d6}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_4x16_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 8
> +    vld1.u8     {d0}, [r0], r1
> +    vld1.u8     {d2}, [r0], r1
> +    vmovl.u8    q0, d0
> +    vmovl.u8    q1, d2
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {d4}, [r2], r3
> +    vst1.16     {d6}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_8x4_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 2
> +    vld1.u8     {d0}, [r0], r1
> +    vld1.u8     {d2}, [r0], r1
> +    vmovl.u8    q0, d0
> +    vmovl.u8    q1, d2
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {q2}, [r2], r3
> +    vst1.16     {q3}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_8x8_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 4
> +    vld1.u8     {d0}, [r0], r1
> +    vld1.u8     {d2}, [r0], r1
> +    vmovl.u8    q0, d0
> +    vmovl.u8    q1, d2
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {q2}, [r2], r3
> +    vst1.16     {q3}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_8x16_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 8
> +    vld1.u8     {d0}, [r0], r1
> +    vld1.u8     {d2}, [r0], r1
> +    vmovl.u8    q0, d0
> +    vmovl.u8    q1, d2
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {q2}, [r2], r3
> +    vst1.16     {q3}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_8x32_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 16
> +    vld1.u8     {d0}, [r0], r1
> +    vld1.u8     {d2}, [r0], r1
> +    vmovl.u8    q0, d0
> +    vmovl.u8    q1, d2
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {q2}, [r2], r3
> +    vst1.16     {q3}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_12x16_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 16
> +    vld1.u8     {d2-d3}, [r0], r1
> +    vmovl.u8    q0, d2
> +    vmovl.u8    q1, d3
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {d4, d5, d6}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_16x4_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 4
> +    vld1.u8     {d2-d3}, [r0], r1
> +    vmovl.u8    q0, d2
> +    vmovl.u8    q1, d3
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_16x8_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 8
> +    vld1.u8     {d2-d3}, [r0], r1
> +    vmovl.u8    q0, d2
> +    vmovl.u8    q1, d3
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_16x12_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 12
> +    vld1.u8     {d2-d3}, [r0], r1
> +    vmovl.u8    q0, d2
> +    vmovl.u8    q1, d3
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_16x16_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 16
> +    vld1.u8     {d2-d3}, [r0], r1
> +    vmovl.u8    q0, d2
> +    vmovl.u8    q1, d3
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_16x32_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 32
> +    vld1.u8     {d2-d3}, [r0], r1
> +    vmovl.u8    q0, d2
> +    vmovl.u8    q1, d3
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_16x64_neon
> +    add         r3, r3
> +    vmov.u16    q8, #64
> +    vmov.u16    q9, #8192
> +    vneg.s16    q9, q9
> +.rept 64
> +    vld1.u8     {d2-d3}, [r0], r1
> +    vmovl.u8    q0, d2
> +    vmovl.u8    q1, d3
> +    vmov        q2, q9
> +    vmov        q3, q9
> +    vmla.s16    q2, q0, q8
> +    vmla.s16    q3, q1, q8
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_24x32_neon
> +    add         r3, r3
> +    sub         r3, #32
> +    vmov.u16    q0, #64
> +    vmov.u16    q1, #8192
> +    vneg.s16    q1, q1
> +.rept 32
> +    vld1.u8     {d18, d19, d20}, [r0], r1
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmla.s16    q2, q11, q0
> +    vst1.16     {q2}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_32x8_neon
> +    add         r3, r3
> +    sub         r3, #32
> +    vmov.u16    q0, #64
> +    vmov.u16    q1, #8192
> +    vneg.s16    q1, q1
> +.rept 8
> +    vld1.u8     {q9-q10}, [r0], r1
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_32x16_neon
> +    add         r3, r3
> +    sub         r3, #32
> +    vmov.u16    q0, #64
> +    vmov.u16    q1, #8192
> +    vneg.s16    q1, q1
> +    mov         r12, #8
> +.loop_filterP2S_32x16:
> +    subs        r12, #1
> +.rept 2
> +    vld1.u8     {q9-q10}, [r0], r1
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bgt         .loop_filterP2S_32x16
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_32x24_neon
> +    add         r3, r3
> +    sub         r3, #32
> +    vmov.u16    q0, #64
> +    vmov.u16    q1, #8192
> +    vneg.s16    q1, q1
> +    mov         r12, #12
> +.loop_filterP2S_32x24:
> +    subs        r12, #1
> +.rept 2
> +    vld1.u8     {q9-q10}, [r0], r1
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bgt         .loop_filterP2S_32x24
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_32x32_neon
> +    add         r3, r3
> +    sub         r3, #32
> +    vmov.u16    q0, #64
> +    vmov.u16    q1, #8192
> +    vneg.s16    q1, q1
> +    mov         r12, #16
> +.loop_filterP2S_32x32:
> +    subs        r12, #1
> +.rept 2
> +    vld1.u8     {q9-q10}, [r0], r1
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bgt         .loop_filterP2S_32x32
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_32x64_neon
> +    add         r3, r3
> +    sub         r3, #32
> +    vmov.u16    q0, #64
> +    vmov.u16    q1, #8192
> +    vneg.s16    q1, q1
> +    mov         r12, #32
> +.loop_filterP2S_32x64:
> +    subs        r12, #1
> +.rept 2
> +    vld1.u8     {q9-q10}, [r0], r1
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bgt         .loop_filterP2S_32x64
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_64x16_neon
> +    add         r3, r3
> +    sub         r1, #32
> +    sub         r3, #96
> +    vmov.u16    q0, #64
> +    vmov.u16    q1, #8192
> +    vneg.s16    q1, q1
> +    mov         r12, #8
> +.loop_filterP2S_64x16:
> +    subs        r12, #1
> +.rept 2
> +    vld1.u8     {q9-q10}, [r0]!
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2]!
> +
> +    vld1.u8     {q9-q10}, [r0], r1
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bgt         .loop_filterP2S_64x16
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_64x32_neon
> +    add         r3, r3
> +    sub         r1, #32
> +    sub         r3, #96
> +    vmov.u16    q0, #64
> +    vmov.u16    q1, #8192
> +    vneg.s16    q1, q1
> +    mov         r12, #16
> +.loop_filterP2S_64x32:
> +    subs        r12, #1
> +.rept 2
> +    vld1.u8     {q9-q10}, [r0]!
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2]!
> +
> +    vld1.u8     {q9-q10}, [r0], r1
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bgt         .loop_filterP2S_64x32
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_64x48_neon
> +    add         r3, r3
> +    sub         r1, #32
> +    sub         r3, #96
> +    vmov.u16    q0, #64
> +    vmov.u16    q1, #8192
> +    vneg.s16    q1, q1
> +    mov         r12, #24
> +.loop_filterP2S_64x48:
> +    subs        r12, #1
> +.rept 2
> +    vld1.u8     {q9-q10}, [r0]!
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2]!
> +
> +    vld1.u8     {q9-q10}, [r0], r1
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bgt         .loop_filterP2S_64x48
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_64x64_neon
> +    add         r3, r3
> +    sub         r1, #32
> +    sub         r3, #96
> +    vmov.u16    q0, #64
> +    vmov.u16    q1, #8192
> +    vneg.s16    q1, q1
> +    mov         r12, #32
> +.loop_filterP2S_64x64:
> +    subs        r12, #1
> +.rept 2
> +    vld1.u8     {q9-q10}, [r0]!
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2]!
> +
> +    vld1.u8     {q9-q10}, [r0], r1
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bgt         .loop_filterP2S_64x64
> +    bx          lr
> +endfunc
> +
> +function x265_filterPixelToShort_48x64_neon
> +    add         r3, r3
> +    sub         r1, #32
> +    sub         r3, #64
> +    vmov.u16    q0, #64
> +    vmov.u16    q1, #8192
> +    vneg.s16    q1, q1
> +    mov         r12, #32
> +.loop_filterP2S_48x64:
> +    subs        r12, #1
> +.rept 2
> +    vld1.u8     {q9-q10}, [r0]!
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmovl.u8    q11, d20
> +    vmovl.u8    q10, d21
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2]!
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q11, q0
> +    vmla.s16    q3, q10, q0
> +    vst1.16     {q2-q3}, [r2]!
> +
> +    vld1.u8     {q9}, [r0], r1
> +    vmovl.u8    q8, d18
> +    vmovl.u8    q9, d19
> +    vmov        q2, q1
> +    vmov        q3, q1
> +    vmla.s16    q2, q8, q0
> +    vmla.s16    q3, q9, q0
> +    vst1.16     {q2-q3}, [r2], r3
> +.endr
> +    bgt         .loop_filterP2S_48x64
> +    bx          lr
> +endfunc
> diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/ipfilter8.h
> --- a/source/common/arm/ipfilter8.h	Tue Mar 01 12:18:18 2016 +0530
> +++ b/source/common/arm/ipfilter8.h	Tue Mar 01 17:00:20 2016 +0530
> @@ -25,4 +25,30 @@
>  #ifndef X265_IPFILTER8_ARM_H
>  #define X265_IPFILTER8_ARM_H
>
> +void x265_filterPixelToShort_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +
>  #endif // ifndef X265_IPFILTER8_ARM_H
> _______________________________________________
> x265-devel mailing list
> [email protected]
> https://mailman.videolan.org/listinfo/x265-devel

--
Deepthi Nandakumar
Engineering Manager, x265
Multicoreware, Inc
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel
