Okay. Will resend it soon! On Fri, Mar 4, 2016 at 12:03 PM, Deepthi Nandakumar <[email protected]> wrote:
> This patch doesnt apply. > > On Tue, Mar 1, 2016 at 5:46 PM, <[email protected]> wrote: > >> # HG changeset patch >> # User Dnyaneshwar G <[email protected]> >> # Date 1456831820 -19800 >> # Tue Mar 01 17:00:20 2016 +0530 >> # Node ID 61e51faf9e7ee1c8056ac2f66cf51da104bfa106 >> # Parent 79c00b9bc2b81afef2e41526fc3c390528f3174c >> arm: Implement filterPixelToShort ARM NEON asm >> >> diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/CMakeLists.txt >> --- a/source/common/CMakeLists.txt Tue Mar 01 12:18:18 2016 +0530 >> +++ b/source/common/CMakeLists.txt Tue Mar 01 17:00:20 2016 +0530 >> @@ -89,7 +89,7 @@ >> set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h >> dct8.h loopfilter.h) >> >> # add ARM assembly/intrinsic files here >> - set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S >> blockcopy8.S) >> + set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S >> blockcopy8.S ipfilter8.S) >> set(VEC_PRIMITIVES) >> >> set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources") >> diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/asm-primitives.cpp >> --- a/source/common/arm/asm-primitives.cpp Tue Mar 01 12:18:18 2016 >> +0530 >> +++ b/source/common/arm/asm-primitives.cpp Tue Mar 01 17:00:20 2016 >> +0530 >> @@ -33,6 +33,7 @@ >> #include "blockcopy8.h" >> #include "pixel.h" >> #include "pixel-util.h" >> +#include "ipfilter8.h" >> } >> >> namespace X265_NS { >> @@ -42,6 +43,33 @@ >> { >> if (cpuMask & X265_CPU_NEON) >> { >> + // filterPixelToShort >> + p.pu[LUMA_4x4].convert_p2s = PFX(filterPixelToShort_4x4_neon); >> + p.pu[LUMA_4x8].convert_p2s = PFX(filterPixelToShort_4x8_neon); >> + p.pu[LUMA_4x16].convert_p2s = PFX(filterPixelToShort_4x16_neon); >> + p.pu[LUMA_8x4].convert_p2s = PFX(filterPixelToShort_8x4_neon); >> + p.pu[LUMA_8x8].convert_p2s = PFX(filterPixelToShort_8x8_neon); >> + p.pu[LUMA_8x16].convert_p2s = PFX(filterPixelToShort_8x16_neon); >> + p.pu[LUMA_8x32].convert_p2s = PFX(filterPixelToShort_8x32_neon); >> + 
p.pu[LUMA_12x16].convert_p2s = >> PFX(filterPixelToShort_12x16_neon); >> + p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_neon); >> + p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_neon); >> + p.pu[LUMA_16x12].convert_p2s = >> PFX(filterPixelToShort_16x12_neon); >> + p.pu[LUMA_16x16].convert_p2s = >> PFX(filterPixelToShort_16x16_neon); >> + p.pu[LUMA_16x32].convert_p2s = >> PFX(filterPixelToShort_16x32_neon); >> + p.pu[LUMA_16x64].convert_p2s = >> PFX(filterPixelToShort_16x64_neon); >> + p.pu[LUMA_24x32].convert_p2s = >> PFX(filterPixelToShort_24x32_neon); >> + p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_neon); >> + p.pu[LUMA_32x16].convert_p2s = >> PFX(filterPixelToShort_32x16_neon); >> + p.pu[LUMA_32x24].convert_p2s = >> PFX(filterPixelToShort_32x24_neon); >> + p.pu[LUMA_32x32].convert_p2s = >> PFX(filterPixelToShort_32x32_neon); >> + p.pu[LUMA_32x64].convert_p2s = >> PFX(filterPixelToShort_32x64_neon); >> + p.pu[LUMA_48x64].convert_p2s = >> PFX(filterPixelToShort_48x64_neon); >> + p.pu[LUMA_64x16].convert_p2s = >> PFX(filterPixelToShort_64x16_neon); >> + p.pu[LUMA_64x32].convert_p2s = >> PFX(filterPixelToShort_64x32_neon); >> + p.pu[LUMA_64x48].convert_p2s = >> PFX(filterPixelToShort_64x48_neon); >> + p.pu[LUMA_64x64].convert_p2s = >> PFX(filterPixelToShort_64x64_neon); >> + >> // Block_fill >> p.cu[BLOCK_4x4].blockfill_s = PFX(blockfill_s_4x4_neon); >> p.cu[BLOCK_8x8].blockfill_s = PFX(blockfill_s_8x8_neon); >> diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/ipfilter8.S >> --- /dev/null Thu Jan 01 00:00:00 1970 +0000 >> +++ b/source/common/arm/ipfilter8.S Tue Mar 01 17:00:20 2016 +0530 >> @@ -0,0 +1,694 @@ >> >> +/***************************************************************************** >> + * Copyright (C) 2016 x265 project >> + * >> + * Authors: Dnyaneshwar G <[email protected]> >> + * >> + * This program is free software; you can redistribute it and/or modify >> + * it under the terms of the GNU General Public 
License as published by >> + * the Free Software Foundation; either version 2 of the License, or >> + * (at your option) any later version. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> + * GNU General Public License for more details. >> + * >> + * You should have received a copy of the GNU General Public License >> + * along with this program; if not, write to the Free Software >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, >> USA. >> + * >> + * This program is also available under a commercial proprietary license. >> + * For more information, contact us at license @ x265.com. >> + >> *****************************************************************************/ >> + >> +#include "asm.S" >> + >> +.section .rodata >> + >> +.align 4 >> + >> +.text >> + >> +// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* >> dst, intptr_t dstStride) >> +function x265_filterPixelToShort_4x4_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 2 >> + vld1.u8 {d0}, [r0], r1 >> + vld1.u8 {d2}, [r0], r1 >> + vmovl.u8 q0, d0 >> + vmovl.u8 q1, d2 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {d4}, [r2], r3 >> + vst1.16 {d6}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_4x8_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 4 >> + vld1.u8 {d0}, [r0], r1 >> + vld1.u8 {d2}, [r0], r1 >> + vmovl.u8 q0, d0 >> + vmovl.u8 q1, d2 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {d4}, [r2], r3 >> + vst1.16 {d6}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_4x16_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 
q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 8 >> + vld1.u8 {d0}, [r0], r1 >> + vld1.u8 {d2}, [r0], r1 >> + vmovl.u8 q0, d0 >> + vmovl.u8 q1, d2 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {d4}, [r2], r3 >> + vst1.16 {d6}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_8x4_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 2 >> + vld1.u8 {d0}, [r0], r1 >> + vld1.u8 {d2}, [r0], r1 >> + vmovl.u8 q0, d0 >> + vmovl.u8 q1, d2 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {q2}, [r2], r3 >> + vst1.16 {q3}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_8x8_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 4 >> + vld1.u8 {d0}, [r0], r1 >> + vld1.u8 {d2}, [r0], r1 >> + vmovl.u8 q0, d0 >> + vmovl.u8 q1, d2 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {q2}, [r2], r3 >> + vst1.16 {q3}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_8x16_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 8 >> + vld1.u8 {d0}, [r0], r1 >> + vld1.u8 {d2}, [r0], r1 >> + vmovl.u8 q0, d0 >> + vmovl.u8 q1, d2 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {q2}, [r2], r3 >> + vst1.16 {q3}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_8x32_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 16 >> + vld1.u8 {d0}, [r0], r1 >> + vld1.u8 {d2}, [r0], r1 >> + vmovl.u8 q0, d0 >> + vmovl.u8 q1, d2 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {q2}, [r2], r3 >> + vst1.16 {q3}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> 
+function x265_filterPixelToShort_12x16_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 16 >> + vld1.u8 {d2-d3}, [r0], r1 >> + vmovl.u8 q0, d2 >> + vmovl.u8 q1, d3 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {d4, d5, d6}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_16x4_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 4 >> + vld1.u8 {d2-d3}, [r0], r1 >> + vmovl.u8 q0, d2 >> + vmovl.u8 q1, d3 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_16x8_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 8 >> + vld1.u8 {d2-d3}, [r0], r1 >> + vmovl.u8 q0, d2 >> + vmovl.u8 q1, d3 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_16x12_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 12 >> + vld1.u8 {d2-d3}, [r0], r1 >> + vmovl.u8 q0, d2 >> + vmovl.u8 q1, d3 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_16x16_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 16 >> + vld1.u8 {d2-d3}, [r0], r1 >> + vmovl.u8 q0, d2 >> + vmovl.u8 q1, d3 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_16x32_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 32 >> + 
vld1.u8 {d2-d3}, [r0], r1 >> + vmovl.u8 q0, d2 >> + vmovl.u8 q1, d3 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_16x64_neon >> + add r3, r3 >> + vmov.u16 q8, #64 >> + vmov.u16 q9, #8192 >> + vneg.s16 q9, q9 >> +.rept 64 >> + vld1.u8 {d2-d3}, [r0], r1 >> + vmovl.u8 q0, d2 >> + vmovl.u8 q1, d3 >> + vmov q2, q9 >> + vmov q3, q9 >> + vmla.s16 q2, q0, q8 >> + vmla.s16 q3, q1, q8 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_24x32_neon >> + add r3, r3 >> + sub r3, #32 >> + vmov.u16 q0, #64 >> + vmov.u16 q1, #8192 >> + vneg.s16 q1, q1 >> +.rept 32 >> + vld1.u8 {d18, d19, d20}, [r0], r1 >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! >> + vmov q2, q1 >> + vmla.s16 q2, q11, q0 >> + vst1.16 {q2}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_32x8_neon >> + add r3, r3 >> + sub r3, #32 >> + vmov.u16 q0, #64 >> + vmov.u16 q1, #8192 >> + vneg.s16 q1, q1 >> +.rept 8 >> + vld1.u8 {q9-q10}, [r0], r1 >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! 
>> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_32x16_neon >> + add r3, r3 >> + sub r3, #32 >> + vmov.u16 q0, #64 >> + vmov.u16 q1, #8192 >> + vneg.s16 q1, q1 >> + mov r12, #8 >> +.loop_filterP2S_32x16: >> + subs r12, #1 >> +.rept 2 >> + vld1.u8 {q9-q10}, [r0], r1 >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bgt .loop_filterP2S_32x16 >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_32x24_neon >> + add r3, r3 >> + sub r3, #32 >> + vmov.u16 q0, #64 >> + vmov.u16 q1, #8192 >> + vneg.s16 q1, q1 >> + mov r12, #12 >> +.loop_filterP2S_32x24: >> + subs r12, #1 >> +.rept 2 >> + vld1.u8 {q9-q10}, [r0], r1 >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bgt .loop_filterP2S_32x24 >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_32x32_neon >> + add r3, r3 >> + sub r3, #32 >> + vmov.u16 q0, #64 >> + vmov.u16 q1, #8192 >> + vneg.s16 q1, q1 >> + mov r12, #16 >> +.loop_filterP2S_32x32: >> + subs r12, #1 >> +.rept 2 >> + vld1.u8 {q9-q10}, [r0], r1 >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! 
>> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bgt .loop_filterP2S_32x32 >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_32x64_neon >> + add r3, r3 >> + sub r3, #32 >> + vmov.u16 q0, #64 >> + vmov.u16 q1, #8192 >> + vneg.s16 q1, q1 >> + mov r12, #32 >> +.loop_filterP2S_32x64: >> + subs r12, #1 >> +.rept 2 >> + vld1.u8 {q9-q10}, [r0], r1 >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bgt .loop_filterP2S_32x64 >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_64x16_neon >> + add r3, r3 >> + sub r1, #32 >> + sub r3, #96 >> + vmov.u16 q0, #64 >> + vmov.u16 q1, #8192 >> + vneg.s16 q1, q1 >> + mov r12, #8 >> +.loop_filterP2S_64x16: >> + subs r12, #1 >> +.rept 2 >> + vld1.u8 {q9-q10}, [r0]! >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2]! >> + >> + vld1.u8 {q9-q10}, [r0], r1 >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! 
>> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bgt .loop_filterP2S_64x16 >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_64x32_neon >> + add r3, r3 >> + sub r1, #32 >> + sub r3, #96 >> + vmov.u16 q0, #64 >> + vmov.u16 q1, #8192 >> + vneg.s16 q1, q1 >> + mov r12, #16 >> +.loop_filterP2S_64x32: >> + subs r12, #1 >> +.rept 2 >> + vld1.u8 {q9-q10}, [r0]! >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2]! >> + >> + vld1.u8 {q9-q10}, [r0], r1 >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bgt .loop_filterP2S_64x32 >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_64x48_neon >> + add r3, r3 >> + sub r1, #32 >> + sub r3, #96 >> + vmov.u16 q0, #64 >> + vmov.u16 q1, #8192 >> + vneg.s16 q1, q1 >> + mov r12, #24 >> +.loop_filterP2S_64x48: >> + subs r12, #1 >> +.rept 2 >> + vld1.u8 {q9-q10}, [r0]! >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2]! 
>> + >> + vld1.u8 {q9-q10}, [r0], r1 >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bgt .loop_filterP2S_64x48 >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_64x64_neon >> + add r3, r3 >> + sub r1, #32 >> + sub r3, #96 >> + vmov.u16 q0, #64 >> + vmov.u16 q1, #8192 >> + vneg.s16 q1, q1 >> + mov r12, #32 >> +.loop_filterP2S_64x64: >> + subs r12, #1 >> +.rept 2 >> + vld1.u8 {q9-q10}, [r0]! >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2]! >> + >> + vld1.u8 {q9-q10}, [r0], r1 >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bgt .loop_filterP2S_64x64 >> + bx lr >> +endfunc >> + >> +function x265_filterPixelToShort_48x64_neon >> + add r3, r3 >> + sub r1, #32 >> + sub r3, #64 >> + vmov.u16 q0, #64 >> + vmov.u16 q1, #8192 >> + vneg.s16 q1, q1 >> + mov r12, #32 >> +.loop_filterP2S_48x64: >> + subs r12, #1 >> +.rept 2 >> + vld1.u8 {q9-q10}, [r0]! >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmovl.u8 q11, d20 >> + vmovl.u8 q10, d21 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2]! 
>> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q11, q0 >> + vmla.s16 q3, q10, q0 >> + vst1.16 {q2-q3}, [r2]! >> + >> + vld1.u8 {q9}, [r0], r1 >> + vmovl.u8 q8, d18 >> + vmovl.u8 q9, d19 >> + vmov q2, q1 >> + vmov q3, q1 >> + vmla.s16 q2, q8, q0 >> + vmla.s16 q3, q9, q0 >> + vst1.16 {q2-q3}, [r2], r3 >> +.endr >> + bgt .loop_filterP2S_48x64 >> + bx lr >> +endfunc >> diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/ipfilter8.h >> --- a/source/common/arm/ipfilter8.h Tue Mar 01 12:18:18 2016 +0530 >> +++ b/source/common/arm/ipfilter8.h Tue Mar 01 17:00:20 2016 +0530 >> @@ -25,4 +25,30 @@ >> #ifndef X265_IPFILTER8_ARM_H >> #define X265_IPFILTER8_ARM_H >> >> +void x265_filterPixelToShort_4x4_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_4x8_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_4x16_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_8x4_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_8x8_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_8x16_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_8x32_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_12x16_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_16x4_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_16x8_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_16x12_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void 
x265_filterPixelToShort_16x16_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_16x32_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_16x64_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_24x32_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_32x8_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_32x16_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_32x24_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_32x32_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_32x64_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_48x64_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_64x16_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_64x32_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_64x48_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> +void x265_filterPixelToShort_64x64_neon(const pixel* src, intptr_t >> srcStride, int16_t* dst, intptr_t dstStride); >> + >> #endif // ifndef X265_IPFILTER8_ARM_H >> _______________________________________________ >> x265-devel mailing list >> [email protected] >> https://mailman.videolan.org/listinfo/x265-devel >> > > > > -- > Deepthi Nandakumar > Engineering Manager, x265 > Multicoreware, Inc > > 
_______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel > >
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
