# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1453891819 -19800 # Wed Jan 27 16:20:19 2016 +0530 # Node ID 14c4806a24eb277d31fa77c1c906838ffcb62395 # Parent f548abe8eae8fb75513a85d1b09233e706c7b5ba testbench: port x264 stack & register check code for ARM arch
diff -r f548abe8eae8 -r 14c4806a24eb source/common/arm/asm.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/arm/asm.S Wed Jan 27 16:20:19 2016 +0530
@@ -0,0 +1,184 @@
+/*****************************************************************************
+ * asm.S: arm utility macros
+ *****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Mans Rullgard <m...@mansr.com>
+ *          David Conrad <lesse...@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "x265_config.h"
+
+.syntax unified
+
+#if HAVE_NEON
+        .arch armv7-a
+#elif HAVE_ARMV6T2
+        .arch armv6t2
+#elif HAVE_ARMV6
+        .arch armv6
+#endif
+
+.fpu neon
+// EXTERN_ASM: prefix glued onto exported symbol names (an underscore when PREFIX is defined, nothing otherwise)
+#ifdef PREFIX
+#   define EXTERN_ASM _
+#else
+#   define EXTERN_ASM
+#endif
+// ELF marker: lines starting with it are assembled only when __ELF__ is defined; otherwise it expands to @, which comments the line out
+#ifdef __ELF__
+#   define ELF
+#else
+#   define ELF @
+#endif
+// FUNC marker: same trick, active only when HAVE_AS_FUNC is set; it gates the .func/.endfunc directives
+#if HAVE_AS_FUNC
+#   define FUNC
+#else
+#   define FUNC @
+#endif
+// require8 / preserve8: emit EABI build attributes 24 / 25 with value val (ELF only); presumably Tag_ABI_align_needed / Tag_ABI_align_preserved - confirm against the EABI addenda
+.macro require8, val=1
+ELF     .eabi_attribute 24, \val
+.endm
+
+.macro preserve8, val=1
+ELF     .eabi_attribute 25, \val
+.endm
+// function name, export=1: open a function label after .align 2 and define a one-shot endfunc macro that closes it
+.macro function name, export=1
+    .macro endfunc                              // purges itself after one use, so each function gets its own endfunc
+ELF     .size   \name, . - \name
+FUNC    .endfunc
+        .purgem endfunc
+    .endm
+        .align  2
+.if \export == 1                                // exported: global symbol carrying the EXTERN_ASM prefix, hidden visibility on ELF
+        .global EXTERN_ASM\name
+ELF     .hidden EXTERN_ASM\name
+ELF     .type   EXTERN_ASM\name, %function
+FUNC    .func   EXTERN_ASM\name
+EXTERN_ASM\name:
+.else                                           // not exported: unprefixed label, no .global
+ELF     .hidden \name
+ELF     .type   \name, %function
+FUNC    .func   \name
+\name:
+.endif
+.endm
+// movrel rd, val: load the address of val into rd; movw/movt pair on ARMv6T2 non-PIC builds, literal-pool ldr otherwise
+.macro movrel rd, val
+#if HAVE_ARMV6T2 && !defined(PIC)
+        movw    \rd, #:lower16:\val
+        movt    \rd, #:upper16:\val
+#else
+        ldr     \rd, =\val
+#endif
+.endm
+// movconst rd, val: load the constant val into rd; movt is emitted only when the bits above the low 16 are non-zero
+.macro movconst rd, val
+#if HAVE_ARMV6T2
+        movw    \rd, #:lower16:\val
+.if \val >> 16
+        movt    \rd, #:upper16:\val
+.endif
+#else
+        ldr     \rd, =\val
+#endif
+.endm
+// X(s): the symbol name s with the EXTERN_ASM prefix applied (two-level paste so EXTERN_ASM is expanded first)
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+#define X(s) JOIN(EXTERN_ASM, s)
+
+#define FENC_STRIDE 16
+#define FDEC_STRIDE 32
+// HORIZ_ADD dest, a[, b]: if b is given add it into a (u16 lanes), then two widening pairwise adds (u16, then u32) ending in dest; a is clobbered
+.macro HORIZ_ADD dest, a, b
+.ifnb \b
+    vadd.u16    \a, \a, \b
+.endif
+    vpaddl.u16  \a, \a
+    vpaddl.u32  \dest, \a
+.endm
+// SUMSUB_AB sum, diff, a, b: sum = a + b and diff = a - b on s16 lanes; sum is written first, so it must not alias a or b
+.macro SUMSUB_AB sum, diff, a, b
+    vadd.s16    \sum,  \a, \b
+    vsub.s16    \diff, \a, \b
+.endm
+// SUMSUB_ABCD: two SUMSUB_AB butterflies, (a, b) -> (s1, d1) and (c, d) -> (s2, d2)
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
+    SUMSUB_AB   \s1, \d1, \a, \b
+    SUMSUB_AB   \s2, \d2, \c, \d
+.endm
+// ABS2 a b: replace a and b with their absolute values (s16 lanes)
+.macro ABS2 a b
+    vabs.s16    \a, \a
+    vabs.s16    \b, \b
+.endm
+
+// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
+// op = sumsub/amax (sum and diff / maximum of absolutes)
+// d1/2 = destination registers
+// s1/2 = source registers
+.macro HADAMARD dist, op, d1, d2, s1, s2
+.if \dist == 1
+    vtrn.16     \s1, \s2
+.else
+    vtrn.32     \s1, \s2                        // note: taken for every dist other than 1, including 0
+.endif
+.ifc \op, sumsub
+    SUMSUB_AB   \d1, \d2, \s1, \s2
+.else
+    vabs.s16    \s1, \s1                        // amax path: only d1 is written, s1/s2 are overwritten with their absolutes
+    vabs.s16    \s2, \s2
+    vmax.s16    \d1, \s1, \s2
+.endif
+.endm
+// TRANSPOSE8x8: in-place vtrn passes at 32-, 16- then 8-bit granularity over eight registers (presumably d registers holding one 8-byte row each)
+.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
+    vtrn.32     \r0, \r4
+    vtrn.32     \r1, \r5
+    vtrn.32     \r2, \r6
+    vtrn.32     \r3, \r7
+    vtrn.16     \r0, \r2
+    vtrn.16     \r1, \r3
+    vtrn.16     \r4, \r6
+    vtrn.16     \r5, \r7
+    vtrn.8      \r0, \r1
+    vtrn.8      \r2, \r3
+    vtrn.8      \r4, \r5
+    vtrn.8      \r6, \r7
+.endm
+// TRANSPOSE4x4: in-place vtrn.16 then vtrn.8 passes over four registers (8-bit elements)
+.macro TRANSPOSE4x4 r0 r1 r2 r3
+    vtrn.16     \r0, \r2
+    vtrn.16     \r1, \r3
+    vtrn.8      \r0, \r1
+    vtrn.8      \r2, \r3
+.endm
+// TRANSPOSE4x4_16: in-place vtrn.32 then vtrn.16 passes over four registers (16-bit elements)
+.macro TRANSPOSE4x4_16 d0 d1 d2 d3
+    vtrn.32     \d0, \d2
+    vtrn.32     \d1, \d3
+    vtrn.16     \d0, \d1
+    vtrn.16     \d2, \d3
+.endm
diff -r f548abe8eae8 -r 14c4806a24eb source/common/arm/cpu-a.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/arm/cpu-a.S Wed Jan 27 16:20:19 2016 +0530
@@ -0,0 +1,108 @@
+/*****************************************************************************
+ * cpu-a.S: arm cpu detection
+ *****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: David Conrad <lesse...@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.align 2
+
+// done in gas because .fpu neon overrides the refusal to assemble
+// instructions the selected -march/-mcpu doesn't support
+function x265_cpu_neon_test
+    vadd.i16    q0, q0, q0                      // a single NEON instruction; the caller decides what executing it proves
+    bx          lr
+endfunc
+
+// return: 0 on success
+//         1 if counters were already enabled
+//         9 if lo-res counters were already enabled
+function x265_cpu_enable_armv7_counter, export=0
+    mrc         p15, 0, r2, c9, c12, 0          // read PMNC
+    ands        r0, r2, #1                      // r0 = enable bit; Z set when the counters were off
+    andne       r0, r2, #9                      // already on: also report bit 3 (lo-res)
+
+    orr         r2, r2, #1                      // enable counters
+    bic         r2, r2, #8                      // full resolution
+    mcreq       p15, 0, r2, c9, c12, 0          // write PMNC (only if the counters were off)
+    mov         r2, #1 << 31                    // enable cycle counter
+    mcr         p15, 0, r2, c9, c12, 1          // write CNTENS
+    bx          lr
+endfunc
+// turn the counters off again by clearing bit 0 of PMNC; r0 is clobbered
+function x265_cpu_disable_armv7_counter, export=0
+    mrc         p15, 0, r0, c9, c12, 0          // read PMNC
+    bic         r0, r0, #1                      // disable counters
+    mcr         p15, 0, r0, c9, c12, 0          // write PMNC
+    bx          lr
+endfunc
+
+// READ_TIME r: read coprocessor register p15, c9, c13, 0 into r (presumably the cycle counter enabled above - confirm)
+.macro READ_TIME r
+    mrc         p15, 0, \r, c9, c13, 0
+.endm
+
+// return: 0 if neon -> arm transfers take more than 10 cycles
+//         nonzero otherwise
+function x265_cpu_fast_neon_mrc_test
+    // check for user access to performance counters
+    mrc         p15, 0, r0, c9, c14, 0
+    cmp         r0, #0
+    bxeq        lr                              // no access: return the 0 just read
+
+    push        {r4-r6,lr}
+    bl          x265_cpu_enable_armv7_counter
+    ands        r1, r0, #8                      // Z clear when lo-res counters were already enabled
+    mov         r3, #0                          // r3 = sum of the accepted timings
+    mov         ip, #4                          // ip = accepted samples still wanted
+    mov         r6, #4                          // r6 = measurement attempts left
+    moveq       r5, #1                          // r5 = passes per measurement
+    movne       r5, #64                         //      64 passes when the counters are in lo-res mode
+
+average_loop:
+    mov         r4, r5
+    READ_TIME   r1
+1:  subs        r4, r4, #1
+.rept 8
+    vmov.u32    lr, d0[0]                       // the neon -> arm transfer being timed
+    add         lr, lr, lr                      // consume the result (flags untouched, so bgt still sees the subs)
+.endr
+    bgt         1b
+    READ_TIME   r2
+
+    subs        r6, r6, #1
+    sub         r2, r2, r1                      // r2 = elapsed count for this measurement
+    cmpgt       r2, #30 << 3                    // assume context switch if it averaged over 30 cycles per transfer
+    addle       r3, r3, r2
+    subsle      ip, ip, #1
+    bgt         average_loop
+
+    // disable counters if we enabled them
+    ands        r0, r0, #1
+    bleq        x265_cpu_disable_armv7_counter
+
+    lsr         r0, r3, #5                      // sum / 32 (4 samples x 8 transfers when every sample was accepted)
+    cmp         r0, #10
+    movgt       r0, #0                          // slower than 10 cycles: report 0
+    pop         {r4-r6,pc}
+endfunc
diff -r f548abe8eae8 -r 14c4806a24eb source/test/CMakeLists.txt
--- a/source/test/CMakeLists.txt Wed Jan 20 18:27:42 2016 +0530
+++ b/source/test/CMakeLists.txt Wed Jan 27 16:20:19 2016 +0530
@@ -23,7 +23,13 @@
 
 # add ARM assembly files
 if(ARM OR CROSS_COMPILE_ARM)
-    set(YASM_SRC)
+    enable_language(ASM)
+    set(YASM_SRC checkasm-arm.S)
+    add_custom_command(
+        OUTPUT checkasm-arm.obj
+        COMMAND ${CMAKE_CXX_COMPILER}
+        ARGS ${YASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
+        DEPENDS checkasm-arm.S)
 endif(ARM OR CROSS_COMPILE_ARM)
 
 # add PowerPC assembly files
diff -r f548abe8eae8 -r 14c4806a24eb source/test/checkasm-arm.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/test/checkasm-arm.S Wed Jan 27 16:20:19 2016 +0530
@@ -0,0 +1,132 @@
+/****************************************************************************
+ * checkasm-arm.S: assembly check tool
+ *****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Martin Storsjo <mar...@martin.st>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "../common/arm/asm.S"
+@ reference pattern: the first 32 bytes are loaded into r4-r11, all 64 bytes into q4-q7
+.section .rodata
+.align 4
+register_init:
+.quad 0x21f86d66c8ca00ce
+.quad 0x75b6ba21077c48ad
+.quad 0xed56bb2dcb3c7736
+.quad 0x8bda43d3fd1a7e06
+.quad 0xb64a9c9e5d318408
+.quad 0xdf9a54b303f1d3a3
+.quad 0x4a75479abd64e097
+.quad 0x249214109d5d1c88
+
+error_message:
+.asciz "failed to preserve register"
+
+.text
+
+@ max number of args used by any x265 asm function.
+#define MAX_ARGS 15
+@ size of the copied argument area, rounded up to a multiple of 8 so that sp is 8-byte aligned at the call
+#define ARG_STACK ((4*(MAX_ARGS - 2) + 7) & ~7)
+@ x265_checkasm_call_<variant>(func, ok, ...): call func(...), pass its r0/r1 result through, zero *ok and print a message if r4-r11 (neon: or q4-q7) changed
+.macro clobbercheck variant
+.equ pushed, 4*10                               @ bytes pushed before the argument area: r4-r11, lr and the ok pointer
+function x265_checkasm_call_\variant
+    push        {r4-r11, lr}
+.ifc \variant, neon
+    vpush       {q4-q7}
+.equ pushed, pushed + 16*4
+.endif
+@ fill the callee-saved registers with the reference pattern
+    movrel      r12, register_init
+.ifc \variant, neon
+    vldm        r12, {q4-q7}
+.endif
+    ldm         r12, {r4-r11}
+
+    push        {r1}                            @ keep the ok pointer on the stack across the call
+@ copy the incoming stack arguments, minus the first two (they travel in r2/r3), into a fresh argument area
+    sub         sp,  sp,  #ARG_STACK
+.equ pos, 0
+.rept MAX_ARGS-2
+    ldr         r12, [sp, #ARG_STACK + pushed + 8 + pos]
+    str         r12, [sp, #pos]
+.equ pos, pos + 4
+.endr
+@ shift the register arguments down by two, since func and ok are consumed by this wrapper
+    mov         r12, r0
+    mov         r0,  r2
+    mov         r1,  r3
+    ldrd        r2,  r3,  [sp, #ARG_STACK + pushed]
+    blx         r12
+    add         sp,  sp,  #ARG_STACK
+    ldr         r2,  [sp]                       @ reload ok without popping it, so sp stays 8-byte aligned for the puts call
+
+    push        {r0, r1}                        @ preserve the return value of the tested function
+    movrel      r12, register_init
+.ifc \variant, neon
+    vldm        r12, {q0-q3}
+    veor        q0,  q0,  q4
+    veor        q1,  q1,  q5
+    veor        q2,  q2,  q6
+    veor        q3,  q3,  q7
+    vorr        q0,  q0,  q1
+    vorr        q0,  q0,  q2
+    vorr        q0,  q0,  q3
+    vorr        d0,  d0,  d1
+    vrev64.32   d1,  d0
+    vorr        d0,  d0,  d1
+    vmov.32     r3,  d0[0]                      @ r3 is non-zero if any bit of q4-q7 differs from the pattern
+.else
+    mov         r3,  #0
+.endif
+
+.macro check_reg reg1, reg2
+    ldrd        r0,  r1,  [r12], #8
+    eor         r0,  r0,  \reg1
+    eor         r1,  r1,  \reg2
+    orr         r3,  r3,  r0
+    orr         r3,  r3,  r1
+.endm
+    check_reg   r4,  r5
+    check_reg   r6,  r7
+    check_reg   r8,  r9
+    check_reg   r10, r11
+.purgem check_reg
+
+    cmp         r3,  #0
+    beq         0f
+@ mismatch: clear *ok and report it
+    mov         r12, #0
+    str         r12, [r2]
+    movrel      r0, error_message
+    bl          puts
+0:
+    pop         {r0-r2}                         @ restore the return value and release the ok slot (r2 is scratch)
+.ifc \variant, neon
+    vpop        {q4-q7}
+.endif
+    pop         {r4-r11, pc}
+endfunc
+.endm
+
+clobbercheck neon
+clobbercheck noneon
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel