Module Name: src Committed By: matt Date: Sat Jan 12 20:27:13 UTC 2013
Added Files: src/common/lib/libc/arch/arm/string: memset_arm.S Log Message: A version of memset that can do NEON, VFP as well as normal arm instructions To generate a diff of this commit: cvs rdiff -u -r0 -r1.1 src/common/lib/libc/arch/arm/string/memset_arm.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Added files: Index: src/common/lib/libc/arch/arm/string/memset_arm.S diff -u /dev/null src/common/lib/libc/arch/arm/string/memset_arm.S:1.1 --- /dev/null Sat Jan 12 20:27:13 2013 +++ src/common/lib/libc/arch/arm/string/memset_arm.S Sat Jan 12 20:27:13 2013 @@ -0,0 +1,173 @@ +/* $NetBSD: memset_arm.S,v 1.1 2013/01/12 20:27:13 matt Exp $ */ + +/*- + * Copyright (c) 2012 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Matt Thomas of 3am Software Foundry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include <machine/asm.h> + +#if defined(NEON) +#define STORE8 vst1.32 {d0}, [ip:64]! 
+#define STORE16		vst1.32	{d0-d1}, [ip:64]!
+#define STORE32		vst1.32	{d0-d3}, [ip:64]!
+#elif defined(VFP)
+#define STORE8		vstmia	ip!, {d0}
+#define STORE16		vstmia	ip!, {d0-d1}
+#define STORE32		vstmia	ip!, {d0-d3}
+#elif defined(_ARM_ARCH_DWORD_OK)
+#define STORE8		strd	r2, [ip], #8
+#define STORE16		STORE8; STORE8
+#define STORE32		STORE16; STORE16
+#else
+#define STORE8		stmia	ip!, {r2,r3}
+#define STORE16		STORE8; STORE8
+#define STORE32		STORE16; STORE16
+#endif
+/*
+ * memset: Sets a block of memory to the specified value
+ * using NEON, VFP or plain ARM stores (selected by the STOREn macros).
+ *
+ * On entry:
+ *	r0	- dest address
+ *	r1	- byte to write
+ *	r2	- number of bytes to write
+ *
+ * On exit:
+ *	r0	- dest address (ip is the write cursor, r1 the count, r3 the fill)
+ */
+/* LINTSTUB: Func: void *memset(void *, int, size_t) */
+ENTRY(memset)
+	ands	r3, r1, #0xff		/* r3 = fill; we deal with bytes */
+	orrne	r3, r3, r3, lsl #8	/* replicate to low halfword */
+	orrne	r3, r3, r3, lsl #16	/* replicate to all four bytes */
+	movs	r1, r2			/* r1 = bytes left; frees r2 */
+	RETc(eq)			/* return if length is 0 */
+	mov	ip, r0			/* r0 needs to stay the same */
+
+	cmp	r1, #12			/* is this a small memset? */
+	blo	.Lbyte_by_byte		/* then do it byte by byte (unsigned) */
+
+	/* Ok first we will dword align the address */
+	ands	r2, ip, #7		/* grab the bottom three bits */
+	beq	.Lmemset_dwordaligned	/* The addr is dword aligned */
+
+	rsb	r2, r2, #8		/* how far until dword aligned? */
+	sub	r1, r1, r2		/* subtract it from remaining length */
+	mov	r2, r3			/* duplicate fill value */
+
+	tst	ip, #1			/* halfword aligned? */
+	strneb	r3, [ip], #1		/* no, write a byte */
+	tst	ip, #2			/* word aligned? */
+	strneh	r3, [ip], #2		/* no, write a halfword */
+	tst	ip, #4			/* dword aligned? */
+	strne	r3, [ip], #4		/* no, write a word */
+
+	/* We are now doubleword aligned */
+.Lmemset_dwordaligned:
+#if defined(NEON)
+	vdup.8	q0, r3			/* move fill to SIMD */
+	vmov	q1, q0			/* put fill in q1 (d2-d3) */
+#elif defined(VFP)
+	mov	r2, r3			/* duplicate fill value */
+	vmov	d0, r2, r3		/* move to VFP */
+	vmov	d1, r2, r3
+	vmov	d2, r2, r3
+	vmov	d3, r2, r3
+#endif
+
+#if 1
+	cmp	r1, #128		/* too short to bother aligning? */
+	blo	.Lmemset_mainloop	/* yes (unsigned compare) */
+	ands	r2, ip, #63		/* check for 64-byte alignment */
+	beq	.Lmemset_mainloop
+	/*
+	 * Let's align to a 64-byte boundary so that stores don't cross
+	 * cacheline boundaries.  We also know we have at least 128-bytes to
+	 * copy so we don't have to worry about the length at the moment.
+	 */
+	rsb	r2, r2, #64		/* how many bytes until 64 bytes */
+	sub	r1, r1, r2		/* subtract from remaining length */
+#if !defined(NEON) && !defined(VFP)
+	mov	r2, r3			/* put fill back in r2 */
+#endif
+
+	tst	ip, #8			/* quadword aligned? */
+	beq	1f			/* yes */
+	STORE8				/* no, store a dword */
+1:	tst	ip, #16			/* octaword aligned? */
+	beq	2f			/* yes */
+	STORE16				/* no, store a quadword */
+2:	tst	ip, #32			/* 64-byte aligned? */
+	beq	.Lmemset_mainloop	/* yes */
+	STORE32				/* no, make 64-byte aligned */
+#endif
+
+.Lmemset_mainloop:
+#if !defined(NEON) && !defined(VFP)
+	mov	r2, r3			/* put fill back in r2 */
+#endif
+	subs	r1, r1, #64		/* subtract an initial 64 */
+	blo	.Lmemset_lessthan_64bytes /* fewer than 64 left (unsigned) */
+
+3:	STORE32				/* store first 32 bytes */
+	STORE32				/* store second 32 bytes */
+	RETc(eq)			/* return if that was the last 64 */
+	subs	r1, r1, #64		/* subtract another 64 */
+	bhs	3b			/* and do another if no borrow */
+.Lmemset_lessthan_64bytes:		/* r1 = left - 64; bits 5..0 = left */
+	tst	r1, #32			/* do we have 32 bytes left? */
+	beq	.Lmemset_lessthan_32bytes /* no */
+	STORE32				/* yes, store an octaword */
+	tst	r1, #31			/* anything below 32 still left? */
+	RETc(eq)			/* no, we're done */
+.Lmemset_lessthan_32bytes:
+	tst	r1, #16			/* do we have 16 bytes left? */
+	beq	.Lmemset_lessthan_16bytes /* no */
+	STORE16				/* yes, store a quadword */
+	tst	r1, #15			/* anything below 16 still left? */
+	RETc(eq)			/* no, we're done */
+.Lmemset_lessthan_16bytes:
+	tst	r1, #8			/* do we have 8 bytes left? */
+	beq	.Lmemset_lessthan_8bytes /* no */
+	STORE8				/* yes, store a dword */
+	tst	r1, #7			/* anything below 8 still left? */
+	RETc(eq)			/* no, we're done */
+.Lmemset_lessthan_8bytes:
+	tst	r1, #4			/* do we have a word left? */
+	strne	r3, [ip], #4		/* yes; r3 is the fill, r2 may not be */
+	tst	r1, #2			/* do we have a halfword left? */
+	strneh	r3, [ip], #2		/* yes, so write one */
+	tst	r1, #1			/* do we have a byte left? */
+	strneb	r3, [ip], #1		/* yes, so write one */
+	RET				/* return */
+
+.Lbyte_by_byte:
+	subs	r1, r1, #1		/* can we write a byte? */
+	RETc(lt)			/* no, we're done */
+	strb	r3, [ip], #1		/* yes, so do it */
+	b	.Lbyte_by_byte		/* try next byte */
+END(memset)