Module Name:    src
Committed By:   matt
Date:           Sat Jan 12 20:27:13 UTC 2013

Added Files:
        src/common/lib/libc/arch/arm/string: memset_arm.S

Log Message:
A version of memset that can use NEON, VFP, or plain ARM instructions.


To generate a diff of this commit:
cvs rdiff -u -r0 -r1.1 src/common/lib/libc/arch/arm/string/memset_arm.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Added files:

Index: src/common/lib/libc/arch/arm/string/memset_arm.S
diff -u /dev/null src/common/lib/libc/arch/arm/string/memset_arm.S:1.1
--- /dev/null	Sat Jan 12 20:27:13 2013
+++ src/common/lib/libc/arch/arm/string/memset_arm.S	Sat Jan 12 20:27:13 2013
@@ -0,0 +1,173 @@
+/*	$NetBSD: memset_arm.S,v 1.1 2013/01/12 20:27:13 matt Exp $	*/
+
+/*-
+ * Copyright (c) 2012 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Matt Thomas of 3am Software Foundry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <machine/asm.h>
+
+#if defined(NEON)		/* NEON: 64-bit-aligned vector stores from d0-d3 */
+#define	STORE8		vst1.32		{d0}, [ip:64]!
+#define	STORE16		vst1.32		{d0-d1}, [ip:64]!
+#define	STORE32		vst1.32		{d0-d3}, [ip:64]!
+#elif defined(VFP)		/* VFP: store-multiple from d0-d3 */
+#define	STORE8		vstmia		ip!, {d0}
+#define	STORE16		vstmia		ip!, {d0-d1}
+#define	STORE32		vstmia		ip!, {d0-d3}
+#else				/* core regs: each STORE8 writes r2 and r3 */
+#if defined(_ARM_ARCH_DWORD_OK)
+#define	STORE8		strd		r2, [ip], #8
+#else
+#define	STORE8		stmia		ip!, {r2,r3}
+#endif
+#define	STORE16		STORE8; STORE8
+#define	STORE32		STORE16; STORE16
+#endif
+/*
+ * memset: Sets a block of memory to the specified value, using NEON,
+ * VFP, or plain ARM stores depending on how the STORE* macros are built.
+ *
+ * On entry:
+ *   r0 - dest address
+ *   r1 - byte to write
+ *   r2 - number of bytes to write
+ *
+ * On exit:
+ *   r0 - dest address
+ */
+/* LINTSTUB: Func: void *memset(void *, int, size_t) */
+ENTRY(memset)
+	ands		r3, r1, #0xff	/* We deal with bytes */
+	orrne		r3, r3, r3, lsl #8	/* replicate to all bytes */
+	orrne		r3, r3, r3, lsl #16	/* r3 = fill word from here on */
+	movs		r1, r2		/* r1 = length; r2 becomes scratch */
+	RETc(eq)			/* return if length is 0 */
+	mov		ip, r0		/* ip = cursor; r0 stays the result */
+
+	cmp		r1, #12		/* is this a small memset? */
+	blo		.Lbyte_by_byte	/*   then do it byte by byte */
+
+	/* Ok first we will dword align the address */
+	ands		r2, ip, #7	/* grab the bottom three bits */
+	beq		.Lmemset_dwordaligned	/* The addr is dword aligned */
+
+	rsb		r2, r2, #8	/* how far until dword aligned? */
+	sub		r1, r1, r2	/* subtract it from remaining length */
+	mov		r2, r3		/* duplicate fill value */
+
+	tst		ip, #1		/* halfword aligned? */
+	strneb		r3, [ip], #1	/*   no, write a byte */
+	tst		ip, #2		/* word aligned? */
+	strneh		r3, [ip], #2	/*   no, write a halfword */
+	tst		ip, #4		/* dword aligned? */
+	strne		r3, [ip], #4	/*   no, write a word */
+
+	/* We are now doubleword aligned */
+.Lmemset_dwordaligned:
+#if defined(NEON)
+	vdup.8		q0, r3		/* move fill to SIMD */
+	vmov		q1, q0		/* put fill in q1 (d2-d3) */
+#elif defined(VFP)
+	mov		r2, r3		/* r2 is 0 if we branched here */
+	vmov		d0, r2, r3	/* move to VFP */
+	vmov		d1, r2, r3
+	vmov		d2, r2, r3
+	vmov		d3, r2, r3
+#endif
+
+#if 1
+	cmp		r1, #128
+	blo		.Lmemset_mainloop
+	ands		r2, ip, #63	/* check for 64-byte alignment */
+	beq		.Lmemset_mainloop
+	/*
+	 * Align to a 64-byte boundary so that stores don't cross cacheline
+	 * boundaries.  At least 128 bytes remain, so the length is safe.
+	 */
+	rsb		r2, r2, #64	/* how many bytes until 64 bytes */
+	sub		r1, r1, r2	/* subtract from remaining length */
+#if !defined(NEON) && !defined(VFP)
+	mov		r2, r3		/* put fill back in r2 */
+#endif
+
+	tst		ip, #8		/* quadword aligned? */
+	beq		1f		/*   yes */
+	STORE8				/*   no, store a dword */
+1:	tst		ip, #16		/* octaword aligned? */
+	beq		2f		/*   yes */
+	STORE16				/*   no, store a quadword */
+2:	tst		ip, #32		/* 64-byte aligned? */
+	beq		.Lmemset_mainloop		/*   yes */
+	STORE32				/*   no, make 64-byte aligned */
+#endif
+
+.Lmemset_mainloop:
+#if !defined(NEON) && !defined(VFP)
+	mov		r2, r3		/* put fill back in r2 */
+#endif
+	subs		r1, r1, #64	/* subtract an initial 64 */
+	blo		.Lmemset_lessthan_64bytes	/* unsigned: len is size_t */
+
+3:	STORE32				/* store first octaword */
+	STORE32				/* store second octaword */
+	RETc(eq)			/* done if the last subs hit zero */
+	subs		r1, r1, #64	/* subtract another 64 */
+	bhs		3b		/* and do other if no borrow */
+.Lmemset_lessthan_64bytes:
+	and		r1, r1, #63	/* drop the borrow: r1 = bytes left */
+	tst		r1, #32		/* do we have 32 bytes left? */
+	beq		.Lmemset_lessthan_32bytes
+	STORE32				/*    yes, store an octaword */
+	bics		r1, r1, #32	/* subtract 32 */
+	RETc(eq)			/* return if length is 0 */
+.Lmemset_lessthan_32bytes:
+	tst		r1, #16		/* do we have 16 bytes left? */
+	beq		.Lmemset_lessthan_16bytes
+	STORE16				/*   yes, store a quadword */
+	bics		r1, r1, #16	/* subtract 16 */
+	RETc(eq)			/* return if length is 0 */
+.Lmemset_lessthan_16bytes:
+	tst		r1, #8		/* do we have 8 bytes left? */
+	beq		.Lmemset_lessthan_8bytes	/*   no */
+	STORE8				/*   yes, store a dword */
+	bics		r1, r1, #8	/* subtract 8 */
+	RETc(eq)			/* return if length is 0 */
+.Lmemset_lessthan_8bytes:
+	tst		r1, #4		/* do we have a word left? */
+	strne		r3, [ip], #4	/*   yes; use r3, r2 may be scratch */
+	tst		r1, #2		/* do we have a halfword left? */
+	strneh		r3, [ip], #2	/*   yes, so write one */
+	tst		r1, #1		/* do we have a byte left? */
+	strneb		r3, [ip], #1	/*   yes, so write one */
+	RET				/* return */
+
+.Lbyte_by_byte:
+	subs		r1, r1, #1	/* can we write a byte? (r1 < 12) */
+	RETc(lt)			/*   no, we're done */
+	strb		r3, [ip], #1	/*   yes, so do it */
+	b		.Lbyte_by_byte	/* try next byte */
+END(memset)

Reply via email to