Module Name:    src
Committed By:   matt
Date:           Sat Dec 15 22:23:31 UTC 2012

Modified Files:
        src/common/lib/libc/arch/arm/string: strlen_neon.S

Log Message:
Slightly improved (can deal with all 16 bytes being non-NUL and quickly
proceed to the next qword).


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/common/lib/libc/arch/arm/string/strlen_neon.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/common/lib/libc/arch/arm/string/strlen_neon.S
diff -u src/common/lib/libc/arch/arm/string/strlen_neon.S:1.1 src/common/lib/libc/arch/arm/string/strlen_neon.S:1.2
--- src/common/lib/libc/arch/arm/string/strlen_neon.S:1.1	Sat Dec 15 19:26:34 2012
+++ src/common/lib/libc/arch/arm/string/strlen_neon.S	Sat Dec 15 22:23:31 2012
@@ -29,7 +29,7 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: strlen_neon.S,v 1.1 2012/12/15 19:26:34 matt Exp $")
+RCSID("$NetBSD: strlen_neon.S,v 1.2 2012/12/15 22:23:31 matt Exp $")
 	.text
 
 ENTRY(strlen)
@@ -39,6 +39,9 @@ ENTRY(strlen)
 	veor	q2, q2, q2	/* clear mask */
 	mov	r3, #7		/* NBBY - 1 */
 	vdup.32	q3, r3		/* dup throughout q3 */
+	mov	r3, #0x04	/* magic since there are 4 bytes per U32 */
+	orr	r3, r3, lsl #8	/* copy to next 8 bits */
+	orr	r3, r3, lsl #16	/* copy to upper 16 bits */
 	beq	.Lmain_loop
 	veor	q0, q0, q0	/* clear q0 */
 	vmvn	q2, q2		/* set all 16 bytes of mask to all 1s */
@@ -64,22 +67,23 @@ ENTRY(strlen)
 	vorr	q0, q0, q2	/* or "in" leading byte mask */
 	veor	q2, q2, q2	/* clear byte mask */
 	vceq.i8	q1, q0, #0	/* test each byte for 0 */
+	/* Why couldn't there be a 64-bit CLZ? */
 	vclz.i32 q1, q1		/* count leading zeroes to find the 0 byte */
 	vadd.i32 q1, q1, q3	/* round up to byte bounary */
 	vshr.u32 q1, q1, #3	/* convert to bytes */
-	vmov	r2, r3, d3	/* get lo & hi counts */
-	add	r0, r0, r3	/* add bytes to count */
-	cmp	r3, #4		/* less than 4 means a NUL encountered */
-	bxlt	lr		/* return */
-	add	r0, r0, r2	/* add bytes to count */
-	cmp	r2, #4		/* less than 4 means a NUL encountered */
-	bxlt	lr		/* return */
-	vmov	r2, r3, d2	/* get lo & hi counts */
-	add	r0, r0, r3	/* add bytes to count */
-	cmp	r3, #4		/* less than 4 means a NUL encountered */
-	bxlt	lr		/* return */
-	add	r0, r0, r2	/* add bytes to count */
-	cmp	r2, #4		/* less than 4 means a NUL encountered */
-	bxlt	lr		/* return */
-	b	.Lmain_loop
+	vmovn.i32 d0, q1	/* 4 I32 -> 4 I16 */
+	vmovn.i16 d0, q0	/* 4 I16 -> 4  I8 */
+	vmov	r2, s0		/* get counts */
+	cmp	r2, r3		/* count eq 4 in each byte? */
+	addeq	r0, #16		/*  no NULs */
+	beq	.Lmain_loop	/* get next qword */
+				/* r2[31:24] already has 1st word byte count */
+	tst	r2, #(4 << 24)	/* first word has 4 non-NUL? */
+	addne	r2, r2, r2, lsl #8 /* add second word byte-count */
+	tstne	r2, #(4 << 16)	/* second word has 4 non-NUL? */
+	addne	r2, r2, r2, lsl #16 /* add thirs word byte-count */
+	tstne	r2, #(4 << 8)	/* third has 4 non-NULL? */
+	addne	r2, r2, r2, lsl #24 /* add fourth word byte-count */
+	add	r0, r0, r2, lsr #24 /* add accumulated byte-count to length */
+	RET			/* and return. */
 END(strlen)

Reply via email to