Module Name:    src
Committed By:   ryo
Date:           Tue Aug 29 15:00:23 UTC 2017

Modified Files:
        src/common/lib/libc/arch/aarch64/string: memset.S

Log Message:
* aarch64/memset.S did not work; fix several bugs.
* Support the maximum block size given by DCZID_EL0:BS (2048 bytes).


To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 src/common/lib/libc/arch/aarch64/string/memset.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/common/lib/libc/arch/aarch64/string/memset.S
diff -u src/common/lib/libc/arch/aarch64/string/memset.S:1.1 src/common/lib/libc/arch/aarch64/string/memset.S:1.2
--- src/common/lib/libc/arch/aarch64/string/memset.S:1.1	Sun Aug 10 05:47:35 2014
+++ src/common/lib/libc/arch/aarch64/string/memset.S	Tue Aug 29 15:00:23 2017
@@ -1,4 +1,4 @@
-/* $NetBSD: memset.S,v 1.1 2014/08/10 05:47:35 matt Exp $ */
+/* $NetBSD: memset.S,v 1.2 2017/08/29 15:00:23 ryo Exp $ */
 
 /*-
  * Copyright (c) 2014 The NetBSD Foundation, Inc.
@@ -133,7 +133,7 @@ ENTRY(memset)
 	add	x13, x15, x2	/* get ending address */
 	asr	x13, x13, x9	/* "ending" block numebr */
 	cmp	x13, x12	/* how many blocks? */
-	b.eq	.Lfilled	/*   none, do it 16 bytes at a time */
+	b.ls	.Lfilled	/*   none, do it 16 bytes at a time */
 
 	/*
 	 * Now we have one or more blocks to deal with.  First now we need
@@ -144,7 +144,7 @@ ENTRY(memset)
 
 	sub	x7, x10, x7	/* subtract offset from block length */
 	sub	x2, x2, x7	/* subtract that from length */
-	asr	x7, x7, #2	/* qword -> word */
+	asr	x7, x7, #4	/* length -> N*16 */
 
 	tbz	x15, #0, .Lzero_hword_aligned
 	strb	wzr, [x15], #1
@@ -158,28 +158,18 @@ ENTRY(memset)
 	tbz	x15, #3, .Lzero_qword_aligned
 	str	xzr, [x15], #8
 .Lzero_qword_aligned:
-	cbz	x7, .Lblock_aligned /* no qwords? just branch */
-	adr	x6, .Lblock_aligned
-	sub	x6, x6, x7	/* backup to write the last N qwords */
-	br	x6		/* and do it */
+	cbz	x7, .Lblock_aligned	/* less than 16 bytes? just branch */
+	adr	x6, .Lunrolled_end
+	sub	x6, x6, x7, lsl #2	/* backup to write the last N insn */
+	br	x6			/* and do it */
+
 	/*
-	 * This is valid for cache lines <= 256 bytes.
+	 * The maximum size of DCZID_EL0:BS supported is 2048 bytes.
 	 */
+	.rept (2048 / 16) - 1
 	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
-	stp	xzr, xzr, [x15], #16
+	.endr
+.Lunrolled_end:
 
 /*
  * Now we are block aligned.
@@ -193,7 +183,7 @@ ENTRY(memset)
 	ret
 
 .Lblock_done:
-	and	x2, x2, x12	/* make positive again */
+	and	x2, x2, x11	/* make positive again */
 	mov	x6, xzr		/* fill 2nd xword */
 	b	.Lqword_loop	/* and finish filling */
 

Reply via email to