Module Name:    src
Committed By:   skrll
Date:           Sun Feb 4 21:52:17 UTC 2018
Modified Files:
        src/common/lib/libc/arch/aarch64/string: memcmp.S memcpy.S
Added Files:
        src/common/lib/libc/arch/aarch64/string: bcopy.S memmove.S

Log Message:
Working / new versions from Ryo Shimizu

To generate a diff of this commit:
cvs rdiff -u -r0 -r1.1 src/common/lib/libc/arch/aarch64/string/bcopy.S \
    src/common/lib/libc/arch/aarch64/string/memmove.S
cvs rdiff -u -r1.1 -r1.2 src/common/lib/libc/arch/aarch64/string/memcmp.S \
    src/common/lib/libc/arch/aarch64/string/memcpy.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
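The most subtle part of the new memcmp.S below is how it produces the return
value once two dwords differ: it isolates the first mismatching byte in memory
order (rev/clz on the difference, then a shift) and returns the difference of
those bytes. As a rough illustration only -- this C sketch is not part of the
commit, the helper names are invented, and it assumes a little-endian host
(the assembly also handles big-endian) -- the idea is along these lines:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Return <0, 0 or >0 from two differing dwords loaded little-endian. */
static int
first_diff_byte(uint64_t a, uint64_t b)
{
	uint64_t diff = a ^ b;			/* equal bytes become 0 */
	unsigned byte = __builtin_ctzll(diff) / 8; /* first differing byte */
	return (int)((a >> (byte * 8)) & 0xff) -
	       (int)((b >> (byte * 8)) & 0xff);
}

static int
memcmp_sketch(const void *s1, const void *s2, size_t n)
{
	const unsigned char *p1 = s1, *p2 = s2;

	while (n >= 8) {
		uint64_t a, b;
		memcpy(&a, p1, 8);		/* dword-at-a-time compare */
		memcpy(&b, p2, 8);
		if (a != b)
			return first_diff_byte(a, b);
		p1 += 8; p2 += 8; n -= 8;
	}
	for (; n > 0; n--, p1++, p2++)
		if (*p1 != *p2)
			return (int)*p1 - (int)*p2;
	return 0;
}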
Modified files: Index: src/common/lib/libc/arch/aarch64/string/memcmp.S diff -u src/common/lib/libc/arch/aarch64/string/memcmp.S:1.1 src/common/lib/libc/arch/aarch64/string/memcmp.S:1.2 --- src/common/lib/libc/arch/aarch64/string/memcmp.S:1.1 Sun Aug 10 05:47:35 2014 +++ src/common/lib/libc/arch/aarch64/string/memcmp.S Sun Feb 4 21:52:16 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: memcmp.S,v 1.1 2014/08/10 05:47:35 matt Exp $ */ +/* $NetBSD: memcmp.S,v 1.2 2018/02/04 21:52:16 skrll Exp $ */ /*- * Copyright (c) 2014 The NetBSD Foundation, Inc. @@ -31,7 +31,7 @@ #include <machine/asm.h> -RCSID("$NetBSD: memcmp.S,v 1.1 2014/08/10 05:47:35 matt Exp $") +RCSID("$NetBSD: memcmp.S,v 1.2 2018/02/04 21:52:16 skrll Exp $") ENTRY(memcmp) mov x9, x0 @@ -42,14 +42,14 @@ ENTRY(memcmp) cmp x2, #6 b.eq .Lmemcmp_6bytes #endif - cmp x2, #7 + cmp x2, #8 b.ls .Lmemcmp_lessthan8 ands x3, x9, #7 b.eq .Lmemcmp_dword_loop /* - * The two addresses have identical alignment but are not yet dword aligned. + * The src1 address is not dword aligned. */ add x2, x2, x3 /* add unalignment to length */ sub x2, x2, #8 /* now subtract a dword */ @@ -68,14 +68,7 @@ ENTRY(memcmp) lsr x6, x6, x3 /* discard leading bytes from data2 */ #endif subs x0, x4, x6 /* compare data */ -#ifdef __AARCH64EL__ b.ne .Lmemcmp_last_compare /* difference. find it */ -#else - b.eq .Lmemcmp_dword_loop /* no difference. go to loop */ - rev x4, x4 /* byte swap data1 */ - rev x6, x6 /* byte swap data2 */ - b .Lmemcmp_last_compare /* go find the difference. */ -#endif .Lmemcmp_dword_loop: subs x2, x2, #8 @@ -84,10 +77,6 @@ ENTRY(memcmp) ldr x6, [x10], #8 subs x0, x4, x6 b.eq .Lmemcmp_dword_loop /* no difference. go to loop */ -#ifdef __AARCH64EB__ - rev x4, x4 /* byte swap data1 */ - rev x6, x6 /* byte swap data2 */ -#endif b .Lmemcmp_last_compare /* go find the difference. */ .Lmemcmp_finish_dword: @@ -96,6 +85,8 @@ ENTRY(memcmp) */ tst x2, #7 b.eq .Lmemcmp_ret + mov x4, xzr + mov x6, xzr /* * */ @@ -120,16 +111,18 @@ ENTRY(memcmp) #endif .Lmemcmp_finish_hword: -#ifdef __AARCH64EB__ - rev x4, x4 /* byte swap data1 */ - rev x6, x6 /* byte swap data1 */ -#endif - tbz x2, #0, .Lmemcmp_last_compare + tbz x2, #0, .Lmemcmp_last_compare0 + ldrb w5, [x9] ldrb w7, [x10] +#ifdef __AARCH64EB__ + orr x4, x4, x5, lsl #8 + orr x6, x6, x7, lsl #8 +#else orr x4, x4, x5, lsl #48 orr x6, x6, x7, lsl #48 - b .Lmemcmp_last_compare /* go find the difference. */ +#endif + b .Lmemcmp_last_compare0 /* go find the difference. */ /* * D @@ -167,7 +160,7 @@ ENTRY(memcmp) #endif /* _KERNEL */ /* - * We have loaded the final bytes in x4 and x6 in LE format. Now we have + * We have loaded the final bytes in x4 and x6 in host-endian. Now we have * to figure what the difference is (if any). First we subtract. Any bytes * that are the same will be 0. So to find the first non-zero byte we byterev * and then use clz to find that byte. @@ -175,13 +168,25 @@ ENTRY(memcmp) * data dwords left to remove the equal part. Then we shift right to discard * the trailing bytes. Then we subtract and return. 
*/ +.Lmemcmp_last_compare0: subs x0, x4, x6 b.eq .Lmemcmp_ret .Lmemcmp_last_compare: - rev x1, x0 /* byte reverse */ +#if __AARCH64EB__ + clz x1, x0 /* find first non-zero byte */ + rev x0, x0 +#else + rev x1, x0 clz x1, x1 /* find first non-zero byte */ - bfi x1, xzr, #0, #3 /* make it byte aligned */ - lsr x0, x0, x1 /* shift to LSB */ - sxtb w0, w0 /* sign extend */ +#endif + bfi x1, xzr, #0, #3 /* make it byte aligned */ + lsr x1, x0, x1 /* shift to LSB */ +#if __AARCH64EL__ + rev x4, x4 /* byte reverse */ + rev x6, x6 /* byte reverse */ +#endif + subs x0, x4, x6 + csetm x0, cc /* set mask bits as sign */ + bfm x0, x1, #0, #7 /* extend with sign bit */ ret END(memcmp) Index: src/common/lib/libc/arch/aarch64/string/memcpy.S diff -u src/common/lib/libc/arch/aarch64/string/memcpy.S:1.1 src/common/lib/libc/arch/aarch64/string/memcpy.S:1.2 --- src/common/lib/libc/arch/aarch64/string/memcpy.S:1.1 Sun Aug 10 05:47:35 2014 +++ src/common/lib/libc/arch/aarch64/string/memcpy.S Sun Feb 4 21:52:16 2018 @@ -1,126 +1,4 @@ -/* $NetBSD: memcpy.S,v 1.1 2014/08/10 05:47:35 matt Exp $ */ +/* $NetBSD: memcpy.S,v 1.2 2018/02/04 21:52:16 skrll Exp $ */ -/*- - * Copyright (c) 2014 The NetBSD Foundation, Inc. - * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by Matt Thomas of 3am Software Foundry. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include <machine/asm.h> - -RCSID("$NetBSD: memcpy.S,v 1.1 2014/08/10 05:47:35 matt Exp $") - -/* LINTSTUB: void *memcpy(void * restrict, const void * restrict, size_t); */ - -ENTRY(memcpy) - mov x10, x0 - mov x11, x1 - cbz x2, .Lmemcpy_ret - - cmp x2, #7 - b.ls .Lmemcpy_last_dword - - ands x3, x10, #7 - b.eq .Lmemcpy_dword_aligned - -/* - * The dst address doesn't have dword alignment. The src address may or may - * not have the same alignment. Make dst dword aligned. Hope src will be - * dword aligned but if it isn't, take advantage of unaligned access. 
- */ - add x2, x2, x3 /* add unalignment to length */ - sub x2, x2, #8 /* now subtract a dword */ - - tbz x10, #0, .Lmemcpy_hword_aligned - ldrb w4, [x11], #1 - strb w4, [x10], #1 -.Lmemcpy_hword_aligned: - tbz x10, #1, .Lmemcpy_word_aligned - ldrh w4, [x11], #2 - strh w4, [x10], #2 -.Lmemcpy_word_aligned: - tbz x10, #2, .Lmemcpy_dword_aligned - ldr w4, [x11], #4 - str w4, [x10], #4 -.Lmemcpy_dword_aligned: - /* - * destination is now dword aligned. - */ - subs x2, x2, #32 - b.mi .Lmemcpy_last_oword - -.Lmemcpy_oword_loop: - ldp x4, x5, [x11], #16 - ldp x6, x7, [x11], #16 - stp x4, x5, [x10], #16 - stp x6, x7, [x10], #16 - cbz x2, .Lmemcpy_ret - subs x2, x2, #32 - b.pl .Lmemcpy_oword_loop - -.Lmemcpy_last_oword: - /* - * We have 31 bytes or less to copy. First see if we can write a qword - */ - tbz x2, #4, .Lmemcpy_last_qword - ldp x4, x5, [x11], #16 /* read word */ - stp x4, x5, [x10], #16 /* write word */ - -.Lmemcpy_last_qword: - /* - * We have 15 bytes or less to copy. First see if we can write a dword - */ - tbz x2, #3, .Lmemcpy_last_dword - ldr x4, [x11], #8 /* read word */ - str x4, [x10], #8 /* write word */ - -.Lmemcpy_last_dword: - /* - * We have 7 bytes or less to copy. First see if we can write a word - */ - tbz x2, #2, .Lmemcpy_last_word - ldr w4, [x11], #4 /* read word */ - str w4, [x10], #4 /* write word */ - -.Lmemcpy_last_word: - /* - * We have 3 bytes or less to copy. First see if we can write a hword - */ - tbz x2, #1, .Lmemcpy_last_hword - ldrh w4, [x11], #2 - strh w4, [x10], #2 - -.Lmemcpy_last_hword: - /* - * We have 1 or none bytes to copy. - */ - tbz x2, #0, .Lmemcpy_ret - ldrb w4, [x11] - strb w4, [x10] - -.Lmemcpy_ret: - ret -END(memcpy) +#define MEMCOPY +#include "bcopy.S" Added files: Index: src/common/lib/libc/arch/aarch64/string/bcopy.S diff -u /dev/null src/common/lib/libc/arch/aarch64/string/bcopy.S:1.1 --- /dev/null Sun Feb 4 21:52:17 2018 +++ src/common/lib/libc/arch/aarch64/string/bcopy.S Sun Feb 4 21:52:16 2018 @@ -0,0 +1,990 @@ +/* $NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $ */ + +/* + * Copyright (c) 2018 Ryo Shimizu <r...@nerv.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <machine/asm.h> + +#if defined(LIBC_SCCS) +RCSID("$NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $") +#endif + +#if defined(MEMCOPY) + +/* + * void *memcpy(void * restrict dst, const void * restrict src, size_t len); + */ +#define FUNCTION memcpy +#define NO_OVERLAP +#define SRC0 x1 +#define DST0 x0 +#define LEN x2 + +#elif defined(MEMMOVE) + +/* + * void *memmove(void *dst, const void *src, size_t len); + */ +#define FUNCTION memmove +#undef NO_OVERLAP +#define SRC0 x1 +#define DST0 x0 +#define LEN x2 + +#else /* !MEMCOPY && !MEMMOVE */ + +/* + * void bcopy(const void *src, void *dst, size_t len); + */ +#define FUNCTION bcopy +#define NO_OVERLAP +#define SRC0 x0 +#define DST0 x1 +#define LEN x2 + +#endif /* MEMCOPY/MEMMOVE/BCOPY */ + +/* caller-saved temporary registers. breakable. */ +#define TMP_X x3 +#define TMP_Xw w3 +#define TMP_D x4 +#define TMP_S x5 +#define DST x6 +#define SRC x7 +#define DATA0 x8 +#define DATA0w w8 +#define DATA1 x9 +#define DATA1w w9 +#define DATA2 x10 +#define SRC_ALIGNBIT x11 /* (SRC & 7) * 8 */ +#define DST_ALIGNBIT x12 /* (DST & 7) * 8 */ +#define SRC_DST_ALIGNBIT x13 /* = SRC_ALIGNBIT - DST_ALIGNBIT */ +#define DST_SRC_ALIGNBIT x14 /* = -SRC_DST_ALIGNBIT */ + +#define STP_ALIGN 16 /* align before stp/ldp. 8 or 16 */ +#define SMALLSIZE 32 + + .text + .align 5 + +#ifndef NO_OVERLAP +#ifndef STRICT_ALIGNMENT +backward_ignore_align: + prfm PLDL1KEEP, [SRC0] + add SRC0, SRC0, LEN + add DST, DST0, LEN + cmp LEN, #SMALLSIZE + bcs copy_backward +copy_backward_small: + cmp LEN, #8 + bcs 9f + + /* 0 <= len < 8 */ + /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */ + tbz LEN, #2, 1f + ldr TMP_Xw, [SRC0, #-4]! + str TMP_Xw, [DST, #-4]! +1: + /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */ + tbz LEN, #1, 1f + ldrh TMP_Xw, [SRC0, #-2]! + strh TMP_Xw, [DST, #-2]! +1: + /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */ + tbz LEN, #0, 1f + ldrb TMP_Xw, [SRC0, #-1]! + strb TMP_Xw, [DST, #-1]! +1: + ret +9: + + cmp LEN, #16 + bcs 9f + + /* 8 <= len < 16 */ + /* *--(uint64_t *)dst = *--(uint64_t *)src; */ + ldr TMP_X, [SRC0, #-8]! + str TMP_X, [DST, #-8]! + /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */ + tbz LEN, #2, 1f + ldr TMP_Xw, [SRC0, #-4]! + str TMP_Xw, [DST, #-4]! +1: + /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */ + tbz LEN, #1, 1f + ldrh TMP_Xw, [SRC0, #-2]! + strh TMP_Xw, [DST, #-2]! +1: + /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */ + tbz LEN, #0, 1f + ldrb TMP_Xw, [SRC0, #-1]! + strb TMP_Xw, [DST, #-1]! +1: + ret +9: + + /* 16 <= len < 32 */ + ldp DATA0, DATA1, [SRC0, #-16]! + stp DATA0, DATA1, [DST, #-16]! + /* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */ + tbz LEN, #3, 1f + ldr TMP_X, [SRC0, #-8]! + str TMP_X, [DST, #-8]! +1: + /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */ + tbz LEN, #2, 1f + ldr TMP_Xw, [SRC0, #-4]! + str TMP_Xw, [DST, #-4]! +1: + /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */ + tbz LEN, #1, 1f + ldrh TMP_Xw, [SRC0, #-2]! + strh TMP_Xw, [DST, #-2]! +1: + /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */ + tbz LEN, #0, 1f + ldrb TMP_Xw, [SRC0, #-1]! + strb TMP_Xw, [DST, #-1]! 
+1: + ret +#endif /* !STRICT_ALIGNMENT */ + + .align 4 +copy_backward: + /* DST is not aligned at this point */ +#ifndef STRICT_ALIGNMENT + cmp LEN, #512 /* pre-alignment can be overhead when small */ + bcc 9f +#endif + /* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */ + tbz DST, #0, 1f + ldrb TMP_Xw, [SRC0, #-1]! + strb TMP_Xw, [DST, #-1]! + sub LEN, LEN, #1 +1: + /* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */ + tbz DST, #1, 1f + ldrh TMP_Xw, [SRC0, #-2]! + strh TMP_Xw, [DST, #-2]! + sub LEN, LEN, #2 +1: + /* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */ + tbz DST, #2, 1f + ldr TMP_Xw, [SRC0, #-4]! + str TMP_Xw, [DST, #-4]! + sub LEN, LEN, #4 +1: +#if (STP_ALIGN > 8) + /* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */ + tbz DST, #3, 1f + ldr TMP_X, [SRC0, #-8]! + str TMP_X, [DST, #-8]! + sub LEN, LEN, #8 +1: +#endif /* (STP_ALIGN > 8) */ +9: + + cmp LEN, #1024 + bhs backward_copy1k +backward_less1k: + /* copy 16*n bytes */ + and TMP_D, LEN, #(1023-15) /* len &= 1023; len &= ~15; */ + adr TMP_X, 8f + sub LEN, LEN, TMP_D + sub TMP_X, TMP_X, TMP_D, lsr #1 /* jump to (8f - len/2) */ + br TMP_X +backward_copy1k: /* copy 16*64 bytes */ + sub LEN, LEN, #1024 + .rept (1024 / 16) + ldp DATA0, DATA1, [SRC0, #-16]! /* *--dst = *--src; */ + stp DATA0, DATA1, [DST, #-16]! + .endr +8: + cbz LEN, done + cmp LEN, #1024 + bhs backward_copy1k + cmp LEN, #16 + bhs backward_less1k + + /* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */ + tbz LEN, #4, 1f + ldp DATA0, DATA1, [SRC0, #-16]! + ldp DATA0, DATA1, [DST, #-16]! +1: + /* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */ + tbz LEN, #3, 1f + ldr TMP_X, [SRC0, #-8]! + str TMP_X, [DST, #-8]! +1: + /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */ + tbz LEN, #2, 1f + ldr TMP_Xw, [SRC0, #-4]! + str TMP_Xw, [DST, #-4]! +1: + /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */ + tbz LEN, #1, 1f + ldrh TMP_Xw, [SRC0, #-2]! + strh TMP_Xw, [DST, #-2]! +1: + /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */ + tbz LEN, #0, 1f + ldrb TMP_Xw, [SRC0, #-1]! + strb TMP_Xw, [DST, #-1]! +1: + ret +#endif /* !NO_OVERLAP */ + + +#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) + .align 5 +backward_copy: + prfm PLDL1KEEP, [SRC0] + add DST, DST0, LEN + add SRC0, SRC0, LEN + cmp LEN, #SMALLSIZE + bcs strict_backward + + cmp LEN, #10 + bcs 9f +backward_tiny: + /* copy 1-10 bytes */ + adr TMP_X, 8f + sub TMP_X, TMP_X, LEN, lsl #3 /* jump to (8f - len*2) */ + br TMP_X + .rept 10 + ldrb TMP_Xw, [SRC0, #-1]! + strb TMP_Xw, [DST, #-1]! + .endr +8: + ret +9: + /* length is small(<32), and src or dst may be unaligned */ + eor TMP_X, SRC0, DST0 + ands TMP_X, TMP_X, #7 + bne notaligned_backward_small + +samealign_backward_small: + /* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */ + tbz DST, #0, 1f + ldrb TMP_Xw, [SRC0, #-1]! + strb TMP_Xw, [DST, #-1]! + sub LEN, LEN, #1 +1: + /* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */ + tbz DST, #1, 1f + ldrh TMP_Xw, [SRC0, #-2]! + strh TMP_Xw, [DST, #-2]! + sub LEN, LEN, #2 +1: + /* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */ + tbz DST, #2, 1f + ldr TMP_Xw, [SRC0, #-4]! + str TMP_Xw, [DST, #-4]! + sub LEN, LEN, #4 +1: + /* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */ + tbz LEN, #4, 1f + ldp DATA0, DATA1, [SRC0, #-16]! + stp DATA0, DATA1, [DST, #-16]! 
+1: + /* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */ + tbz LEN, #3, 1f + ldr TMP_X, [SRC0, #-8]! + str TMP_X, [DST, #-8]! +1: + /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */ + tbz LEN, #2, 1f + ldr TMP_Xw, [SRC0, #-4]! + str TMP_Xw, [DST, #-4]! +1: + /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */ + tbz LEN, #1, 1f + ldrh TMP_Xw, [SRC0, #-2]! + strh TMP_Xw, [DST, #-2]! +1: + /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */ + tbz LEN, #0, 1f + ldrb TMP_Xw, [SRC0, #-1]! + strb TMP_Xw, [DST, #-1]! +1: + ret + +notaligned_backward_small: + /* length is small, and src or dst may be unaligned */ + sub TMP_S, SRC0, LEN /* tmp_s = src - len */ +1: /* do { */ + ldrb TMP_Xw, [SRC0, #-1]! + strb TMP_Xw, [DST, #-1]! /* *(char *)dst++ = *(char *)src++ */ + cmp TMP_S, SRC0 /* while (tmp_s < src) */ + blo 1b + ret + +strict_backward: + /* src or dst may be unaligned */ + and SRC_ALIGNBIT, SRC0, #7 + and DST_ALIGNBIT, DST, #7 + lsl SRC_ALIGNBIT, SRC_ALIGNBIT, #3 + lsl DST_ALIGNBIT, DST_ALIGNBIT, #3 + sub SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT + cbz SRC_DST_ALIGNBIT, copy_backward /* same alignment? */ + + and SRC, SRC0, #~7 + and DST, DST, #~7 + neg DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT + +#if BYTE_ORDER == LITTLE_ENDIAN + tbz SRC_DST_ALIGNBIT, #63, 5f /* if(SRC_DST_ALIGNBIT < 0) { */ + + cmp SRC, SRC0 /* don't access out of range */ + beq 1f + ldr DATA1, [SRC] +1: + ldr DATA0, [SRC, #-8]! + + lsl DATA1, DATA1, DST_SRC_ALIGNBIT /* data1 = */ + lsr TMP_X, DATA0, SRC_DST_ALIGNBIT /* (data1<<dst_src_alignbit)| */ + orr DATA1, DATA1, TMP_X /* (data0<<src_dst_alignbit); */ + + b 9f /* } */ +5: /* else { */ + ldr DATA0, [SRC] /* data0 = *src; */ + lsr DATA1, DATA0, SRC_DST_ALIGNBIT /* data1=data0>>src_dst_abit;*/ +9: /* } */ + + cbz DST_ALIGNBIT, 9f /* if (dst_alignbit != 0) { */ + mov TMP_D, DST /* tmp_d = dst; */ + + tbz DST_ALIGNBIT, #(2+3), 1f /* if (dst_ailgnbit & (4<<3)) { */ + str DATA1w, [TMP_D], #4 /* *(uint32_t *)tmp_d++ = data1; */ + lsr DATA1, DATA1, #32 /* data1 >>= 32; */ +1: /* } */ + tbz DST_ALIGNBIT, #(1+3), 1f /* if (dst_ailgnbit & (2<<3)) { */ + strh DATA1w, [TMP_D], #2 /* *(uint16_t *)tmp_d++ = data1; */ + lsr DATA1, DATA1, #16 /* data1 >>= 16; */ +1: /* } */ + tbz DST_ALIGNBIT, #(0+3), 1f /* if (dst_alignbit & (1<<3)) { */ + strb DATA1w, [TMP_D] /* *(uint8_t *)tmp_d = data1; */ +1: /* } */ + + sub LEN, LEN, DST_ALIGNBIT, lsr #3 /* len -=(dst_alignbit>>3); */ +9: /* } */ +#else /* BYTE_ORDER */ + tbz SRC_DST_ALIGNBIT, #63, 5f /* if(SRC_DST_ALIGNBIT < 0) { */ + + cmp SRC, SRC0 /* don't access out of range */ + beq 1f + ldr DATA1, [SRC] +1: + ldr DATA0, [SRC, #-8]! 
+ + lsr DATA1, DATA1, DST_SRC_ALIGNBIT /* data1 = */ + lsl TMP_X, DATA0, SRC_DST_ALIGNBIT /* (data1>>dst_src_alignbit)| */ + orr DATA1, DATA1, TMP_X /* (data0<<src_dst_alignbit); */ + + b 9f /* } */ +5: /* else { */ + ldr DATA0, [SRC] /* data0 = *src; */ + lsr DATA1, DATA0, DST_SRC_ALIGNBIT /* data1=data0<<dst_src_abit;*/ +9: /* } */ + + cbz DST_ALIGNBIT, 9f /* if (dst_alignbit != 0) { */ + mov TMP_D, DST /* tmp_d = dst; */ + + tbz DST_ALIGNBIT, #(2+3), 1f /* if (dst_ailgnbit & (4<<3)) { */ + lsr TMP_X, DATA1, #32 /* x = data1 >> 32; */ + str TMP_Xw, [TMP_D], #4 /* *(uint32_t *)tmp_d++ = x; */ +1: /* } */ + tbz DST_ALIGNBIT, #(1+3), 1f /* if (dst_ailgnbit & (2<<3)) { */ + lsr TMP_X, DATA1, #16 /* x = data1 >> 16; */ + strh TMP_Xw, [TMP_D], #2 /* *(uint16_t *)tmp_d++ = x; */ +1: /* } */ + tbz DST_ALIGNBIT, #(0+3), 1f /* if (dst_alignbit & (1<<3)) { */ + lsr TMP_X, DATA1, #8 /* x = data1 >> 8; */ + strb TMP_Xw, [TMP_D], #1 /* *(uint8_t *)tmp_d++ = x; */ +1: /* } */ + + sub LEN, LEN, DST_ALIGNBIT, lsr #3 /* len -=(dst_alignbit>>3); */ +9: /* } */ +#endif /* BYTE_ORDER */ + + +backward_shifting_copy_loop: + ldp DATA2, DATA1, [SRC, #-16]! +#if BYTE_ORDER == LITTLE_ENDIAN + /* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */ + lsl DATA0, DATA0, DST_SRC_ALIGNBIT + lsr TMP_X, DATA1, SRC_DST_ALIGNBIT + orr DATA0, DATA0, TMP_X + /* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */ + lsl DATA1, DATA1, DST_SRC_ALIGNBIT + lsr TMP_X, DATA2, SRC_DST_ALIGNBIT + orr DATA1, DATA1, TMP_X +#else /* BYTE_ORDER */ + /* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */ + lsr DATA0, DATA0, DST_SRC_ALIGNBIT + lsl TMP_X, DATA1, SRC_DST_ALIGNBIT + orr DATA0, DATA0, TMP_X + /* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */ + lsr DATA1, DATA1, DST_SRC_ALIGNBIT + lsl TMP_X, DATA2, SRC_DST_ALIGNBIT + orr DATA1, DATA1, TMP_X +#endif /* BYTE_ORDER */ + stp DATA1, DATA0, [DST, #-16]! + mov DATA0, DATA2 + sub LEN, LEN, #16 + cmp LEN, #16 + bhs backward_shifting_copy_loop + + + /* write 8 bytes */ + tbz LEN, #3, 9f + + ldr DATA1, [SRC, #-8]! +#if BYTE_ORDER == LITTLE_ENDIAN + /* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */ + lsl DATA0, DATA0, DST_SRC_ALIGNBIT + lsr TMP_X, DATA1, SRC_DST_ALIGNBIT + orr DATA0, DATA0, TMP_X +#else /* BYTE_ORDER */ + /* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */ + lsr DATA0, DATA0, DST_SRC_ALIGNBIT + lsl TMP_X, DATA1, SRC_DST_ALIGNBIT + orr DATA0, DATA0, TMP_X +#endif /* BYTE_ORDER */ + str DATA0, [DST, #-8]! + mov DATA0, DATA1 + sub LEN, LEN, #8 +9: + + cbz LEN, backward_shifting_copy_done + + /* copy last 1-7 bytes */ + and TMP_X, SRC_DST_ALIGNBIT, #63 + cmp LEN, TMP_X, lsr #3 + bls 1f + ldr DATA1, [SRC, #-8]! /* don't access out of range */ +1: + +#if BYTE_ORDER == LITTLE_ENDIAN + /* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */ + lsl DATA0, DATA0, DST_SRC_ALIGNBIT + lsr TMP_X, DATA1, SRC_DST_ALIGNBIT + orr DATA0, DATA0, TMP_X +#else /* BYTE_ORDER */ + /* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */ + lsr DATA0, DATA0, DST_SRC_ALIGNBIT + lsl TMP_X, DATA1, SRC_DST_ALIGNBIT + orr DATA0, DATA0, TMP_X +#endif /* BYTE_ORDER */ + +#if BYTE_ORDER == LITTLE_ENDIAN + tbz LEN, #2, 1f + ror DATA0, DATA0, #32 + str DATA0w, [DST, #-4]! +1: + tbz LEN, #1, 1f + ror DATA0, DATA0, #48 + strh DATA0w, [DST, #-2]! +1: + tbz LEN, #0, 1f + ror DATA0, DATA0, #56 + strb DATA0w, [DST, #-1]! 
+1: +#else /* BYTE_ORDER */ + tbz LEN, #2, 1f + str DATA0w, [DST, #-4]! + lsr DATA0, DATA0, #32 +1: + tbz LEN, #1, 1f + strh DATA0w, [DST, #-2]! + lsr DATA0, DATA0, #16 +1: + tbz LEN, #0, 1f + strb DATA0w, [DST, #-1]! +1: +#endif /* BYTE_ORDER */ +backward_shifting_copy_done: + ret +#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */ + + + .align 5 +ENTRY(FUNCTION) +#ifdef STRICT_ALIGNMENT + cbz LEN, done +#ifndef NO_OVERLAP + cmp SRC0, DST0 + beq done + bcc backward_copy +#endif /* NO_OVERLAP */ + mov DST, DST0 + cmp LEN, #SMALLSIZE + bcs strict_forward + + cmp LEN, #10 + bcs 9f +forward_tiny: + /* copy 1-10 bytes */ + adr TMP_X, 8f + sub TMP_X, TMP_X, LEN, lsl #3 /* jump to (8f - len*2) */ + br TMP_X + .rept 10 + ldrb TMP_Xw, [SRC0], #1 + strb TMP_Xw, [DST], #1 + .endr +8: + ret +9: + /* length is small(<32), and src or dst may be unaligned */ + eor TMP_X, SRC0, DST0 + ands TMP_X, TMP_X, #7 + bne notaligned_forward_small +samealign_forward_small: + /* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */ + tbz DST, #0, 1f + ldrb TMP_Xw, [SRC0], #1 + strb TMP_Xw, [DST], #1 + sub LEN, LEN, #1 +1: + /* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */ + tbz DST, #1, 1f + ldrh TMP_Xw, [SRC0], #2 + strh TMP_Xw, [DST], #2 + sub LEN, LEN, #2 +1: + /* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */ + tbz DST, #2, 1f + ldr TMP_Xw, [SRC0], #4 + str TMP_Xw, [DST], #4 + sub LEN, LEN, #4 +1: + /* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */ + tbz LEN, #4, 1f + ldp DATA0, DATA1, [SRC0], #16 + stp DATA0, DATA1, [DST], #16 +1: + /* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */ + tbz LEN, #3, 1f + ldr TMP_X, [SRC0], #8 + str TMP_X, [DST], #8 +1: + /* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */ + tbz LEN, #2, 1f + ldr TMP_Xw, [SRC0], #4 + str TMP_Xw, [DST], #4 +1: + /* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */ + tbz LEN, #1, 1f + ldrh TMP_Xw, [SRC0], #2 + strh TMP_Xw, [DST], #2 +1: + /* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */ + tbz LEN, #0, 1f + ldrb TMP_Xw, [SRC0], #1 + strb TMP_Xw, [DST], #1 +1: + ret + +notaligned_forward_small: + /* src and dst are not aligned... */ + prfm PLDL1KEEP, [SRC0] + prfm PLDL1KEEP, [SRC0, #8] + prfm PLDL1KEEP, [SRC0, #16] + add TMP_S, SRC0, LEN /* tmp_s = src + len */ +1: /* do { */ + ldrb TMP_Xw, [SRC0], #1 + strb TMP_Xw, [DST], #1 /* *(char *)dst++ = *(char *)src++ */ + cmp SRC0, TMP_S /* while (src < tmp_s); */ + blo 1b + ret + +strict_forward: + /* src or dst may be unaligned */ + and SRC_ALIGNBIT, SRC0, #7 + and DST_ALIGNBIT, DST0, #7 + lsl SRC_ALIGNBIT, SRC_ALIGNBIT, #3 + lsl DST_ALIGNBIT, DST_ALIGNBIT, #3 + sub SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT + cbz SRC_DST_ALIGNBIT, copy_forward /* same alignment? 
*/ + + and SRC, SRC0, #~7 + and DST, DST0, #~7 + neg DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT + +#if BYTE_ORDER == LITTLE_ENDIAN + tbz DST_SRC_ALIGNBIT, #63, 5f /* if(DST_SRC_ALIGNBIT < 0) { */ + ldp DATA1, DATA0, [SRC], #16 + neg TMP_X, SRC_ALIGNBIT + lsr DATA1, DATA1, SRC_ALIGNBIT /* data1 = */ + lsl TMP_X, DATA0, TMP_X /* (data1 >> src_alignbit) | */ + orr DATA1, DATA1, TMP_X /* (data0 << -src_alignbit); */ + b 9f +5: + ldr DATA0, [SRC], #8 + lsr DATA1, DATA0, SRC_ALIGNBIT +9: + + cbz DST_ALIGNBIT, 5f + mov TMP_D, DST0 + /* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */ + tbz TMP_D, #0, 1f + strb DATA1w, [TMP_D], #1 + lsr DATA1, DATA1, #8 +1: + /* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */ + tbz TMP_D, #1, 1f + strh DATA1w, [TMP_D], #2 + lsr DATA1, DATA1, #16 +1: + /* if (tmp-d & 4) { *(uint32_t *)tmp_d++ = data1; } */ + tbz TMP_D, #2, 1f + str DATA1w, [TMP_D], #4 +1: + add DST, DST, #8 + b 9f +5: + str DATA1, [DST], #8 +9: + sub LEN, LEN, #8 + add LEN, LEN, DST_ALIGNBIT, lsr #3 +#else /* BYTE_ORDER */ + tbz DST_SRC_ALIGNBIT, #63, 5f /* if(DST_SRC_ALIGNBIT < 0) { */ + ldp DATA1, DATA0, [SRC], #16 + neg TMP_X, SRC_ALIGNBIT + lsl DATA1, DATA1, SRC_ALIGNBIT /* data1 = */ + lsr TMP_X, DATA0, TMP_X /* (data1 << src_alignbit) | */ + orr DATA1, DATA1, TMP_X /* (data0 >> -src_alignbit); */ + b 9f +5: + ldr DATA0, [SRC], #8 + lsl DATA1, DATA0, SRC_ALIGNBIT +9: + + cbz DST_ALIGNBIT, 5f + mov TMP_D, DST0 + /* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */ + tbz TMP_D, #0, 1f + lsr TMP_X, DATA1, #56 + strb TMP_Xw, [TMP_D], #1 +1: + /* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */ + tbz TMP_D, #1, 1f + lsr TMP_X, DATA1, #48 + strh TMP_Xw, [TMP_D], #2 +1: + /* if (tmp-d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */ + tbz TMP_D, #2, 1f + lsr TMP_X, DATA1, #32 + str TMP_Xw, [TMP_D], #4 +1: + add DST, DST, #8 + b 9f +5: + str DATA1, [DST], #8 +9: + sub LEN, LEN, #8 + add LEN, LEN, DST_ALIGNBIT, lsr #3 +#endif /* BYTE_ORDER */ + +shifting_copy_loop: + ldp DATA1, DATA2, [SRC], #16 +#if BYTE_ORDER == LITTLE_ENDIAN + /* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */ + lsr DATA0, DATA0, SRC_DST_ALIGNBIT + lsl TMP_X, DATA1, DST_SRC_ALIGNBIT + orr DATA0, DATA0, TMP_X + /* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */ + lsr DATA1, DATA1, SRC_DST_ALIGNBIT + lsl TMP_X, DATA2, DST_SRC_ALIGNBIT + orr DATA1, DATA1, TMP_X +#else /* BYTE_ORDER */ + /* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */ + lsl DATA0, DATA0, SRC_DST_ALIGNBIT + lsr TMP_X, DATA1, DST_SRC_ALIGNBIT + orr DATA0, DATA0, TMP_X + /* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */ + lsl DATA1, DATA1, SRC_DST_ALIGNBIT + lsr TMP_X, DATA2, DST_SRC_ALIGNBIT + orr DATA1, DATA1, TMP_X +#endif /* BYTE_ORDER */ + stp DATA0, DATA1, [DST], #16 + mov DATA0, DATA2 + sub LEN, LEN, #16 + cmp LEN, #16 + bhs shifting_copy_loop + + + /* write 8 bytes */ + tbz LEN, #3, 9f + ldr DATA1, [SRC], #8 +#if BYTE_ORDER == LITTLE_ENDIAN + /* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */ + lsr DATA0, DATA0, SRC_DST_ALIGNBIT + lsl TMP_X, DATA1, DST_SRC_ALIGNBIT + orr DATA0, DATA0, TMP_X +#else /* BYTE_ORDER */ + /* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */ + lsl DATA0, DATA0, SRC_DST_ALIGNBIT + lsr TMP_X, DATA1, DST_SRC_ALIGNBIT + orr DATA0, DATA0, TMP_X +#endif /* BYTE_ORDER */ + str DATA0, [DST], #8 + mov DATA0, DATA1 + sub LEN, LEN, #8 +9: + + cbz LEN, shifting_copy_done + + /* copy last 1-7 bytes */ + and 
TMP_X, DST_SRC_ALIGNBIT, #63 + cmp LEN, TMP_X, lsr #3 + bls 1f + ldr DATA1, [SRC], #8 /* don't access out of range */ +1: + +#if BYTE_ORDER == LITTLE_ENDIAN + /* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */ + lsr DATA0, DATA0, SRC_DST_ALIGNBIT + lsl TMP_X, DATA1, DST_SRC_ALIGNBIT + orr DATA0, DATA0, TMP_X +#else /* BYTE_ORDER */ + /* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */ + lsl DATA0, DATA0, SRC_DST_ALIGNBIT + lsr TMP_X, DATA1, DST_SRC_ALIGNBIT + orr DATA0, DATA0, TMP_X +#endif /* BYTE_ORDER */ + +#if BYTE_ORDER == LITTLE_ENDIAN + /* if (len & 4) { *(uint32_t *)dst++ = data0; } */ + tbz LEN, #2, 1f + str DATA0w, [DST], #4 + lsr DATA0, DATA0, #32 +1: + /* if (len & 2) { *(uint16_t *)dst++ = data0; } */ + tbz LEN, #1, 1f + strh DATA0w, [DST], #2 + lsr DATA0, DATA0, #16 +1: + /* if (len & 1) { *(uint8_t *)dst++ = data0; } */ + tbz LEN, #0, 1f + strb DATA0w, [DST], #1 +1: +#else /* BYTE_ORDER */ + /* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */ + tbz LEN, #2, 1f + lsr TMP_X, DATA0, #32 + str TMP_Xw, [DST], #4 +1: + /* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */ + tbz LEN, #1, 1f + lsr TMP_X, DATA0, #16 + strh TMP_Xw, [DST], #2 +1: + /* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */ + tbz LEN, #0, 1f + lsr TMP_X, DATA0, #8 + strb TMP_Xw, [DST], #1 +1: +#endif /* BYTE_ORDER */ +shifting_copy_done: + ret + +#else /* STRICT_ALIGNMENT */ +#ifndef NO_OVERLAP + cbz LEN, done + cmp SRC0, DST0 + beq done + bcc backward_ignore_align +#endif /* NO_OVERLAP */ + + prfm PLDL1KEEP, [SRC0] + cmp LEN, #SMALLSIZE + bcs copy_forward + mov DST, DST0 + +copy_forward_small: + cmp LEN, #8 + bcs 9f + + /* 0 <= len < 8 */ + /* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */ + tbz LEN, #2, 1f + ldr TMP_Xw, [SRC0], #4 + str TMP_Xw, [DST], #4 +1: + /* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */ + tbz LEN, #1, 1f + ldrh TMP_Xw, [SRC0], #2 + strh TMP_Xw, [DST], #2 +1: + /* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */ + tbz LEN, #0, 1f + ldrb TMP_Xw, [SRC0], #1 + strb TMP_Xw, [DST], #1 +1: + ret +9: + + prfm PLDL1KEEP, [SRC0, #8] + cmp LEN, #16 + bcs 9f + + /* 8 <= len < 16 */ + /* *(uint64_t *)dst++ = *(uint64_t *)src++; */ + ldr TMP_X, [SRC0], #8 + str TMP_X, [DST], #8 + /* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */ + tbz LEN, #2, 1f + ldr TMP_Xw, [SRC0], #4 + str TMP_Xw, [DST], #4 +1: + /* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */ + tbz LEN, #1, 1f + ldrh TMP_Xw, [SRC0], #2 + strh TMP_Xw, [DST], #2 +1: + /* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */ + tbz LEN, #0, 1f + ldrb TMP_Xw, [SRC0], #1 + strb TMP_Xw, [DST], #1 +1: + ret +9: + + /* 16 <= len < 32 */ + prfm PLDL1KEEP, [SRC0, 16] + prfm PLDL1KEEP, [SRC0, 24] + ldp DATA0, DATA1, [SRC0], #16 + stp DATA0, DATA1, [DST], #16 + /* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */ + tbz LEN, #3, 1f + ldr TMP_X, [SRC0], #8 + str TMP_X, [DST], #8 +1: + /* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */ + tbz LEN, #2, 1f + ldr TMP_Xw, [SRC0], #4 + str TMP_Xw, [DST], #4 +1: + /* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */ + tbz LEN, #1, 1f + ldrh TMP_Xw, [SRC0], #2 + strh TMP_Xw, [DST], #2 +1: + /* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */ + tbz LEN, #0, 1f + ldrb TMP_Xw, [SRC0], #1 + strb TMP_Xw, [DST], #1 +1: + ret +#endif /* !STRICT_ALIGNMENT */ + + .align 4 +copy_forward: + /* DST is not aligned at this point */ + mov DST, DST0 +#ifndef 
STRICT_ALIGNMENT + cmp LEN, #512 /* pre-alignment can be overhead when small */ + bcc 9f +#endif /* STRICT_ALIGNMENT */ + /* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */ + tbz DST, #0, 1f + ldrb TMP_Xw, [SRC0], #1 + strb TMP_Xw, [DST], #1 + sub LEN, LEN, #1 +1: + /* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */ + tbz DST, #1, 1f + ldrh TMP_Xw, [SRC0], #2 + strh TMP_Xw, [DST], #2 + sub LEN, LEN, #2 +1: + /* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */ + tbz DST, #2, 1f + ldr TMP_Xw, [SRC0], #4 + str TMP_Xw, [DST], #4 + sub LEN, LEN, #4 +1: +#if (STP_ALIGN > 8) + /* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */ + tbz DST, #3, 1f + ldr TMP_X, [SRC0], #8 + str TMP_X, [DST], #8 + sub LEN, LEN, #8 +1: +#endif /* (STP_ALIGN > 8) */ +9: + + cmp LEN, #1024 + bhs forward_copy1k +forward_less1k: + /* copy 16*n bytes */ + and TMP_D, LEN, #(1023-15) /* len &= 1023; len &= ~15; */ + adr TMP_X, 8f + sub LEN, LEN, TMP_D + sub TMP_X, TMP_X, TMP_D, lsr #1 /* jump to (8f - len/2) */ + br TMP_X +forward_copy1k: /* copy 16*64 bytes */ + sub LEN, LEN, #1024 + .rept (1024 / 16) + ldp DATA0, DATA1, [SRC0], #16 /* *dst++ = *src++; */ + stp DATA0, DATA1, [DST], #16 + .endr +8: + cbz LEN, done + cmp LEN, #1024 + bhs forward_copy1k + cmp LEN, #16 + bhs forward_less1k + + /* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */ + tbz LEN, #4, 1f + ldp DATA0, DATA1, [SRC0], #16 + stp DATA0, DATA1, [DST], #16 +1: + /* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */ + tbz LEN, #3, 1f + ldr TMP_X, [SRC0], #8 + str TMP_X, [DST], #8 +1: + /* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */ + tbz LEN, #2, 1f + ldr TMP_Xw, [SRC0], #4 + str TMP_Xw, [DST], #4 +1: + /* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */ + tbz LEN, #1, 1f + ldrh TMP_Xw, [SRC0], #2 + strh TMP_Xw, [DST], #2 +1: + /* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */ + tbz LEN, #0, 1f + ldrb TMP_Xw, [SRC0], #1 + strb TMP_Xw, [DST], #1 +1: +done: + ret +END(FUNCTION) Index: src/common/lib/libc/arch/aarch64/string/memmove.S diff -u /dev/null src/common/lib/libc/arch/aarch64/string/memmove.S:1.1 --- /dev/null Sun Feb 4 21:52:17 2018 +++ src/common/lib/libc/arch/aarch64/string/memmove.S Sun Feb 4 21:52:16 2018 @@ -0,0 +1,4 @@ +/* $NetBSD: memmove.S,v 1.1 2018/02/04 21:52:16 skrll Exp $ */ + +#define MEMMOVE +#include "bcopy.S"
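Much of the new bcopy.S above is head/tail handling around ldp/stp loops; the
less obvious piece is the "shifting copy" used in the STRICT_ALIGNMENT build
when source and destination disagree in alignment modulo 8: aligned dwords are
loaded from the source, adjacent pairs are merged with shifts, and only
aligned dwords are stored to the destination. A rough C rendering of the
little-endian inner loop -- illustration only, not from the commit, with
invented names, one dword per iteration instead of the unrolled 16 bytes, and
ignoring the head/tail bytes and the big-endian variant:

#include <stddef.h>
#include <stdint.h>

static void
shifting_copy_le(uint64_t *dst, const uint64_t *src,	/* both 8-byte aligned */
    unsigned src_dst_bits,	/* ((src_addr & 7) - (dst_addr & 7)) * 8 */
    size_t ndwords)
{
	/* assumes 0 < src_dst_bits < 64 */
	unsigned dst_src_bits = 64 - src_dst_bits;
	uint64_t data0 = *src++;	/* dword holding the first source bytes */

	while (ndwords-- > 0) {
		uint64_t data1 = *src++;
		/* bytes still needed from data0, topped up from data1 */
		*dst++ = (data0 >> src_dst_bits) | (data1 << dst_src_bits);
		data0 = data1;
	}
}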