https://sourceware.org/git/gitweb.cgi?p=newlib-cygwin.git;h=96ec8f868e1a0f5a75badfe4627a41f12cce742d

commit 96ec8f868e1a0f5a75badfe4627a41f12cce742d
Author: Sebastian Huber <[email protected]>
Date:   Tue Sep 12 10:33:09 2023 +0200

    aarch64: Sync with ARM-software/optimized-routines
    
    Update AArch64 assembly string routines from:
    
    https://github.com/ARM-software/optimized-routines
    
    commit 0cf84f26b6b8dcad8287fe30a4dcc1fdabd06560
    Author: Sebastian Huber <[email protected]>
    Date:   Thu Jul 27 17:14:57 2023 +0200
    
        string: Fix corrupt GNU_PROPERTY_TYPE (5) size
    
        For ELF32 the notes alignment is 4 and not 8.
    
    Add license and copyright information to COPYING.NEWLIB as entry (56).
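
The "corrupt GNU_PROPERTY_TYPE (5) size" fix referenced above comes down to the
note layout emitted by the GNU_PROPERTY macro added in asmdefs.h below: ELF32
objects get a 4-byte aligned note with a 12-byte descriptor, while ELF64 objects
get an 8-byte aligned note with a 16-byte descriptor (the trailing .word 0 pads
the property to an 8-byte multiple). A minimal C sketch of the two layouts, with
struct and field names that are illustrative only (not taken from elf.h):

  #include <stdint.h>

  /* NT_GNU_PROPERTY_TYPE_0 note as emitted by GNU_PROPERTY() under __ILP32__:
     section alignment 4, descsz = 12.  */
  struct note_ilp32 {
    uint32_t namesz;    /* 4  ("GNU" plus NUL) */
    uint32_t descsz;    /* 12 (pr_type + pr_datasz + pr_data) */
    uint32_t type;      /* 5  (NT_GNU_PROPERTY_TYPE_0) */
    char     name[4];   /* "GNU\0" */
    uint32_t pr_type;   /* FEATURE_1_AND (0xc0000000) */
    uint32_t pr_datasz; /* 4 */
    uint32_t pr_data;   /* FEATURE_1_BTI | FEATURE_1_PAC */
  };

  /* The same note for ELF64: section alignment 8, descsz = 16, property
     padded to an 8-byte multiple by the final .word 0.  */
  struct note_lp64 {
    uint32_t namesz;
    uint32_t descsz;    /* 16 */
    uint32_t type;
    char     name[4];
    uint32_t pr_type;
    uint32_t pr_datasz;
    uint32_t pr_data;
    uint32_t pad;       /* .word 0 */
  };

  _Static_assert(sizeof(struct note_ilp32) == 16 + 12, "12-byte descriptor");
  _Static_assert(sizeof(struct note_lp64) == 16 + 16, "16-byte descriptor");

That size/alignment mismatch is what the "corrupt GNU_PROPERTY_TYPE (5) size"
diagnostic in the commit title refers to; the macro below selects the correct
layout via #ifdef __ILP32__.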

Diff:
---
 COPYING.NEWLIB                          | 250 ++++++++++++++++++
 newlib/libc/machine/aarch64/asmdefs.h   | 106 ++++++++
 newlib/libc/machine/aarch64/memchr.S    |  73 ++----
 newlib/libc/machine/aarch64/memcmp.S    | 311 +++++++++++------------
 newlib/libc/machine/aarch64/memcpy.S    | 272 ++++++++++----------
 newlib/libc/machine/aarch64/memset.S    | 194 +++-----------
 newlib/libc/machine/aarch64/stpcpy.S    |  36 +--
 newlib/libc/machine/aarch64/strchr.S    | 107 +++-----
 newlib/libc/machine/aarch64/strchrnul.S |  90 +++----
 newlib/libc/machine/aarch64/strcmp.S    | 282 ++++++++++-----------
 newlib/libc/machine/aarch64/strcpy.S    | 437 ++++++++++----------------------
 newlib/libc/machine/aarch64/strlen.S    | 319 ++++++++++-------------
 newlib/libc/machine/aarch64/strncmp.S   | 323 ++++++++++++-----------
 newlib/libc/machine/aarch64/strnlen.S   | 256 +++++++------------
 newlib/libc/machine/aarch64/strrchr.S   |  86 ++-----
 15 files changed, 1476 insertions(+), 1666 deletions(-)

diff --git a/COPYING.NEWLIB b/COPYING.NEWLIB
index ee14bb349..176b3a470 100644
--- a/COPYING.NEWLIB
+++ b/COPYING.NEWLIB
@@ -1291,3 +1291,253 @@ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 SUCH DAMAGE.
 
+(56) MIT OR Apache-2.0 WITH LLVM-exception (newlib/libc/machine/aarch64)
+
+SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+
+MIT License
+-----------
+
+Copyright (c) 1999-2023, Arm Limited.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+
+Apache-2.0 WITH LLVM-exception
+------------------------------
+
+                                Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+
+--- LLVM Exceptions to the Apache 2.0 License ----
+
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into an Object form of such source code, you
+may redistribute such embedded portions in such Object form without complying
+with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
+
+In addition, if you combine or link compiled forms of this Software with
+software that is licensed under the GPLv2 ("Combined Software") and if a
+court of competent jurisdiction determines that the patent provision (Section
+3), the indemnity provision (Section 9) or other Section of the License
+conflicts with the conditions of the GPLv2, you may retroactively and
+prospectively choose to deem waived or otherwise exclude such Section(s) of
+the License, but only in their entirety and only with respect to the Combined
+Software.
diff --git a/newlib/libc/machine/aarch64/asmdefs.h b/newlib/libc/machine/aarch64/asmdefs.h
new file mode 100644
index 000000000..131b95e1f
--- /dev/null
+++ b/newlib/libc/machine/aarch64/asmdefs.h
@@ -0,0 +1,106 @@
+/*
+ * Macros for asm code.  AArch64 version.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+/* Branch Target Identitication support.  */
+#define BTI_C          hint    34
+#define BTI_J          hint    36
+/* Return address signing support (pac-ret).  */
+#define PACIASP                hint    25; .cfi_window_save
+#define AUTIASP                hint    29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h.  */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note.  */
+#ifdef __ILP32__
+#define GNU_PROPERTY(type, value)      \
+  .section .note.gnu.property, "a";    \
+  .p2align 2;                          \
+  .word 4;                             \
+  .word 12;                            \
+  .word 5;                             \
+  .asciz "GNU";                                \
+  .word type;                          \
+  .word 4;                             \
+  .word value;                         \
+  .text
+#else
+#define GNU_PROPERTY(type, value)      \
+  .section .note.gnu.property, "a";    \
+  .p2align 3;                          \
+  .word 4;                             \
+  .word 16;                            \
+  .word 5;                             \
+  .asciz "GNU";                                \
+  .word type;                          \
+  .word 4;                             \
+  .word value;                         \
+  .word 0;                             \
+  .text
+#endif
+
+/* If set then the GNU Property Note section will be added to
+   mark objects to support BTI and PAC-RET.  */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files.  */
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment)   \
+  .global name;                \
+  .type name,%function;        \
+  .align alignment;            \
+  name:                        \
+  .cfi_startproc;      \
+  BTI_C;
+
+#define ENTRY(name)    ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name)      \
+  .global name;                \
+  .type name,%function;        \
+  name:
+
+#define END(name)      \
+  .cfi_endproc;                \
+  .size name, .-name;
+
+#define L(l) .L ## l
+
+#ifdef __ILP32__
+  /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n)  mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+  /* Sanitize padding bits of size arguments as per aapcs64 */
+#define SIZE_ARG(n)  mov w##n, w##n
+#else
+#define SIZE_ARG(n)
+#endif
+
+/* Compiler supports SVE instructions  */
+#ifndef HAVE_SVE
+# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
+#   define HAVE_SVE 1
+# else
+#   define HAVE_SVE 0
+# endif
+#endif
+
+#endif
diff --git a/newlib/libc/machine/aarch64/memchr.S b/newlib/libc/machine/aarch64/memchr.S
index 53f5d6bc0..a0f305e0f 100644
--- a/newlib/libc/machine/aarch64/memchr.S
+++ b/newlib/libc/machine/aarch64/memchr.S
@@ -1,31 +1,8 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2014, ARM Limited
- * All rights Reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the company nor the names of its contributors
- *       may be used to endorse or promote products derived from this
- *       software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
@@ -37,6 +14,8 @@
  * Neon Available.
  */
 
+#include "asmdefs.h"
+
 /* Arguments and results.  */
 #define srcin          x0
 #define chrin          w1
@@ -70,17 +49,11 @@
  * identify exactly which byte has matched.
  */
 
-       .macro def_fn f p2align=0
-       .text
-       .p2align \p2align
-       .global \f
-       .type \f, %function
-\f:
-       .endm
-
-def_fn memchr
+ENTRY (memchr)
+       PTR_ARG (0)
+       SIZE_ARG (2)
        /* Do not dereference srcin if no bytes to compare.  */
-       cbz     cntin, .Lzero_length
+       cbz     cntin, L(zero_length)
        /*
         * Magic constant 0x40100401 allows us to identify which lane matches
         * the requested byte.
@@ -93,7 +66,7 @@ def_fn memchr
        dup     vrepmask.4s, wtmp2
        ands    soff, srcin, #31
        and     cntrem, cntin, #31
-       b.eq    .Lloop
+       b.eq    L(loop)
 
        /*
         * Input string is not 32-byte aligned. We calculate the syndrome
@@ -110,41 +83,41 @@ def_fn memchr
        and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
        addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b          /* 256->128 */
        addp    vend.16b, vend.16b, vend.16b                    /* 128->64 */
-       mov     synd, vend.2d[0]
+       mov     synd, vend.d[0]
        /* Clear the soff*2 lower bits */
        lsl     tmp, soff, #1
        lsr     synd, synd, tmp
        lsl     synd, synd, tmp
        /* The first block can also be the last */
-       b.ls    .Lmasklast
+       b.ls    L(masklast)
        /* Have we found something already? */
-       cbnz    synd, .Ltail
+       cbnz    synd, L(tail)
 
-.Lloop:
+L(loop):
        ld1     {vdata1.16b, vdata2.16b}, [src], #32
        subs    cntin, cntin, #32
        cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
        cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
        /* If we're out of data we finish regardless of the result */
-       b.ls    .Lend
+       b.ls    L(end)
        /* Use a fast check for the termination condition */
        orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
        addp    vend.2d, vend.2d, vend.2d
-       mov     synd, vend.2d[0]
+       mov     synd, vend.d[0]
        /* We're not out of data, loop if we haven't found the character */
-       cbz     synd, .Lloop
+       cbz     synd, L(loop)
 
-.Lend:
+L(end):
        /* Termination condition found, let's calculate the syndrome value */
        and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
        and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
        addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b          /* 256->128 */
        addp    vend.16b, vend.16b, vend.16b                    /* 128->64 */
-       mov     synd, vend.2d[0]
+       mov     synd, vend.d[0]
        /* Only do the clear for the last possible block */
-       b.hi    .Ltail
+       b.hs    L(tail)
 
-.Lmasklast:
+L(masklast):
        /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
        add     tmp, cntrem, soff
        and     tmp, tmp, #31
@@ -153,7 +126,7 @@ def_fn memchr
        lsl     synd, synd, tmp
        lsr     synd, synd, tmp
 
-.Ltail:
+L(tail):
        /* Count the trailing zeros using bit reversing */
        rbit    synd, synd
        /* Compensate the last post-increment */
@@ -168,9 +141,9 @@ def_fn memchr
        csel    result, xzr, result, eq
        ret
 
-.Lzero_length:
+L(zero_length):
        mov     result, #0
        ret
 
-       .size   memchr, . - memchr
+END (memchr)
 #endif
diff --git a/newlib/libc/machine/aarch64/memcmp.S b/newlib/libc/machine/aarch64/memcmp.S
index 605d99365..18874d321 100644
--- a/newlib/libc/machine/aarch64/memcmp.S
+++ b/newlib/libc/machine/aarch64/memcmp.S
@@ -1,57 +1,7 @@
 /* memcmp - compare memory
-
-   Copyright (c) 2018 Linaro Limited
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
-/*
- * Copyright (c) 2017 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
  *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
@@ -60,103 +10,79 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  */
 
-#define L(l) .L ## l
-
-/* Parameters and result.  */
-#define src1           x0
-#define src2           x1
-#define limit          x2
-#define result         w0
-
-/* Internal variables.  */
-#define data1          x3
-#define data1w         w3
-#define data1h         x4
-#define data2          x5
-#define data2w         w5
-#define data2h         x6
-#define tmp1           x7
-#define tmp2           x8
-
-        .macro def_fn f p2align=0
-        .text
-        .p2align \p2align
-        .global \f
-        .type \f, %function
-\f:
-        .endm
-
-def_fn memcmp p2align=6
-       subs    limit, limit, 8
-       b.lo    L(less8)
-
-       ldr     data1, [src1], 8
-       ldr     data2, [src2], 8
-       cmp     data1, data2
-       b.ne    L(return)
-
-       subs    limit, limit, 8
-       b.gt    L(more16)
-
-       ldr     data1, [src1, limit]
-       ldr     data2, [src2, limit]
-       b       L(return)
-
-L(more16):
-       ldr     data1, [src1], 8
-       ldr     data2, [src2], 8
-       cmp     data1, data2
-       bne     L(return)
-
-       /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
-          strings.  */
-       subs    limit, limit, 16
+#include "asmdefs.h"
+
+#define src1   x0
+#define src2   x1
+#define limit  x2
+#define result w0
+
+#define data1  x3
+#define data1w w3
+#define data2  x4
+#define data2w w4
+#define data3  x5
+#define data3w w5
+#define data4  x6
+#define data4w w6
+#define tmp    x6
+#define src1end        x7
+#define src2end        x8
+
+
+ENTRY (memcmp)
+       PTR_ARG (0)
+       PTR_ARG (1)
+       SIZE_ARG (2)
+
+       cmp     limit, 16
+       b.lo    L(less16)
+       ldp     data1, data3, [src1]
+       ldp     data2, data4, [src2]
+       ccmp    data1, data2, 0, ne
+       ccmp    data3, data4, 0, eq
+       b.ne    L(return2)
+
+       add     src1end, src1, limit
+       add     src2end, src2, limit
+       cmp     limit, 32
        b.ls    L(last_bytes)
+       cmp     limit, 160
+       b.hs    L(loop_align)
+       sub     limit, limit, 32
 
-       /* We overlap loads between 0-32 bytes at either side of SRC1 when we
-          try to align, so limit it only to strings larger than 128 bytes.  */
-       cmp     limit, 96
-       b.ls    L(loop16)
-
-       /* Align src1 and adjust src2 with bytes not yet done.  */
-       and     tmp1, src1, 15
-       add     limit, limit, tmp1
-       sub     src1, src1, tmp1
-       sub     src2, src2, tmp1
-
-       /* Loop performing 16 bytes per iteration using aligned src1.
-          Limit is pre-decremented by 16 and must be larger than zero.
-          Exit if <= 16 bytes left to do or if the data is not equal.  */
        .p2align 4
-L(loop16):
-       ldp     data1, data1h, [src1], 16
-       ldp     data2, data2h, [src2], 16
-       subs    limit, limit, 16
-       ccmp    data1, data2, 0, hi
-       ccmp    data1h, data2h, 0, eq
-       b.eq    L(loop16)
-
+L(loop32):
+       ldp     data1, data3, [src1, 16]
+       ldp     data2, data4, [src2, 16]
        cmp     data1, data2
-       bne     L(return)
-       mov     data1, data1h
-       mov     data2, data2h
+       ccmp    data3, data4, 0, eq
+       b.ne    L(return2)
+       cmp     limit, 16
+       b.ls    L(last_bytes)
+
+       ldp     data1, data3, [src1, 32]
+       ldp     data2, data4, [src2, 32]
        cmp     data1, data2
-       bne     L(return)
+       ccmp    data3, data4, 0, eq
+       b.ne    L(return2)
+       add     src1, src1, 32
+       add     src2, src2, 32
+L(last64):
+       subs    limit, limit, 32
+       b.hi    L(loop32)
 
        /* Compare last 1-16 bytes using unaligned access.  */
 L(last_bytes):
-       add     src1, src1, limit
-       add     src2, src2, limit
-       ldp     data1, data1h, [src1]
-       ldp     data2, data2h, [src2]
-       cmp     data1, data2
-       bne     L(return)
-       mov     data1, data1h
-       mov     data2, data2h
+       ldp     data1, data3, [src1end, -16]
+       ldp     data2, data4, [src2end, -16]
+L(return2):
        cmp     data1, data2
+       csel    data1, data1, data3, ne
+       csel    data2, data2, data4, ne
 
        /* Compare data bytes and set return value to 0, -1 or 1.  */
 L(return):
@@ -164,33 +90,106 @@ L(return):
        rev     data1, data1
        rev     data2, data2
 #endif
-       cmp     data1, data2
-L(ret_eq):
+       cmp     data1, data2
        cset    result, ne
        cneg    result, result, lo
        ret
 
        .p2align 4
-       /* Compare up to 8 bytes.  Limit is [-8..-1].  */
+L(less16):
+       add     src1end, src1, limit
+       add     src2end, src2, limit
+       tbz     limit, 3, L(less8)
+       ldr     data1, [src1]
+       ldr     data2, [src2]
+       ldr     data3, [src1end, -8]
+       ldr     data4, [src2end, -8]
+       b       L(return2)
+
+       .p2align 4
 L(less8):
-       adds    limit, limit, 4
-       b.lo    L(less4)
-       ldr     data1w, [src1], 4
-       ldr     data2w, [src2], 4
+       tbz     limit, 2, L(less4)
+       ldr     data1w, [src1]
+       ldr     data2w, [src2]
+       ldr     data3w, [src1end, -4]
+       ldr     data4w, [src2end, -4]
+       b       L(return2)
+
+L(less4):
+       tbz     limit, 1, L(less2)
+       ldrh    data1w, [src1]
+       ldrh    data2w, [src2]
        cmp     data1w, data2w
        b.ne    L(return)
-       sub     limit, limit, 4
-L(less4):
-       adds    limit, limit, 4
-       beq     L(ret_eq)
-L(byte_loop):
-       ldrb    data1w, [src1], 1
-       ldrb    data2w, [src2], 1
-       subs    limit, limit, 1
-       ccmp    data1w, data2w, 0, ne   /* NZCV = 0b0000.  */
-       b.eq    L(byte_loop)
+L(less2):
+       mov     result, 0
+       tbz     limit, 0, L(return_zero)
+       ldrb    data1w, [src1end, -1]
+       ldrb    data2w, [src2end, -1]
        sub     result, data1w, data2w
+L(return_zero):
+       ret
+
+L(loop_align):
+       ldp     data1, data3, [src1, 16]
+       ldp     data2, data4, [src2, 16]
+       cmp     data1, data2
+       ccmp    data3, data4, 0, eq
+       b.ne    L(return2)
+
+       /* Align src2 and adjust src1, src2 and limit.  */
+       and     tmp, src2, 15
+       sub     tmp, tmp, 16
+       sub     src2, src2, tmp
+       add     limit, limit, tmp
+       sub     src1, src1, tmp
+       sub     limit, limit, 64 + 16
+
+       .p2align 4
+L(loop64):
+       ldr     q0, [src1, 16]
+       ldr     q1, [src2, 16]
+       subs    limit, limit, 64
+       ldr     q2, [src1, 32]
+       ldr     q3, [src2, 32]
+       eor     v0.16b, v0.16b, v1.16b
+       eor     v1.16b, v2.16b, v3.16b
+       ldr     q2, [src1, 48]
+       ldr     q3, [src2, 48]
+       umaxp   v0.16b, v0.16b, v1.16b
+       ldr     q4, [src1, 64]!
+       ldr     q5, [src2, 64]!
+       eor     v1.16b, v2.16b, v3.16b
+       eor     v2.16b, v4.16b, v5.16b
+       umaxp   v1.16b, v1.16b, v2.16b
+       umaxp   v0.16b, v0.16b, v1.16b
+       umaxp   v0.16b, v0.16b, v0.16b
+       fmov    tmp, d0
+       ccmp    tmp, 0, 0, hi
+       b.eq    L(loop64)
+
+       /* If equal, process last 1-64 bytes using scalar loop.  */
+       add     limit, limit, 64 + 16
+       cbz     tmp, L(last64)
+
+       /* Determine the 8-byte aligned offset of the first difference.  */
+#ifdef __AARCH64EB__
+       rev16   tmp, tmp
+#endif
+       rev     tmp, tmp
+       clz     tmp, tmp
+       bic     tmp, tmp, 7
+       sub     tmp, tmp, 48
+       ldr     data1, [src1, tmp]
+       ldr     data2, [src2, tmp]
+#ifndef __AARCH64EB__
+       rev     data1, data1
+       rev     data2, data2
+#endif
+       mov     result, 1
+       cmp     data1, data2
+       cneg    result, result, lo
        ret
 
-       .size   memcmp, . - memcmp
+END (memcmp)
 #endif
diff --git a/newlib/libc/machine/aarch64/memcpy.S b/newlib/libc/machine/aarch64/memcpy.S
index 463bad0a1..248e7843a 100644
--- a/newlib/libc/machine/aarch64/memcpy.S
+++ b/newlib/libc/machine/aarch64/memcpy.S
@@ -1,55 +1,8 @@
-/* Copyright (c) 2012-2013, Linaro Limited
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
 /*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
+ * memcpy - copy memory area
  *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -61,6 +14,7 @@
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
 /* See memcpy-stub.c  */
 #else
+#include "asmdefs.h"
 
 #define dstin  x0
 #define src    x1
@@ -71,122 +25,139 @@
 #define A_l    x6
 #define A_lw   w6
 #define A_h    x7
-#define A_hw   w7
 #define B_l    x8
 #define B_lw   w8
 #define B_h    x9
 #define C_l    x10
+#define C_lw   w10
 #define C_h    x11
 #define D_l    x12
 #define D_h    x13
-#define E_l    src
-#define E_h    count
-#define F_l    srcend
-#define F_h    dst
-#define tmp1   x9
-
-#define L(l) .L ## l
-
-       .macro def_fn f p2align=0
-       .text
-       .p2align \p2align
-       .global \f
-       .type \f, %function
-\f:
-       .endm
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
-   medium copies of 17..96 bytes which are fully unrolled. Large copies
-   of more than 96 bytes align the destination and use an unrolled loop
-   processing 64 bytes per iteration.
-   Small and medium copies read all data before writing, allowing any
-   kind of overlap, and memmove tailcalls memcpy for these cases as
-   well as non-overlapping copies.
+#define E_l    x14
+#define E_h    x15
+#define F_l    x16
+#define F_h    x17
+#define G_l    count
+#define G_h    dst
+#define H_l    src
+#define H_h    srcend
+#define tmp1   x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
 */
 
-def_fn memcpy p2align=6
-       prfm    PLDL1KEEP, [src]
+ENTRY_ALIAS (memmove)
+ENTRY (memcpy)
+       PTR_ARG (0)
+       PTR_ARG (1)
+       SIZE_ARG (2)
        add     srcend, src, count
        add     dstend, dstin, count
-       cmp     count, 16
-       b.ls    L(copy16)
-       cmp     count, 96
+       cmp     count, 128
        b.hi    L(copy_long)
+       cmp     count, 32
+       b.hi    L(copy32_128)
 
-       /* Medium copies: 17..96 bytes.  */
-       sub     tmp1, count, 1
+       /* Small copies: 0..32 bytes.  */
+       cmp     count, 16
+       b.lo    L(copy16)
        ldp     A_l, A_h, [src]
-       tbnz    tmp1, 6, L(copy96)
        ldp     D_l, D_h, [srcend, -16]
-       tbz     tmp1, 5, 1f
-       ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [srcend, -32]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstend, -32]
-1:
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret
 
-       .p2align 4
-       /* Small copies: 0..16 bytes.  */
+       /* Copy 8-15 bytes.  */
 L(copy16):
-       cmp     count, 8
-       b.lo    1f
+       tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret
-       .p2align 4
-1:
-       tbz     count, 2, 1f
+
+       .p2align 3
+       /* Copy 4-7 bytes.  */
+L(copy8):
+       tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
-       ldr     A_hw, [srcend, -4]
+       ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
-       str     A_hw, [dstend, -4]
+       str     B_lw, [dstend, -4]
        ret
 
-       /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
-          byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-1:
-       cbz     count, 2f
+       /* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+       cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
-       ldrb    A_hw, [srcend, -1]
+       ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
-       strb    A_hw, [dstend, -1]
-2:     ret
+       strb    C_lw, [dstend, -1]
+L(copy0):
+       ret
 
        .p2align 4
-       /* Copy 64..96 bytes.  Copy 64 bytes from the start and
-          32 bytes from the end.  */
-L(copy96):
+       /* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+       ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [src, 32]
-       ldp     D_l, D_h, [src, 48]
-       ldp     E_l, E_h, [srcend, -32]
-       ldp     F_l, F_h, [srcend, -16]
+       ldp     C_l, C_h, [srcend, -32]
+       ldp     D_l, D_h, [srcend, -16]
+       cmp     count, 64
+       b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstin, 32]
-       stp     D_l, D_h, [dstin, 48]
-       stp     E_l, E_h, [dstend, -32]
-       stp     F_l, F_h, [dstend, -16]
+       stp     C_l, C_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
        ret
 
-       /* Align DST to 16 byte alignment so that we don't cross cache line
-          boundaries on both loads and stores.  There are at least 96 bytes
-          to copy, so copy 16 bytes unaligned and then align.  The loop
-          copies 64 bytes per iteration and prefetches one iteration ahead.  */
+       .p2align 4
+       /* Copy 65..128 bytes.  */
+L(copy128):
+       ldp     E_l, E_h, [src, 32]
+       ldp     F_l, F_h, [src, 48]
+       cmp     count, 96
+       b.ls    L(copy96)
+       ldp     G_l, G_h, [srcend, -64]
+       ldp     H_l, H_h, [srcend, -48]
+       stp     G_l, G_h, [dstend, -64]
+       stp     H_l, H_h, [dstend, -48]
+L(copy96):
+       stp     A_l, A_h, [dstin]
+       stp     B_l, B_h, [dstin, 16]
+       stp     E_l, E_h, [dstin, 32]
+       stp     F_l, F_h, [dstin, 48]
+       stp     C_l, C_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
+       ret
 
        .p2align 4
+       /* Copy more than 128 bytes.  */
 L(copy_long):
+       /* Use backwards copy if there is an overlap.  */
+       sub     tmp1, dstin, src
+       cbz     tmp1, L(copy0)
+       cmp     tmp1, count
+       b.lo    L(copy_long_backwards)
+
+       /* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+       ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
-       ldp     D_l, D_h, [src]
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
@@ -195,8 +166,9 @@ L(copy_long):
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
-       b.ls    2f
-1:
+       b.ls    L(copy64_from_end)
+
+L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
@@ -206,12 +178,10 @@ L(copy_long):
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
-       b.hi    1b
+       b.hi    L(loop64)
 
-       /* Write the last full set of 64 bytes.  The remainder is at most 64
-          bytes, so it is safe to always copy 64 bytes from the end even if
-          there is just 1 byte left.  */
-2:
+       /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
@@ -226,5 +196,51 @@ L(copy_long):
        stp     C_l, C_h, [dstend, -16]
        ret
 
-       .size   memcpy, . - memcpy
+       .p2align 4
+
+       /* Large backwards copy for overlapping copies.
+          Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
+       ldp     D_l, D_h, [srcend, -16]
+       and     tmp1, dstend, 15
+       sub     srcend, srcend, tmp1
+       sub     count, count, tmp1
+       ldp     A_l, A_h, [srcend, -16]
+       stp     D_l, D_h, [dstend, -16]
+       ldp     B_l, B_h, [srcend, -32]
+       ldp     C_l, C_h, [srcend, -48]
+       ldp     D_l, D_h, [srcend, -64]!
+       sub     dstend, dstend, tmp1
+       subs    count, count, 128
+       b.ls    L(copy64_from_start)
+
+L(loop64_backwards):
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [srcend, -16]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [srcend, -48]
+       stp     D_l, D_h, [dstend, -64]!
+       ldp     D_l, D_h, [srcend, -64]!
+       subs    count, count, 64
+       b.hi    L(loop64_backwards)
+
+       /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+       ldp     G_l, G_h, [src, 48]
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [src, 32]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [src, 16]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [src]
+       stp     D_l, D_h, [dstend, -64]
+       stp     G_l, G_h, [dstin, 48]
+       stp     A_l, A_h, [dstin, 32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstin]
+       ret
+
+END (memcpy)
 #endif
diff --git a/newlib/libc/machine/aarch64/memset.S b/newlib/libc/machine/aarch64/memset.S
index 103e3f8bb..ca76439a9 100644
--- a/newlib/libc/machine/aarch64/memset.S
+++ b/newlib/libc/machine/aarch64/memset.S
@@ -1,66 +1,20 @@
-/* Copyright (c) 2012-2013, Linaro Limited
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
 /*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
+ * memset - fill memory with a constant byte
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  *
  */
 
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
 /* See memset-stub.c  */
 #else
+#include "asmdefs.h"
 
 #define dstin  x0
 #define val    x1
@@ -68,24 +22,11 @@
 #define count  x2
 #define dst    x3
 #define dstend x4
-#define tmp1   x5
-#define tmp1w  w5
-#define tmp2   x6
-#define tmp2w  w6
-#define zva_len x7
-#define zva_lenw w7
-
-#define L(l) .L ## l
+#define zva_val        x5
 
-       .macro def_fn f p2align=0
-       .text
-       .p2align \p2align
-       .global \f
-       .type \f, %function
-\f:
-       .endm
-
-def_fn memset p2align=6
+ENTRY (memset)
+       PTR_ARG (0)
+       SIZE_ARG (2)
 
        dup     v0.16B, valw
        add     dstend, dstin, count
@@ -101,7 +42,7 @@ def_fn memset p2align=6
        str     val, [dstin]
        str     val, [dstend, -8]
        ret
-       nop
+       .p2align 4
 1:     tbz     count, 2, 2f
        str     valw, [dstin]
        str     valw, [dstend, -4]
@@ -131,110 +72,49 @@ L(set96):
        stp     q0, q0, [dstend, -32]
        ret
 
-       .p2align 3
-       nop
+       .p2align 4
 L(set_long):
        and     valw, valw, 255
        bic     dst, dstin, 15
        str     q0, [dstin]
-       cmp     count, 256
-       ccmp    valw, 0, 0, cs
-       b.eq    L(try_zva)
-L(no_zva):
-       sub     count, dstend, dst      /* Count is 16 too large.  */
-       sub     dst, dst, 16            /* Dst is biased by -32.  */
-       sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
-1:     stp     q0, q0, [dst, 32]
-       stp     q0, q0, [dst, 64]!
-L(tail64):
-       subs    count, count, 64
-       b.hi    1b
-2:     stp     q0, q0, [dstend, -64]
-       stp     q0, q0, [dstend, -32]
-       ret
-
-       .p2align 3
-L(try_zva):
-       mrs     tmp1, dczid_el0
-       tbnz    tmp1w, 4, L(no_zva)
-       and     tmp1w, tmp1w, 15
-       cmp     tmp1w, 4        /* ZVA size is 64 bytes.  */
-       b.ne     L(zva_128)
-
-       /* Write the first and last 64 byte aligned block using stp rather
-          than using DC ZVA.  This is faster on some cores.
-        */
-L(zva_64):
+       cmp     count, 160
+       ccmp    valw, 0, 0, hs
+       b.ne    L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+       mrs     zva_val, dczid_el0
+       and     zva_val, zva_val, 31
+       cmp     zva_val, 4              /* ZVA size is 64 bytes.  */
+       b.ne    L(no_zva)
+#endif
        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        bic     dst, dst, 63
-       stp     q0, q0, [dst, 64]
-       stp     q0, q0, [dst, 96]
-       sub     count, dstend, dst      /* Count is now 128 too large.  */
-       sub     count, count, 128+64+64 /* Adjust count and bias for loop.  */
-       add     dst, dst, 128
-       nop
-1:     dc      zva, dst
+       sub     count, dstend, dst      /* Count is now 64 too large.  */
+       sub     count, count, 128       /* Adjust count and bias for loop.  */
+
+       .p2align 4
+L(zva_loop):
        add     dst, dst, 64
+       dc      zva, dst
        subs    count, count, 64
-       b.hi    1b
-       stp     q0, q0, [dst, 0]
-       stp     q0, q0, [dst, 32]
+       b.hi    L(zva_loop)
        stp     q0, q0, [dstend, -64]
        stp     q0, q0, [dstend, -32]
        ret
 
-       .p2align 3
-L(zva_128):
-       cmp     tmp1w, 5        /* ZVA size is 128 bytes.  */
-       b.ne    L(zva_other)
-
-       str     q0, [dst, 16]
+L(no_zva):
+       sub     count, dstend, dst      /* Count is 16 too large.  */
+       sub     dst, dst, 16            /* Dst is biased by -32.  */
+       sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
+L(no_zva_loop):
        stp     q0, q0, [dst, 32]
-       stp     q0, q0, [dst, 64]
-       stp     q0, q0, [dst, 96]
-       bic     dst, dst, 127
-       sub     count, dstend, dst      /* Count is now 128 too large.  */
-       sub     count, count, 128+128   /* Adjust count and bias for loop.  */
-       add     dst, dst, 128
-1:     dc      zva, dst
-       add     dst, dst, 128
-       subs    count, count, 128
-       b.hi    1b
-       stp     q0, q0, [dstend, -128]
-       stp     q0, q0, [dstend, -96]
+       stp     q0, q0, [dst, 64]!
+       subs    count, count, 64
+       b.hi    L(no_zva_loop)
        stp     q0, q0, [dstend, -64]
        stp     q0, q0, [dstend, -32]
        ret
 
-L(zva_other):
-       mov     tmp2w, 4
-       lsl     zva_lenw, tmp2w, tmp1w
-       add     tmp1, zva_len, 64       /* Max alignment bytes written.  */
-       cmp     count, tmp1
-       blo     L(no_zva)
-
-       sub     tmp2, zva_len, 1
-       add     tmp1, dst, zva_len
-       add     dst, dst, 16
-       subs    count, tmp1, dst        /* Actual alignment bytes to write.  */
-       bic     tmp1, tmp1, tmp2        /* Aligned dc zva start address.  */
-       beq     2f
-1:     stp     q0, q0, [dst], 64
-       stp     q0, q0, [dst, -32]
-       subs    count, count, 64
-       b.hi    1b
-2:     mov     dst, tmp1
-       sub     count, dstend, tmp1     /* Remaining bytes to write.  */
-       subs    count, count, zva_len
-       b.lo    4f
-3:     dc      zva, dst
-       add     dst, dst, zva_len
-       subs    count, count, zva_len
-       b.hs    3b
-4:     add     count, count, zva_len
-       sub     dst, dst, 32            /* Bias dst for tail loop.  */
-       b       L(tail64)
-
-       .size   memset, . - memset
+END (memset)
 #endif
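
A side note on the simplified zero path: L(set_long) now falls back to L(no_zva) unless the value is zero, the length is at least 160 bytes and, unless SKIP_ZVA_CHECK is defined, DCZID_EL0 reports a 64-byte zero block. A rough C equivalent of that register check, with a helper name of our own (illustrative only, not part of the patch):

  #include <stdbool.h>
  #include <stdint.h>

  /* Illustrative helper: mirrors the new memset check that only uses
     "dc zva" when DCZID_EL0 reports a 64-byte zero block and zeroing is
     not prohibited.  */
  static bool
  zva_64_usable (void)
  {
    uint64_t dczid;

    __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
    /* Bits 3:0 (BS) hold log2 of the block size in 4-byte words, so
       BS == 4 means 64 bytes.  Bit 4 (DZP) set marks DC ZVA as
       prohibited and also makes the masked value differ from 4, so a
       single compare covers both conditions.  */
    return (dczid & 31) == 4;
  }

The removed L(zva_other) path used to derive the block size in bytes as 4 << BS (mov tmp2w, 4; lsl zva_lenw, tmp2w, tmp1w); with only the 64-byte case kept, that computation goes away.
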
diff --git a/newlib/libc/machine/aarch64/stpcpy.S b/newlib/libc/machine/aarch64/stpcpy.S
index 696b45889..155c68d75 100644
--- a/newlib/libc/machine/aarch64/stpcpy.S
+++ b/newlib/libc/machine/aarch64/stpcpy.S
@@ -1,34 +1,10 @@
 /*
-   stpcpy - copy a string returning pointer to end.
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
 
-   Copyright (c) 2015 ARM Ltd.
-   All Rights Reserved.
+#define BUILD_STPCPY 1
 
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the company nor the names of its contributors
-         may be used to endorse or promote products derived from this
-         software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
-
-/* This is just a wrapper that uses strcpy code with appropriate
-   pre-defines.  */
-
-#define BUILD_STPCPY
 #include "strcpy.S"
diff --git a/newlib/libc/machine/aarch64/strchr.S b/newlib/libc/machine/aarch64/strchr.S
index 2448dbc7d..500d9aff2 100644
--- a/newlib/libc/machine/aarch64/strchr.S
+++ b/newlib/libc/machine/aarch64/strchr.S
@@ -1,32 +1,9 @@
 /*
-   strchr - find a character in a string
-
-   Copyright (c) 2014, ARM Limited
-   All rights Reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the company nor the names of its contributors
-         may be used to endorse or promote products derived from this
-         software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
-
+ * strchr - find a character in a string
+ *
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
 /* See strchr-stub.c  */
 #else
@@ -37,6 +14,8 @@
  * Neon Available.
  */
 
+#include "asmdefs.h"
+
 /* Arguments and results.  */
 #define srcin          x0
 #define chrin          w1
@@ -74,26 +53,19 @@
 
 /* Locals and temporaries.  */
 
-       .macro def_fn f p2align=0
-       .text
-       .p2align \p2align
-       .global \f
-       .type \f, %function
-\f:
-       .endm
-
-def_fn strchr
-       /* Magic constant 0x40100401 to allow us to identify which lane
-          matches the requested byte.  Magic constant 0x80200802 used
-          similarly for NUL termination.  */
-       mov     wtmp2, #0x0401
-       movk    wtmp2, #0x4010, lsl #16
+ENTRY (strchr)
+       PTR_ARG (0)
+       /* Magic constant 0xc0300c03 to allow us to identify which lane
+          matches the requested byte.  Even bits are set if the character
+          matches, odd bits if either the char is NUL or matches.  */
+       mov     wtmp2, 0x0c03
+       movk    wtmp2, 0xc030, lsl 16
        dup     vrepchr.16b, chrin
        bic     src, srcin, #31         /* Work with aligned 32-byte hunks.  */
        dup     vrepmask_c.4s, wtmp2
        ands    tmp1, srcin, #31
        add     vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
-       b.eq    .Lloop
+       b.eq    L(loop)
 
        /* Input string is not 32-byte aligned.  Rather than forcing
           the padding bytes to a safe value, we calculate the syndrome
@@ -105,49 +77,42 @@ def_fn strchr
        cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
        cmeq    vhas_nul2.16b, vdata2.16b, #0
        cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
-       and     vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
-       and     vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
-       and     vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
-       and     vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
-       orr     vend1.16b, vhas_nul1.16b, vhas_chr1.16b
-       orr     vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+       bif     vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+       bif     vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+       and     vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+       and     vend2.16b, vhas_nul2.16b, vrepmask_c.16b
        lsl     tmp1, tmp1, #1
        addp    vend1.16b, vend1.16b, vend2.16b         // 256->128
        mov     tmp3, #~0
        addp    vend1.16b, vend1.16b, vend2.16b         // 128->64
        lsr     tmp1, tmp3, tmp1
 
-       mov     tmp3, vend1.2d[0]
+       mov     tmp3, vend1.d[0]
        bic     tmp1, tmp3, tmp1        // Mask padding bits.
-       cbnz    tmp1, .Ltail
+       cbnz    tmp1, L(tail)
 
-.Lloop:
+       .p2align 4
+L(loop):
        ld1     {vdata1.16b, vdata2.16b}, [src], #32
-       cmeq    vhas_nul1.16b, vdata1.16b, #0
        cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
-       cmeq    vhas_nul2.16b, vdata2.16b, #0
        cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
-       /* Use a fast check for the termination condition.  */
-       orr     vend1.16b, vhas_nul1.16b, vhas_chr1.16b
-       orr     vend2.16b, vhas_nul2.16b, vhas_chr2.16b
-       orr     vend1.16b, vend1.16b, vend2.16b
-       addp    vend1.2d, vend1.2d, vend1.2d
-       mov     tmp1, vend1.2d[0]
-       cbz     tmp1, .Lloop
+       cmhs    vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+       cmhs    vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+       orr     vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+       umaxp   vend1.16b, vend1.16b, vend1.16b
+       mov     tmp1, vend1.d[0]
+       cbz     tmp1, L(loop)
 
        /* Termination condition found.  Now need to establish exactly why
           we terminated.  */
-       and     vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
-       and     vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
-       and     vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
-       and     vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
-       orr     vend1.16b, vhas_nul1.16b, vhas_chr1.16b
-       orr     vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+       bif     vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+       bif     vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+       and     vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+       and     vend2.16b, vhas_nul2.16b, vrepmask_c.16b
        addp    vend1.16b, vend1.16b, vend2.16b         // 256->128
        addp    vend1.16b, vend1.16b, vend2.16b         // 128->64
-
-       mov     tmp1, vend1.2d[0]
-.Ltail:
+       mov     tmp1, vend1.d[0]
+L(tail):
        /* Count the trailing zeros, by bit reversing...  */
        rbit    tmp1, tmp1
        /* Re-bias source.  */
@@ -160,5 +125,5 @@ def_fn strchr
        csel    result, result, xzr, eq
        ret
 
-       .size   strchr, . - strchr
+END (strchr)
 #endif
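
For reference, the old strchr built two syndromes from 0x40100401 and 0x80200802; the new single constant 0xc0300c03 (doubled once for the NUL mask) packs both answers into one 64-bit syndrome after the two addp reductions: two bits per source byte, in string order. A small C sketch of how such a syndrome decodes, with an illustrative helper name and little-endian bit order assumed:

  #include <stddef.h>
  #include <stdint.h>

  /* Illustrative decode of a strchr-style syndrome: byte i of the 32-byte
     chunk owns bits 2*i and 2*i+1; the even bit is set when the byte
     equals the searched character, the odd bit when it is that character
     or NUL.  */
  static const char *
  decode_strchr_syndrome (const char *chunk, uint64_t synd)
  {
    if (synd == 0)
      return NULL;                            /* no character, no NUL yet */

    unsigned first = (unsigned) __builtin_ctzll (synd);
    size_t byte = first >> 1;                 /* two syndrome bits per byte */

    /* An odd first bit means the terminator came before the character.  */
    return (first & 1) == 0 ? chunk + byte : NULL;
  }

The tail above does the same with rbit and clz on the re-biased source and selects xzr when only the NUL bit was set.
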
diff --git a/newlib/libc/machine/aarch64/strchrnul.S b/newlib/libc/machine/aarch64/strchrnul.S
index a0ac13b7f..ceaf4dca1 100644
--- a/newlib/libc/machine/aarch64/strchrnul.S
+++ b/newlib/libc/machine/aarch64/strchrnul.S
@@ -1,32 +1,9 @@
 /*
-   strchrnul - find a character or nul in a string
-
-   Copyright (c) 2014, ARM Limited
-   All rights Reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the company nor the names of its contributors
-         may be used to endorse or promote products derived from this
-         software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
-
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
 /* See strchrnul-stub.c  */
 #else
@@ -37,6 +14,8 @@
  * Neon Available.
  */
 
+#include "asmdefs.h"
+
 /* Arguments and results.  */
 #define srcin          x0
 #define chrin          w1
@@ -70,15 +49,8 @@
 
 /* Locals and temporaries.  */
 
-       .macro def_fn f p2align=0
-       .text
-       .p2align \p2align
-       .global \f
-       .type \f, %function
-\f:
-       .endm
-
-def_fn strchrnul
+ENTRY (strchrnul)
+       PTR_ARG (0)
        /* Magic constant 0x40100401 to allow us to identify which lane
           matches the termination condition.  */
        mov     wtmp2, #0x0401
@@ -87,7 +59,7 @@ def_fn strchrnul
        bic     src, srcin, #31         /* Work with aligned 32-byte hunks.  */
        dup     vrepmask.4s, wtmp2
        ands    tmp1, srcin, #31
-       b.eq    .Lloop
+       b.eq    L(loop)
 
        /* Input string is not 32-byte aligned.  Rather than forcing
           the padding bytes to a safe value, we calculate the syndrome
@@ -95,47 +67,43 @@ def_fn strchrnul
           syndrome that are related to the padding.  */
        ld1     {vdata1.16b, vdata2.16b}, [src], #32
        neg     tmp1, tmp1
-       cmeq    vhas_nul1.16b, vdata1.16b, #0
        cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
-       cmeq    vhas_nul2.16b, vdata2.16b, #0
        cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
-       orr     vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
-       orr     vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
-       and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
-       and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+       cmhs    vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+       cmhs    vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+       and     vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+       and     vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
        lsl     tmp1, tmp1, #1
        addp    vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
        mov     tmp3, #~0
        addp    vend1.16b, vend1.16b, vend1.16b         // 128->64
        lsr     tmp1, tmp3, tmp1
 
-       mov     tmp3, vend1.2d[0]
+       mov     tmp3, vend1.d[0]
        bic     tmp1, tmp3, tmp1        // Mask padding bits.
-       cbnz    tmp1, .Ltail
+       cbnz    tmp1, L(tail)
 
-.Lloop:
+       .p2align 4
+L(loop):
        ld1     {vdata1.16b, vdata2.16b}, [src], #32
-       cmeq    vhas_nul1.16b, vdata1.16b, #0
        cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
-       cmeq    vhas_nul2.16b, vdata2.16b, #0
        cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
-       /* Use a fast check for the termination condition.  */
-       orr     vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
-       orr     vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
-       orr     vend1.16b, vhas_chr1.16b, vhas_chr2.16b
-       addp    vend1.2d, vend1.2d, vend1.2d
-       mov     tmp1, vend1.2d[0]
-       cbz     tmp1, .Lloop
+       cmhs    vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+       cmhs    vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+       orr     vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+       umaxp   vend1.16b, vend1.16b, vend1.16b
+       mov     tmp1, vend1.d[0]
+       cbz     tmp1, L(loop)
 
        /* Termination condition found.  Now need to establish exactly why
           we terminated.  */
-       and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
-       and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+       and     vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+       and     vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
        addp    vend1.16b, vhas_chr1.16b, vhas_chr2.16b         // 256->128
        addp    vend1.16b, vend1.16b, vend1.16b         // 128->64
 
-       mov     tmp1, vend1.2d[0]
-.Ltail:
+       mov     tmp1, vend1.d[0]
+L(tail):
        /* Count the trailing zeros, by bit reversing...  */
        rbit    tmp1, tmp1
        /* Re-bias source.  */
@@ -145,5 +113,5 @@ def_fn strchrnul
        add     result, src, tmp1, lsr #1
        ret
 
-       .size   strchrnul, . - strchrnul
+END (strchrnul)
 #endif
diff --git a/newlib/libc/machine/aarch64/strcmp.S b/newlib/libc/machine/aarch64/strcmp.S
index e2bef2d49..691a1760e 100644
--- a/newlib/libc/machine/aarch64/strcmp.S
+++ b/newlib/libc/machine/aarch64/strcmp.S
@@ -1,202 +1,192 @@
-/* Copyright (c) 2012-2018, Linaro Limited
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
-/* Assumptions:
+/*
+ * strcmp - compare two strings
  *
- * ARMv8-a, AArch64
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
 /* See strcmp-stub.c  */
 #else
 
-       .macro def_fn f p2align=0
-       .text
-       .p2align \p2align
-       .global \f
-       .type \f, %function
-\f:
-       .endm
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64.
+ * MTE compatible.
+ */
 
-#define L(label) .L ## label
+#include "asmdefs.h"
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
 
-/* Parameters and result.  */
 #define src1           x0
 #define src2           x1
 #define result         x0
 
-/* Internal variables.  */
 #define data1          x2
 #define data1w         w2
 #define data2          x3
 #define data2w         w3
 #define has_nul                x4
 #define diff           x5
+#define off1           x5
 #define syndrome       x6
-#define tmp1           x7
-#define tmp2           x8
-#define tmp3           x9
-#define zeroones       x10
-#define pos            x11
-
-       /* Start of performance-critical section  -- one 64B cache line.  */
-def_fn strcmp p2align=6
-       eor     tmp1, src1, src2
-       mov     zeroones, #REP8_01
-       tst     tmp1, #7
+#define tmp            x6
+#define data3          x7
+#define zeroones       x8
+#define shift          x9
+#define off2           x10
+
+/* On big-endian early bytes are at MSB and on little-endian LSB.
+   LS_FW means shifting towards early bytes.  */
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+#else
+# define LS_FW lsr
+#endif
+
+/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+   can be done in parallel across the entire word.
+   Since carry propagation makes 0x1 bytes before a NUL byte appear
+   NUL too in big-endian, byte-reverse the data before the NUL check.  */
+
+
+ENTRY (strcmp)
+       PTR_ARG (0)
+       PTR_ARG (1)
+       sub     off2, src2, src1
+       mov     zeroones, REP8_01
+       and     tmp, src1, 7
+       tst     off2, 7
        b.ne    L(misaligned8)
-       ands    tmp1, src1, #7
-       b.ne    L(mutual_align)
-       /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-          (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-          can be done in parallel across the entire word.  */
+       cbnz    tmp, L(mutual_align)
+
+       .p2align 4
+
 L(loop_aligned):
-       ldr     data1, [src1], #8
-       ldr     data2, [src2], #8
+       ldr     data2, [src1, off2]
+       ldr     data1, [src1], 8
 L(start_realigned):
-       sub     tmp1, data1, zeroones
-       orr     tmp2, data1, #REP8_7f
-       eor     diff, data1, data2      /* Non-zero if differences found.  */
-       bic     has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
+#ifdef __AARCH64EB__
+       rev     tmp, data1
+       sub     has_nul, tmp, zeroones
+       orr     tmp, tmp, REP8_7f
+#else
+       sub     has_nul, data1, zeroones
+       orr     tmp, data1, REP8_7f
+#endif
+       bics    has_nul, has_nul, tmp   /* Non-zero if NUL terminator.  */
+       ccmp    data1, data2, 0, eq
+       b.eq    L(loop_aligned)
+#ifdef __AARCH64EB__
+       rev     has_nul, has_nul
+#endif
+       eor     diff, data1, data2
        orr     syndrome, diff, has_nul
-       cbz     syndrome, L(loop_aligned)
-       /* End of performance-critical section  -- one 64B cache line.  */
-
 L(end):
-#ifndef        __AARCH64EB__
+#ifndef __AARCH64EB__
        rev     syndrome, syndrome
        rev     data1, data1
-       /* The MS-non-zero bit of the syndrome marks either the first bit
-          that is different, or the top bit of the first zero byte.
-          Shifting left now will bring the critical information into the
-          top bits.  */
-       clz     pos, syndrome
        rev     data2, data2
-       lsl     data1, data1, pos
-       lsl     data2, data2, pos
-       /* But we need to zero-extend (char is unsigned) the value and then
-          perform a signed 32-bit subtraction.  */
-       lsr     data1, data1, #56
-       sub     result, data1, data2, lsr #56
-       ret
-#else
-       /* For big-endian we cannot use the trick with the syndrome value
-          as carry-propagation can corrupt the upper bits if the trailing
-          bytes in the string contain 0x01.  */
-       /* However, if there is no NUL byte in the dword, we can generate
-          the result directly.  We can't just subtract the bytes as the
-          MSB might be significant.  */
-       cbnz    has_nul, 1f
-       cmp     data1, data2
-       cset    result, ne
-       cneg    result, result, lo
-       ret
-1:
-       /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
-       rev     tmp3, data1
-       sub     tmp1, tmp3, zeroones
-       orr     tmp2, tmp3, #REP8_7f
-       bic     has_nul, tmp1, tmp2
-       rev     has_nul, has_nul
-       orr     syndrome, diff, has_nul
-       clz     pos, syndrome
-       /* The MS-non-zero bit of the syndrome marks either the first bit
-          that is different, or the top bit of the first zero byte.
+#endif
+       clz     shift, syndrome
+       /* The most-significant-non-zero bit of the syndrome marks either the
+          first bit that is different, or the top bit of the first zero byte.
           Shifting left now will bring the critical information into the
           top bits.  */
-       lsl     data1, data1, pos
-       lsl     data2, data2, pos
+       lsl     data1, data1, shift
+       lsl     data2, data2, shift
        /* But we need to zero-extend (char is unsigned) the value and then
           perform a signed 32-bit subtraction.  */
-       lsr     data1, data1, #56
-       sub     result, data1, data2, lsr #56
+       lsr     data1, data1, 56
+       sub     result, data1, data2, lsr 56
        ret
-#endif
+
+       .p2align 4
 
 L(mutual_align):
        /* Sources are mutually aligned, but are not currently at an
           alignment boundary.  Round down the addresses and then mask off
-          the bytes that preceed the start point.  */
-       bic     src1, src1, #7
-       bic     src2, src2, #7
-       lsl     tmp1, tmp1, #3          /* Bytes beyond alignment -> bits.  */
-       ldr     data1, [src1], #8
-       neg     tmp1, tmp1              /* Bits to alignment -64.  */
-       ldr     data2, [src2], #8
-       mov     tmp2, #~0
-#ifdef __AARCH64EB__
-       /* Big-endian.  Early bytes are at MSB.  */
-       lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
-#else
-       /* Little-endian.  Early bytes are at LSB.  */
-       lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
-#endif
-       orr     data1, data1, tmp2
-       orr     data2, data2, tmp2
+          the bytes that precede the start point.  */
+       bic     src1, src1, 7
+       ldr     data2, [src1, off2]
+       ldr     data1, [src1], 8
+       neg     shift, src2, lsl 3      /* Bits to alignment -64.  */
+       mov     tmp, -1
+       LS_FW   tmp, tmp, shift
+       orr     data1, data1, tmp
+       orr     data2, data2, tmp
        b       L(start_realigned)
 
 L(misaligned8):
        /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
-          checking to make sure that we don't access beyond page boundary in
-          SRC2.  */
-       tst     src1, #7
-       b.eq    L(loop_misaligned)
+          checking to make sure that we don't access beyond the end of SRC2.  */
+       cbz     tmp, L(src1_aligned)
 L(do_misaligned):
-       ldrb    data1w, [src1], #1
-       ldrb    data2w, [src2], #1
-       cmp     data1w, #1
-       ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
+       ldrb    data1w, [src1], 1
+       ldrb    data2w, [src2], 1
+       cmp     data1w, 0
+       ccmp    data1w, data2w, 0, ne   /* NZCV = 0b0000.  */
        b.ne    L(done)
-       tst     src1, #7
+       tst     src1, 7
        b.ne    L(do_misaligned)
 
-L(loop_misaligned):
-       /* Test if we are within the last dword of the end of a 4K page.  If
-          yes then jump back to the misaligned loop to copy a byte at a time.  */
-       and     tmp1, src2, #0xff8
-       eor     tmp1, tmp1, #0xff8
-       cbz     tmp1, L(do_misaligned)
-       ldr     data1, [src1], #8
-       ldr     data2, [src2], #8
-
-       sub     tmp1, data1, zeroones
-       orr     tmp2, data1, #REP8_7f
-       eor     diff, data1, data2      /* Non-zero if differences found.  */
-       bic     has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
+L(src1_aligned):
+       neg     shift, src2, lsl 3
+       bic     src2, src2, 7
+       ldr     data3, [src2], 8
+#ifdef __AARCH64EB__
+       rev     data3, data3
+#endif
+       lsr     tmp, zeroones, shift
+       orr     data3, data3, tmp
+       sub     has_nul, data3, zeroones
+       orr     tmp, data3, REP8_7f
+       bics    has_nul, has_nul, tmp
+       b.ne    L(tail)
+
+       sub     off1, src2, src1
+
+       .p2align 4
+
+L(loop_unaligned):
+       ldr     data3, [src1, off1]
+       ldr     data2, [src1, off2]
+#ifdef __AARCH64EB__
+       rev     data3, data3
+#endif
+       sub     has_nul, data3, zeroones
+       orr     tmp, data3, REP8_7f
+       ldr     data1, [src1], 8
+       bics    has_nul, has_nul, tmp
+       ccmp    data1, data2, 0, eq
+       b.eq    L(loop_unaligned)
+
+       lsl     tmp, has_nul, shift
+#ifdef __AARCH64EB__
+       rev     tmp, tmp
+#endif
+       eor     diff, data1, data2
+       orr     syndrome, diff, tmp
+       cbnz    syndrome, L(end)
+L(tail):
+       ldr     data1, [src1]
+       neg     shift, shift
+       lsr     data2, data3, shift
+       lsr     has_nul, has_nul, shift
+#ifdef __AARCH64EB__
+       rev     data2, data2
+       rev     has_nul, has_nul
+#endif
+       eor     diff, data1, data2
        orr     syndrome, diff, has_nul
-       cbz     syndrome, L(loop_misaligned)
        b       L(end)
 
 L(done):
        sub     result, data1, data2
        ret
-       .size   strcmp, .-strcmp
 
+END (strcmp)
 #endif
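
The NUL-detection identity quoted in the new header comment, (X - 1) & (~X) & 0x80 per byte, is what both the aligned and src1-aligned loops compute with sub, orr and bics. In scalar C it is the classic zero-byte test, sketched here with names of our own:

  #include <stdbool.h>
  #include <stdint.h>

  #define REP8_01 0x0101010101010101ULL
  #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

  /* Illustrative scalar version of the NUL test used by strcmp:
     (X - 1) & ~(X | 0x7f) is non-zero exactly when X has a zero byte.  */
  static bool
  has_zero_byte (uint64_t x)
  {
    return ((x - REP8_01) & ~(x | REP8_7f)) != 0;
  }

Using bics rather than bic lets the same instruction set the flags, so the following ccmp of data1 against data2 folds the difference test into the loop exit condition.
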
diff --git a/newlib/libc/machine/aarch64/strcpy.S b/newlib/libc/machine/aarch64/strcpy.S
index e5405f253..57c46f390 100644
--- a/newlib/libc/machine/aarch64/strcpy.S
+++ b/newlib/libc/machine/aarch64/strcpy.S
@@ -1,341 +1,160 @@
 /*
-   strcpy/stpcpy - copy a string returning pointer to start/end.
-
-   Copyright (c) 2013, 2014, 2015 ARM Ltd.
-   All Rights Reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the company nor the names of its contributors
-         may be used to endorse or promote products derived from this
-         software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
-
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
 /* See strchr-stub.c  */
 #else
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
  */
 
-/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
+#include "asmdefs.h"
 
-   To test the page crossing code path more thoroughly, compile with
-   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
-   entry path.  This option is not intended for production use.  */
-
-/* Arguments and results.  */
 #define dstin          x0
 #define srcin          x1
+#define result         x0
 
-/* Locals and temporaries.  */
 #define src            x2
 #define dst            x3
-#define data1          x4
-#define data1w         w4
-#define data2          x5
-#define data2w         w5
-#define has_nul1       x6
-#define has_nul2       x7
-#define tmp1           x8
-#define tmp2           x9
-#define tmp3           x10
-#define tmp4           x11
-#define zeroones       x12
-#define data1a         x13
-#define data2a         x14
-#define pos            x15
-#define len            x16
-#define to_align       x17
+#define len            x4
+#define synd           x4
+#define        tmp             x5
+#define shift          x5
+#define data1          x6
+#define dataw1         w6
+#define data2          x7
+#define dataw2         w7
+
+#define dataq          q0
+#define vdata          v0
+#define vhas_nul       v1
+#define vend           v2
+#define dend           d2
+#define dataq2         q1
 
 #ifdef BUILD_STPCPY
-#define STRCPY stpcpy
+# define STRCPY stpcpy
+# define IFSTPCPY(X,...) X,__VA_ARGS__
 #else
-#define STRCPY strcpy
+# define STRCPY strcpy
+# define IFSTPCPY(X,...)
 #endif
 
-       .macro def_fn f p2align=0
-       .text
-       .p2align \p2align
-       .global \f
-       .type \f, %function
-\f:
-       .endm
-
-       /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-          (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-          can be done in parallel across the entire word.  */
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
-       /* AArch64 systems have a minimum page size of 4k.  We can do a quick
-          page size check for crossing this boundary on entry and if we
-          do not, then we can short-circuit much of the entry code.  We
-          expect early page-crossing strings to be rare (probability of
-          16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
-          predictable, even with random strings.
-
-          We don't bother checking for larger page sizes, the cost of setting
-          up the correct page size is just not worth the extra gain from
-          a small reduction in the cases taking the slow path.  Note that
-          we only care about whether the first fetch, which may be
-          misaligned, crosses a page boundary - after that we move to aligned
-          fetches for the remainder of the string.  */
-
-#ifdef STRCPY_TEST_PAGE_CROSS
-       /* Make everything that isn't Qword aligned look like a page cross.  */
-#define MIN_PAGE_P2 4
-#else
-#define MIN_PAGE_P2 12
-#endif
-
-#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
-
-def_fn STRCPY p2align=6
-       /* For moderately short strings, the fastest way to do the copy is to
-          calculate the length of the string in the same way as strlen, then
-          essentially do a memcpy of the result.  This avoids the need for
-          multiple byte copies and further means that by the time we
-          reach the bulk copy loop we know we can always use DWord
-          accesses.  We expect strcpy to rarely be called repeatedly
-          with the same source string, so branch prediction is likely to
-          always be difficult - we mitigate against this by preferring
-          conditional select operations over branches whenever this is
-          feasible.  */
-       and     tmp2, srcin, #(MIN_PAGE_SIZE - 1)
-       mov     zeroones, #REP8_01
-       and     to_align, srcin, #15
-       cmp     tmp2, #(MIN_PAGE_SIZE - 16)
-       neg     tmp1, to_align
-       /* The first fetch will straddle a (possible) page boundary iff
-          srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
-          aligned string will never fail the page align check, so will
-          always take the fast path.  */
-       b.gt    .Lpage_cross
-
-.Lpage_cross_ok:
-       ldp     data1, data2, [srcin]
-#ifdef __AARCH64EB__
-       /* Because we expect the end to be found within 16 characters
-          (profiling shows this is the most common case), it's worth
-          swapping the bytes now to save having to recalculate the
-          termination syndrome later.  We preserve data1 and data2
-          so that we can re-use the values later on.  */
-       rev     tmp2, data1
-       sub     tmp1, tmp2, zeroones
-       orr     tmp2, tmp2, #REP8_7f
-       bics    has_nul1, tmp1, tmp2
-       b.ne    .Lfp_le8
-       rev     tmp4, data2
-       sub     tmp3, tmp4, zeroones
-       orr     tmp4, tmp4, #REP8_7f
-#else
-       sub     tmp1, data1, zeroones
-       orr     tmp2, data1, #REP8_7f
-       bics    has_nul1, tmp1, tmp2
-       b.ne    .Lfp_le8
-       sub     tmp3, data2, zeroones
-       orr     tmp4, data2, #REP8_7f
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
+
+ENTRY (STRCPY)
+       PTR_ARG (0)
+       PTR_ARG (1)
+       bic     src, srcin, 15
+       ld1     {vdata.16b}, [src]
+       cmeq    vhas_nul.16b, vdata.16b, 0
+       lsl     shift, srcin, 2
+       shrn    vend.8b, vhas_nul.8h, 4
+       fmov    synd, dend
+       lsr     synd, synd, shift
+       cbnz    synd, L(tail)
+
+       ldr     dataq, [src, 16]!
+       cmeq    vhas_nul.16b, vdata.16b, 0
+       shrn    vend.8b, vhas_nul.8h, 4
+       fmov    synd, dend
+       cbz     synd, L(start_loop)
+
+#ifndef __AARCH64EB__
+       rbit    synd, synd
 #endif
-       bics    has_nul2, tmp3, tmp4
-       b.eq    .Lbulk_entry
+       sub     tmp, src, srcin
+       clz     len, synd
+       add     len, tmp, len, lsr 2
+       tbz     len, 4, L(less16)
+       sub     tmp, len, 15
+       ldr     dataq, [srcin]
+       ldr     dataq2, [srcin, tmp]
+       str     dataq, [dstin]
+       str     dataq2, [dstin, tmp]
+       IFSTPCPY (add result, dstin, len)
+       ret
 
-       /* The string is short (<=16 bytes).  We don't know exactly how
-          short though, yet.  Work out the exact length so that we can
-          quickly select the optimal copy strategy.  */
-.Lfp_gt8:
-       rev     has_nul2, has_nul2
-       clz     pos, has_nul2
-       mov     tmp2, #56
-       add     dst, dstin, pos, lsr #3         /* Bits to bytes.  */
-       sub     pos, tmp2, pos
-#ifdef __AARCH64EB__
-       lsr     data2, data2, pos
-#else
-       lsl     data2, data2, pos
-#endif
-       str     data2, [dst, #1]
+L(tail):
+       rbit    synd, synd
+       clz     len, synd
+       lsr     len, len, 2
+L(less16):
+       tbz     len, 3, L(less8)
+       sub     tmp, len, 7
+       ldr     data1, [srcin]
+       ldr     data2, [srcin, tmp]
        str     data1, [dstin]
-#ifdef BUILD_STPCPY
-       add     dstin, dst, #8
-#endif
+       str     data2, [dstin, tmp]
+       IFSTPCPY (add result, dstin, len)
        ret
 
-.Lfp_le8:
-       rev     has_nul1, has_nul1
-       clz     pos, has_nul1
-       add     dst, dstin, pos, lsr #3         /* Bits to bytes.  */
-       subs    tmp2, pos, #24                  /* Pos in bits. */
-       b.lt    .Lfp_lt4
-#ifdef __AARCH64EB__
-       mov     tmp2, #56
-       sub     pos, tmp2, pos
-       lsr     data2, data1, pos
-       lsr     data1, data1, #32
-#else
-       lsr     data2, data1, tmp2
-#endif
-       /* 4->7 bytes to copy.  */
-       str     data2w, [dst, #-3]
-       str     data1w, [dstin]
-#ifdef BUILD_STPCPY
-       mov     dstin, dst
-#endif
-       ret
-.Lfp_lt4:
-       cbz     pos, .Lfp_lt2
-       /* 2->3 bytes to copy.  */
-#ifdef __AARCH64EB__
-       lsr     data1, data1, #48
-#endif
-       strh    data1w, [dstin]
-       /* Fall-through, one byte (max) to go.  */
-.Lfp_lt2:
-       /* Null-terminated string.  Last character must be zero!  */
-       strb    wzr, [dst]
-#ifdef BUILD_STPCPY
-       mov     dstin, dst
-#endif
+       .p2align 4
+L(less8):
+       subs    tmp, len, 3
+       b.lo    L(less4)
+       ldr     dataw1, [srcin]
+       ldr     dataw2, [srcin, tmp]
+       str     dataw1, [dstin]
+       str     dataw2, [dstin, tmp]
+       IFSTPCPY (add result, dstin, len)
        ret
 
-       .p2align 6
-       /* Aligning here ensures that the entry code and main loop all lies
-          within one 64-byte cache line.  */
-.Lbulk_entry:
-       sub     to_align, to_align, #16
-       stp     data1, data2, [dstin]
-       sub     src, srcin, to_align
-       sub     dst, dstin, to_align
-       b       .Lentry_no_page_cross
-
-       /* The inner loop deals with two Dwords at a time.  This has a
-          slightly higher start-up cost, but we should win quite quickly,
-          especially on cores with a high number of issue slots per
-          cycle, as we get much better parallelism out of the operations.  */
-.Lmain_loop:
-       stp     data1, data2, [dst], #16
-.Lentry_no_page_cross:
-       ldp     data1, data2, [src], #16
-       sub     tmp1, data1, zeroones
-       orr     tmp2, data1, #REP8_7f
-       sub     tmp3, data2, zeroones
-       orr     tmp4, data2, #REP8_7f
-       bic     has_nul1, tmp1, tmp2
-       bics    has_nul2, tmp3, tmp4
-       ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
-       b.eq    .Lmain_loop
-
-       /* Since we know we are copying at least 16 bytes, the fastest way
-          to deal with the tail is to determine the location of the
-          trailing NUL, then (re)copy the 16 bytes leading up to that.  */
-       cmp     has_nul1, #0
-#ifdef __AARCH64EB__
-       /* For big-endian, carry propagation (if the final byte in the
-          string is 0x01) means we cannot use has_nul directly.  The
-          easiest way to get the correct byte is to byte-swap the data
-          and calculate the syndrome a second time.  */
-       csel    data1, data1, data2, ne
-       rev     data1, data1
-       sub     tmp1, data1, zeroones
-       orr     tmp2, data1, #REP8_7f
-       bic     has_nul1, tmp1, tmp2
-#else
-       csel    has_nul1, has_nul1, has_nul2, ne
-#endif
-       rev     has_nul1, has_nul1
-       clz     pos, has_nul1
-       add     tmp1, pos, #72
-       add     pos, pos, #8
-       csel    pos, pos, tmp1, ne
-       add     src, src, pos, lsr #3
-       add     dst, dst, pos, lsr #3
-       ldp     data1, data2, [src, #-32]
-       stp     data1, data2, [dst, #-16]
-#ifdef BUILD_STPCPY
-       sub     dstin, dst, #1
-#endif
+L(less4):
+       cbz     len, L(zerobyte)
+       ldrh    dataw1, [srcin]
+       strh    dataw1, [dstin]
+L(zerobyte):
+       strb    wzr, [dstin, len]
+       IFSTPCPY (add result, dstin, len)
        ret
 
-.Lpage_cross:
-       bic     src, srcin, #15
-       /* Start by loading two words at [srcin & ~15], then forcing the
-          bytes that precede srcin to 0xff.  This means they never look
-          like termination bytes.  */
-       ldp     data1, data2, [src]
-       lsl     tmp1, tmp1, #3  /* Bytes beyond alignment -> bits.  */
-       tst     to_align, #7
-       csetm   tmp2, ne
-#ifdef __AARCH64EB__
-       lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
-#else
-       lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
+       .p2align 4
+L(start_loop):
+       sub     tmp, srcin, dstin
+       ldr     dataq2, [srcin]
+       sub     dst, src, tmp
+       str     dataq2, [dstin]
+L(loop):
+       str     dataq, [dst], 32
+       ldr     dataq, [src, 16]
+       cmeq    vhas_nul.16b, vdata.16b, 0
+       umaxp   vend.16b, vhas_nul.16b, vhas_nul.16b
+       fmov    synd, dend
+       cbnz    synd, L(loopend)
+       str     dataq, [dst, -16]
+       ldr     dataq, [src, 32]!
+       cmeq    vhas_nul.16b, vdata.16b, 0
+       umaxp   vend.16b, vhas_nul.16b, vhas_nul.16b
+       fmov    synd, dend
+       cbz     synd, L(loop)
+       add     dst, dst, 16
+L(loopend):
+       shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
+       fmov    synd, dend
+       sub     dst, dst, 31
+#ifndef __AARCH64EB__
+       rbit    synd, synd
 #endif
-       orr     data1, data1, tmp2
-       orr     data2a, data2, tmp2
-       cmp     to_align, #8
-       csinv   data1, data1, xzr, lt
-       csel    data2, data2, data2a, lt
-       sub     tmp1, data1, zeroones
-       orr     tmp2, data1, #REP8_7f
-       sub     tmp3, data2, zeroones
-       orr     tmp4, data2, #REP8_7f
-       bic     has_nul1, tmp1, tmp2
-       bics    has_nul2, tmp3, tmp4
-       ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
-       b.eq    .Lpage_cross_ok
-       /* We now need to make data1 and data2 look like they've been
-          loaded directly from srcin.  Do a rotate on the 128-bit value.  */
-       lsl     tmp1, to_align, #3      /* Bytes->bits.  */
-       neg     tmp2, to_align, lsl #3
-#ifdef __AARCH64EB__
-       lsl     data1a, data1, tmp1
-       lsr     tmp4, data2, tmp2
-       lsl     data2, data2, tmp1
-       orr     tmp4, tmp4, data1a
-       cmp     to_align, #8
-       csel    data1, tmp4, data2, lt
-       rev     tmp2, data1
-       rev     tmp4, data2
-       sub     tmp1, tmp2, zeroones
-       orr     tmp2, tmp2, #REP8_7f
-       sub     tmp3, tmp4, zeroones
-       orr     tmp4, tmp4, #REP8_7f
-#else
-       lsr     data1a, data1, tmp1
-       lsl     tmp4, data2, tmp2
-       lsr     data2, data2, tmp1
-       orr     tmp4, tmp4, data1a
-       cmp     to_align, #8
-       csel    data1, tmp4, data2, lt
-       sub     tmp1, data1, zeroones
-       orr     tmp2, data1, #REP8_7f
-       sub     tmp3, data2, zeroones
-       orr     tmp4, data2, #REP8_7f
-#endif
-       bic     has_nul1, tmp1, tmp2
-       cbnz    has_nul1, .Lfp_le8
-       bic     has_nul2, tmp3, tmp4
-       b       .Lfp_gt8
+       clz     len, synd
+       lsr     len, len, 2
+       add     dst, dst, len
+       ldr     dataq, [dst, tmp]
+       str     dataq, [dst]
+       IFSTPCPY (add result, dst, 15)
+       ret
 
-       .size   STRCPY, . - STRCPY
+END (STRCPY)
 #endif
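
The header comment of the new strcpy describes the nibble-mask trick: cmeq gives 0xff per matching byte, shrn by 4 keeps four of those bits per byte in string order, and a zero count over the resulting syndrome divided by four is the byte index. The same idea with ACLE NEON intrinsics, little-endian assumed and with an illustrative helper name:

  #include <arm_neon.h>
  #include <stddef.h>
  #include <stdint.h>

  /* Illustrative helper: index of the first NUL in a 16-byte chunk, or 16
     when there is none, using the same shrn-by-4 nibble mask as strcpy.  */
  static size_t
  first_nul_index (uint8x16_t chunk)
  {
    uint8x16_t cmp = vceqq_u8 (chunk, vdupq_n_u8 (0));   /* 0xff per NUL byte */
    /* Shift each 16-bit lane right by 4 and narrow: 4 bits survive per
       comparison byte, and the nibble order matches the byte order.  */
    uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
    uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);

    if (synd == 0)
      return 16;
    return (size_t) __builtin_ctzll (synd) >> 2;         /* 4 bits per byte */
  }

This is the offset the L(tail) path derives with rbit, clz and lsr len, len, 2.
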
diff --git a/newlib/libc/machine/aarch64/strlen.S b/newlib/libc/machine/aarch64/strlen.S
index 872d136ef..68a6f357c 100644
--- a/newlib/libc/machine/aarch64/strlen.S
+++ b/newlib/libc/machine/aarch64/strlen.S
@@ -1,115 +1,92 @@
-/* Copyright (c) 2013-2015, Linaro Limited
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-        notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-        notice, this list of conditions and the following disclaimer in the
-        documentation and/or other materials provided with the distribution.
-       * Neither the name of the Linaro nor the
-        names of its contributors may be used to endorse or promote products
-        derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
+/*
+ * strlen - calculate the length of a string.
+ *
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
 /* See strlen-stub.c  */
 #else
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
  */
 
-/* To test the page crossing code path more thoroughly, compile with
-   -DTEST_PAGE_CROSS - this will force all calls through the slower
-   entry path.  This option is not intended for production use.         */
-
-/* Arguments and results.  */
-#define srcin          x0
-#define len            x0
-
-/* Locals and temporaries.  */
-#define src            x1
-#define data1          x2
-#define data2          x3
-#define has_nul1       x4
-#define has_nul2       x5
-#define tmp1           x4
-#define tmp2           x5
-#define tmp3           x6
-#define tmp4           x7
-#define zeroones       x8
-
-#define L(l) .L ## l
-
-       .macro def_fn f p2align=0
-       .text
-       .p2align \p2align
-       .global \f
-       .type \f, %function
-\f:
-       .endm
-
-       /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-          (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-          can be done in parallel across the entire word. A faster check
-          (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
-          false hits for characters 129..255.  */
+#include "asmdefs.h"
+
+#define srcin  x0
+#define len    x0
+
+#define src    x1
+#define data1  x2
+#define data2  x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1   x4
+#define tmp2   x5
+#define tmp3   x6
+#define tmp4   x7
+#define zeroones x8
+
+#define maskv  v0
+#define maskd  d0
+#define dataq1 q1
+#define dataq2 q2
+#define datav1 v1
+#define datav2 v2
+#define tmp    x2
+#define tmpw   w2
+#define synd   x3
+#define syndw  w3
+#define shift  x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+   byte is zero, and can be done in parallel across the entire word.  */
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
+
+/* To test the page crossing code path more thoroughly, compile with
+   -DTEST_PAGE_CROSS - this will force all calls through the slower
+   entry path.  This option is not intended for production use.  */
 
 #ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 15
+# define MIN_PAGE_SIZE 32
 #else
 # define MIN_PAGE_SIZE 4096
 #endif
 
-       /* Since strings are short on average, we check the first 16 bytes
-          of the string for a NUL character.  In order to do an unaligned ldp
-          safely we have to do a page cross check first.  If there is a NUL
-          byte we calculate the length from the 2 8-byte words using
-          conditional select to reduce branch mispredictions (it is unlikely
-          strlen will be repeatedly called on strings with the same length).
-
-          If the string is longer than 16 bytes, we align src so don't need
-          further page cross checks, and process 32 bytes per iteration
-          using the fast NUL check.  If we encounter non-ASCII characters,
-          fallback to a second loop using the full NUL check.
-
-          If the page cross check fails, we read 16 bytes from an aligned
-          address, remove any characters before the string, and continue
-          in the main loop using aligned loads.  Since strings crossing a
-          page in the first 16 bytes are rare (probability of
-          16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
-          AArch64 systems have a minimum page size of 4k.  We don't bother
-          checking for larger page sizes - the cost of setting up the correct
-          page size is just not worth the extra gain from a small reduction in
-          the cases taking the slow path.  Note that we only care about
-          whether the first fetch, which may be misaligned, crosses a page
-          boundary.  */
-
-def_fn strlen p2align=6
+/* Core algorithm:
+
+   Since strings are short on average, we check the first 32 bytes of the
+   string for a NUL character without aligning the string.  In order to use
+   unaligned loads safely we must do a page cross check first.
+
+   If there is a NUL byte we calculate the length from the 2 8-byte words
+   using conditional select to reduce branch mispredictions (it is unlikely
+   strlen will be repeatedly called on strings with the same length).
+
+   If the string is longer than 32 bytes, align src so we don't need further
+   page cross checks, and process 32 bytes per iteration using a fast SIMD
+   loop.
+
+   If the page cross check fails, we read 32 bytes from an aligned address,
+   and ignore any characters before the string.  If it contains a NUL
+   character, return the length, if not, continue in the main loop.  */
+
+ENTRY (strlen)
+       PTR_ARG (0)
        and     tmp1, srcin, MIN_PAGE_SIZE - 1
-       mov     zeroones, REP8_01
-       cmp     tmp1, MIN_PAGE_SIZE - 16
-       b.gt    L(page_cross)
+       cmp     tmp1, MIN_PAGE_SIZE - 32
+       b.hi    L(page_cross)
+
+       /* Look for a NUL byte in the first 16 bytes.  */
        ldp     data1, data2, [srcin]
+       mov     zeroones, REP8_01
+
 #ifdef __AARCH64EB__
        /* For big-endian, carry propagation (if the final byte in the
           string is 0x01) means we cannot use has_nul1/2 directly.
@@ -125,114 +102,96 @@ def_fn strlen p2align=6
        bics    has_nul1, tmp1, tmp2
        bic     has_nul2, tmp3, tmp4
        ccmp    has_nul2, 0, 0, eq
-       beq     L(main_loop_entry)
+       b.eq    L(bytes16_31)
 
-       /* Enter with C = has_nul1 == 0.  */
+       /* Find the exact offset of the first NUL byte in the first 16 bytes
+          from the string start.  Enter with C = has_nul1 == 0.  */
        csel    has_nul1, has_nul1, has_nul2, cc
        mov     len, 8
        rev     has_nul1, has_nul1
-       clz     tmp1, has_nul1
        csel    len, xzr, len, cc
+       clz     tmp1, has_nul1
        add     len, len, tmp1, lsr 3
        ret
 
-       /* The inner loop processes 32 bytes per iteration and uses the fast
-          NUL check.  If we encounter non-ASCII characters, use a second
-          loop with the accurate NUL check.  */
-       .p2align 4
-L(main_loop_entry):
-       bic     src, srcin, 15
-       sub     src, src, 16
-L(main_loop):
-       ldp     data1, data2, [src, 32]!
-.Lpage_cross_entry:
-       sub     tmp1, data1, zeroones
-       sub     tmp3, data2, zeroones
-       orr     tmp2, tmp1, tmp3
-       tst     tmp2, zeroones, lsl 7
-       bne     1f
-       ldp     data1, data2, [src, 16]
+       /* Look for a NUL byte at offset 16..31 in the string.  */
+L(bytes16_31):
+       ldp     data1, data2, [srcin, 16]
+#ifdef __AARCH64EB__
+       rev     data1, data1
+       rev     data2, data2
+#endif
        sub     tmp1, data1, zeroones
-       sub     tmp3, data2, zeroones
-       orr     tmp2, tmp1, tmp3
-       tst     tmp2, zeroones, lsl 7
-       beq     L(main_loop)
-       add     src, src, 16
-1:
-       /* The fast check failed, so do the slower, accurate NUL check.  */
        orr     tmp2, data1, REP8_7f
+       sub     tmp3, data2, zeroones
        orr     tmp4, data2, REP8_7f
        bics    has_nul1, tmp1, tmp2
        bic     has_nul2, tmp3, tmp4
        ccmp    has_nul2, 0, 0, eq
-       beq     L(nonascii_loop)
+       b.eq    L(loop_entry)
 
-       /* Enter with C = has_nul1 == 0.  */
-L(tail):
-#ifdef __AARCH64EB__
-       /* For big-endian, carry propagation (if the final byte in the
-          string is 0x01) means we cannot use has_nul1/2 directly.  The
-          easiest way to get the correct byte is to byte-swap the data
-          and calculate the syndrome a second time.  */
-       csel    data1, data1, data2, cc
-       rev     data1, data1
-       sub     tmp1, data1, zeroones
-       orr     tmp2, data1, REP8_7f
-       bic     has_nul1, tmp1, tmp2
-#else
+       /* Find the exact offset of the first NUL byte at offset 16..31 from
+          the string start.  Enter with C = has_nul1 == 0.  */
        csel    has_nul1, has_nul1, has_nul2, cc
-#endif
-       sub     len, src, srcin
+       mov     len, 24
        rev     has_nul1, has_nul1
-       add     tmp2, len, 8
+       mov     tmp3, 16
        clz     tmp1, has_nul1
-       csel    len, len, tmp2, cc
+       csel    len, tmp3, len, cc
        add     len, len, tmp1, lsr 3
        ret
 
-L(nonascii_loop):
-       ldp     data1, data2, [src, 16]!
-       sub     tmp1, data1, zeroones
-       orr     tmp2, data1, REP8_7f
-       sub     tmp3, data2, zeroones
-       orr     tmp4, data2, REP8_7f
-       bics    has_nul1, tmp1, tmp2
-       bic     has_nul2, tmp3, tmp4
-       ccmp    has_nul2, 0, 0, eq
-       bne     L(tail)
-       ldp     data1, data2, [src, 16]!
-       sub     tmp1, data1, zeroones
-       orr     tmp2, data1, REP8_7f
-       sub     tmp3, data2, zeroones
-       orr     tmp4, data2, REP8_7f
-       bics    has_nul1, tmp1, tmp2
-       bic     has_nul2, tmp3, tmp4
-       ccmp    has_nul2, 0, 0, eq
-       beq     L(nonascii_loop)
-       b       L(tail)
+       nop
+L(loop_entry):
+       bic     src, srcin, 31
+
+       .p2align 5
+L(loop):
+       ldp     dataq1, dataq2, [src, 32]!
+       uminp   maskv.16b, datav1.16b, datav2.16b
+       uminp   maskv.16b, maskv.16b, maskv.16b
+       cmeq    maskv.8b, maskv.8b, 0
+       fmov    synd, maskd
+       cbz     synd, L(loop)
+
+       /* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
+       cmeq    maskv.16b, datav1.16b, 0
+       sub     len, src, srcin
+       cbnz    syndw, 1f
+       cmeq    maskv.16b, datav2.16b, 0
+       add     len, len, 16
+1:
+       /* Generate a bitmask and compute correct byte offset.  */
+       shrn    maskv.8b, maskv.8h, 4
+       fmov    synd, maskd
+#ifndef __AARCH64EB__
+       rbit    synd, synd
+#endif
+       clz     tmp, synd
+       add     len, len, tmp, lsr 2
+       ret
 
-       /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
-          srcin to 0x7f, so we ignore any NUL bytes before the string.
-          Then continue in the aligned loop.  */
 L(page_cross):
-       bic     src, srcin, 15
-       ldp     data1, data2, [src]
-       lsl     tmp1, srcin, 3
-       mov     tmp4, -1
-#ifdef __AARCH64EB__
-       /* Big-endian.  Early bytes are at MSB.  */
-       lsr     tmp1, tmp4, tmp1        /* Shift (tmp1 & 63).  */
[...]

[diff truncated at 100000 bytes]
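
The strlen hunk above (cut short by the archive's size limit) replaces the word-wise fast check in the main loop with a pairwise-minimum reduction over 32 bytes: two uminp instructions, a cmeq against zero and an fmov produce a 64-bit syndrome that is non-zero only when the block holds a NUL. A rough equivalent with ACLE NEON intrinsics, little-endian assumed and with a helper name of our own:

  #include <arm_neon.h>
  #include <stdbool.h>
  #include <stdint.h>

  /* Illustrative helper: true when the 32 bytes held in d1/d2 contain a
     NUL, using the same pairwise-minimum reduction as the new strlen
     main loop (uminp, uminp, cmeq, fmov).  */
  static bool
  block32_has_nul (uint8x16_t d1, uint8x16_t d2)
  {
    uint8x16_t m = vpminq_u8 (d1, d2);     /* 32 bytes -> 16 pairwise minima */
    m = vpminq_u8 (m, m);                  /* low 8 bytes now cover all 32   */
    uint8x8_t z = vceq_u8 (vget_low_u8 (m), vdup_n_u8 (0));
    return vget_lane_u64 (vreinterpret_u64_u8 (z), 0) != 0;
  }

Once the syndrome is non-zero, the code re-runs cmeq on each 16-byte half and uses the shrn/rbit/clz sequence shown above to turn it into a byte offset.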
