> Date: Mon, 9 Oct 2017 08:45:55 +0300
> From: Artturi Alm <artturi....@gmail.com>
> 
> Hi,
> 
> 
> has anyone looked at the netbsd xscale-versions of bcopyin/bcopyout/kcopy?
> 
> this is from netbsd bcopyinout.S:
> #if defined(__XSCALE__) || defined(_ARM_ARCH_6)
> /*
>  * armv6 and v7 have pld and strd so they can use the xscale
>  * bcopyinout as well.
>  */
> #include "bcopyinout_xscale.S"
> #else
> 
> untested diff below I just scavenged from one of my dead branches,
> just in case someone has the time and motivation to run it through some
> performance testing or whatever.

Please stop sending untested diffs.  Nobody has the motivation to look
at them, unless maybe they fix actual bugs.

> diff --git a/sys/arch/arm/arm/bcopyinout.S b/sys/arch/arm/arm/bcopyinout.S
> index 9a7d11865c0..bc2e58d22b7 100644
> --- a/sys/arch/arm/arm/bcopyinout.S
> +++ b/sys/arch/arm/arm/bcopyinout.S
> @@ -41,7 +41,7 @@
>  #include <machine/asm.h>
>  #include <arm/sysreg.h>
>  
> -#ifdef __XSCALE__
> +#ifdef CPU_ARMv7
>  #include "bcopyinout_xscale.S"
>  #else
>  
> diff --git a/sys/arch/arm/arm/bcopyinout_xscale.S b/sys/arch/arm/arm/bcopyinout_xscale.S
> new file mode 100644
> index 00000000000..2e740eb96c2
> --- /dev/null
> +++ b/sys/arch/arm/arm/bcopyinout_xscale.S
> @@ -0,0 +1,1139 @@
> +/* $OpenBSD$ */
> +/*   $NetBSD: bcopyinout_xscale.S,v 1.11 2013/12/01 02:54:33 joerg Exp $     */
> +
> +/*
> + * Copyright 2003 Wasabi Systems, Inc.
> + * All rights reserved.
> + *
> + * Written by Steve C. Woodford for Wasabi Systems, Inc.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + * 3. All advertising materials mentioning features or use of this software
> + *    must display the following acknowledgement:
> + *      This product includes software developed for the NetBSD Project by
> + *      Wasabi Systems, Inc.
> + * 4. The name of Wasabi Systems, Inc. may not be used to endorse
> + *    or promote products derived from this software without specific prior
> + *    written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
> + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
> + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
> + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
> + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
> + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
> + * POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +     .text
> +     .align  2
> +
> +/*
> + * copyin: copy r2 bytes from user space (r0) to kernel space (r1).
> + * Returns 0 in r0 on success or when r2 <= 0.  On a fault the stack is
> + * unwound at .Lcopyin_fault and r0 is returned as the fault path left
> + * it (presumably an errno from the abort handler -- confirm).
> + * While copying: r10 = pcb, r11 = saved onfault, r3 = push state.
> + */
> +ENTRY(copyin)
> +     cmp     r2, #0x00
> +     /*
> +      * The guts test the length with signed conditions (blt/bge); a
> +      * length with the top bit set would reach the computed jump at
> +      * .Lcopyin_l4_2 as a negative count, so reject it here too.
> +      */
> +     movle   r0, #0x00
> +     movle   pc, lr                  /* Bail early if length is <= 0 */
> +     push    {r10-r11, lr}
> +
> +     /* Get curcpu from TPIDRPRW. */
> +     mrc     CP15_TPIDRPRW(r10)
> +     ldr     r10, [r10, #CI_CURPCB]  /* r10 = curcpu's CI_CURPCB pointer */
> +
> +     mov     r3, #0x00               /* Guts have nothing pushed yet */
> +     adr     ip, .Lcopyin_fault
> +     ldr     r11, [r10, #PCB_ONFAULT]        /* r11 = previous onfault value */
> +     str     ip, [r10, #PCB_ONFAULT] /* Install .Lcopyin_fault */
> +     bl      .Lcopyin_guts
> +     str     r11, [r10, #PCB_ONFAULT]        /* Put previous value back */
> +     mov     r0, #0x00               /* Copy completed: return 0 */
> +     pop     {r10-r11, pc}
> +
> +.Lcopyin_fault:
> +     str     r11, [r10, #PCB_ONFAULT]        /* Put previous value back */
> +     cmp     r3, #0x00               /* What do the guts still have pushed? */
> +     popgt   {r4-r7}         /* r3 > 0 Restore r4-r7 */
> +     poplt   {r4-r9}         /* r3 < 0 Restore r4-r9 */
> +     pop     {r10-r11, pc}           /* r0 is not written on this path */
> +
> +.Lcopyin_guts:
> +     pld     [r0]
> +     /* Word-align the destination buffer */
> +     ands    ip, r1, #0x03           /* Already word aligned? */
> +     beq     .Lcopyin_wordaligned    /* Yup */
> +     rsb     ip, ip, #0x04           /* ip = bytes needed to align r1 */
> +     cmp     r2, ip                  /* Enough bytes left to align it? */
> +     blt     .Lcopyin_l4_2           /* Nope. Just copy bytewise */
> +     sub     r2, r2, ip
> +     rsbs    ip, ip, #0x03           /* ip = ldrbt/strb pairs to skip */
> +     addne   pc, pc, ip, lsl #3      /* Each pair is 8 bytes of code */
> +     nop                             /* pc above reads as the insn after this */
> +     ldrbt   ip, [r0], #0x01
> +     strb    ip, [r1], #0x01
> +     ldrbt   ip, [r0], #0x01
> +     strb    ip, [r1], #0x01
> +     ldrbt   ip, [r0], #0x01
> +     strb    ip, [r1], #0x01
> +     cmp     r2, #0x00               /* All done? */
> +     moveq   pc, lr
> +
> +     /* Destination buffer is now word aligned */
> +.Lcopyin_wordaligned:
> +     ands    ip, r0, #0x03           /* Is src also word-aligned? */
> +     bne     .Lcopyin_bad_align      /* Nope. Things just got bad */
> +     cmp     r2, #0x08               /* Less than 8 bytes remaining? */
> +     blt     .Lcopyin_w_less_than8
> +
> +     /* Quad-align the destination buffer */
> +     tst     r1, #0x07               /* Already quad aligned? */
> +     ldrtne  ip, [r0], #0x04         /* r3 is still 0 if this faults */
> +     push    {r4-r9}         /* Free up some registers */
> +     mov     r3, #-1                 /* Signal restore r4-r9 */
> +     tst     r1, #0x07               /* XXX: bug work-around */
> +     subne   r2, r2, #0x04
> +     strne   ip, [r1], #0x04
> +
> +     /* Destination buffer quad aligned, source is word aligned */
> +     subs    r2, r2, #0x80           /* Pre-subtract: at least 128 bytes left? */
> +     blt     .Lcopyin_w_lessthan128
> +
> +     /* Copy 128 bytes at a time */
> +.Lcopyin_w_loop128:
> +     ldrt    r4, [r0], #0x04         /* LD:00-03 */
> +     ldrt    r5, [r0], #0x04         /* LD:04-07 */
> +     pld     [r0, #0x18]             /* Prefetch 0x20 */
> +     ldrt    r6, [r0], #0x04         /* LD:08-0b */
> +     ldrt    r7, [r0], #0x04         /* LD:0c-0f */
> +     ldrt    r8, [r0], #0x04         /* LD:10-13 */
> +     ldrt    r9, [r0], #0x04         /* LD:14-17 */
> +     strd    r4, r5, [r1], #0x08     /* ST:00-07 */
> +     ldrt    r4, [r0], #0x04         /* LD:18-1b */
> +     ldrt    r5, [r0], #0x04         /* LD:1c-1f */
> +     strd    r6, r7, [r1], #0x08     /* ST:08-0f */
> +     ldrt    r6, [r0], #0x04         /* LD:20-23 */
> +     ldrt    r7, [r0], #0x04         /* LD:24-27 */
> +     pld     [r0, #0x18]             /* Prefetch 0x40 */
> +     strd    r8, r9, [r1], #0x08     /* ST:10-17 */
> +     ldrt    r8, [r0], #0x04         /* LD:28-2b */
> +     ldrt    r9, [r0], #0x04         /* LD:2c-2f */
> +     strd    r4, r5, [r1], #0x08     /* ST:18-1f */
> +     ldrt    r4, [r0], #0x04         /* LD:30-33 */
> +     ldrt    r5, [r0], #0x04         /* LD:34-37 */
> +     strd    r6, r7, [r1], #0x08             /* ST:20-27 */
> +     ldrt    r6, [r0], #0x04         /* LD:38-3b */
> +     ldrt    r7, [r0], #0x04         /* LD:3c-3f */
> +     strd    r8, r9, [r1], #0x08     /* ST:28-2f */
> +     ldrt    r8, [r0], #0x04         /* LD:40-43 */
> +     ldrt    r9, [r0], #0x04         /* LD:44-47 */
> +     pld     [r0, #0x18]             /* Prefetch 0x60 */
> +     strd    r4, r5, [r1], #0x08     /* ST:30-37 */
> +     ldrt    r4, [r0], #0x04         /* LD:48-4b */
> +     ldrt    r5, [r0], #0x04         /* LD:4c-4f */
> +     strd    r6, r7, [r1], #0x08     /* ST:38-3f */
> +     ldrt    r6, [r0], #0x04         /* LD:50-53 */
> +     ldrt    r7, [r0], #0x04         /* LD:54-57 */
> +     strd    r8, r9, [r1], #0x08     /* ST:40-47 */
> +     ldrt    r8, [r0], #0x04         /* LD:58-5b */
> +     ldrt    r9, [r0], #0x04         /* LD:5c-5f */
> +     strd    r4, r5, [r1], #0x08     /* ST:48-4f */
> +     ldrt    r4, [r0], #0x04         /* LD:60-63 */
> +     ldrt    r5, [r0], #0x04         /* LD:64-67 */
> +     pld     [r0, #0x18]             /* Prefetch 0x80 */
> +     strd    r6, r7, [r1], #0x08     /* ST:50-57 */
> +     ldrt    r6, [r0], #0x04         /* LD:68-6b */
> +     ldrt    r7, [r0], #0x04         /* LD:6c-6f */
> +     strd    r8, r9, [r1], #0x08     /* ST:58-5f */
> +     ldrt    r8, [r0], #0x04         /* LD:70-73 */
> +     ldrt    r9, [r0], #0x04         /* LD:74-77 */
> +     strd    r4, r5, [r1], #0x08     /* ST:60-67 */
> +     ldrt    r4, [r0], #0x04         /* LD:78-7b */
> +     ldrt    r5, [r0], #0x04         /* LD:7c-7f */
> +     strd    r6, r7, [r1], #0x08     /* ST:68-6f */
> +     strd    r8, r9, [r1], #0x08     /* ST:70-77 */
> +     subs    r2, r2, #0x80           /* Sets the flags for the bge below */
> +     strd    r4, r5, [r1], #0x08     /* ST:78-7f */
> +     bge     .Lcopyin_w_loop128
> +
> +.Lcopyin_w_lessthan128:
> +     adds    r2, r2, #0x80           /* Adjust for extra sub */
> +     popeq   {r4-r9}
> +     moveq   pc, lr                  /* Return now if done */
> +     subs    r2, r2, #0x20           /* Pre-subtract: at least 32 bytes left? */
> +     blt     .Lcopyin_w_lessthan32
> +
> +     /* Copy 32 bytes at a time */
> +.Lcopyin_w_loop32:
> +     ldrt    r4, [r0], #0x04
> +     ldrt    r5, [r0], #0x04
> +     pld     [r0, #0x18]
> +     ldrt    r6, [r0], #0x04
> +     ldrt    r7, [r0], #0x04
> +     ldrt    r8, [r0], #0x04
> +     ldrt    r9, [r0], #0x04
> +     strd    r4, r5, [r1], #0x08
> +     ldrt    r4, [r0], #0x04
> +     ldrt    r5, [r0], #0x04
> +     strd    r6, r7, [r1], #0x08
> +     strd    r8, r9, [r1], #0x08
> +     subs    r2, r2, #0x20           /* Sets the flags for the bge below */
> +     strd    r4, r5, [r1], #0x08
> +     bge     .Lcopyin_w_loop32
> +
> +.Lcopyin_w_lessthan32:
> +     adds    r2, r2, #0x20           /* Adjust for extra sub */
> +     popeq   {r4-r9}
> +     moveq   pc, lr                  /* Return now if done */
> +
> +     and     r4, r2, #0x18           /* r4 = bytes to copy in 8-byte steps */
> +     rsb     r5, r4, #0x18           /* r5 = 8-byte steps not needed, in bytes */
> +     subs    r2, r2, r4              /* r2 = bytes left over; Z is used after the pop */
> +     add     pc, pc, r5, lsl #1      /* Skip one 16-byte code block per unused step */
> +     nop
> +
> +     /* At least 24 bytes remaining */
> +     ldrt    r4, [r0], #0x04
> +     ldrt    r5, [r0], #0x04
> +     nop                             /* Presumably pads the block to 16 bytes */
> +     strd    r4, r5, [r1], #0x08
> +
> +     /* At least 16 bytes remaining */
> +     ldrt    r4, [r0], #0x04
> +     ldrt    r5, [r0], #0x04
> +     nop
> +     strd    r4, r5, [r1], #0x08
> +
> +     /* At least 8 bytes remaining */
> +     ldrt    r4, [r0], #0x04
> +     ldrt    r5, [r0], #0x04
> +     nop
> +     strd    r4, r5, [r1], #0x08
> +
> +     /* Less than 8 bytes remaining */
> +     pop     {r4-r9}
> +     moveq   pc, lr                  /* Return now if done */
> +     mov     r3, #0x00               /* r4-r9 are no longer on the stack */
> +
> +.Lcopyin_w_less_than8:
> +     subs    r2, r2, #0x04           /* At least one whole word left? */
> +     ldrtge  ip, [r0], #0x04
> +     strge   ip, [r1], #0x04
> +     moveq   pc, lr                  /* Return now if done */
> +     addlt   r2, r2, #0x04           /* No word copied: undo the sub */
> +     ldrbt   ip, [r0], #0x01         /* 1st trailing byte, always copied */
> +     cmp     r2, #0x02
> +     ldrbtge r2, [r0], #0x01         /* 2nd byte if r2 >= 2 (r2 is reused) */
> +     strb    ip, [r1], #0x01
> +     ldrbtgt ip, [r0]                /* 3rd byte if r2 > 2 */
> +     strbge  r2, [r1], #0x01
> +     strbgt  ip, [r1]
> +     RET
> +
> +/*
> + * At this point, it has not been possible to word align both buffers.
> + * The destination buffer (r1) is word aligned, but the source buffer
> + * (r0) is not.
> + */
> +.Lcopyin_bad_align:
> +     push    {r4-r7}
> +     mov     r3, #0x01               /* Signal restore r4-r7 */
> +     bic     r0, r0, #0x03           /* Round r0 down to a word boundary */
> +     cmp     ip, #2                  /* ip = r0 & 3 from .Lcopyin_wordaligned */
> +     ldrt    ip, [r0], #0x04         /* Prime ip with the first source word */
> +     bgt     .Lcopyin_bad3           /* Source offset 3 */
> +     beq     .Lcopyin_bad2           /* Source offset 2 */
> +     b       .Lcopyin_bad1           /* Source offset 1 */
> +
> +.Lcopyin_bad1_loop16:
> +     mov     r4, ip, lsr #8          /* Upper 24 bits of the previous word */
> +     ldrt    r5, [r0], #0x04
> +     pld     [r0, #0x018]
> +     ldrt    r6, [r0], #0x04
> +     ldrt    r7, [r0], #0x04
> +     ldrt    ip, [r0], #0x04         /* Carried over to the next round */
> +     orr     r4, r4, r5, lsl #24     /* Low 8 bits of the next word on top */
> +     mov     r5, r5, lsr #8
> +     orr     r5, r5, r6, lsl #24
> +     mov     r6, r6, lsr #8
> +     orr     r6, r6, r7, lsl #24
> +     mov     r7, r7, lsr #8
> +     orr     r7, r7, ip, lsl #24
> +     str     r4, [r1], #0x04
> +     str     r5, [r1], #0x04
> +     str     r6, [r1], #0x04
> +     str     r7, [r1], #0x04
> +.Lcopyin_bad1:
> +     subs    r2, r2, #0x10           /* At least 16 bytes left? */
> +     bge     .Lcopyin_bad1_loop16
> +
> +     adds    r2, r2, #0x10           /* Adjust for extra sub */
> +     popeq   {r4-r7}
> +     moveq   pc, lr                  /* Return now if done */
> +     subs    r2, r2, #0x04           /* At least one whole word left? */
> +     sublt   r0, r0, #0x03           /* Point r0 at the next uncopied byte */
> +     blt     .Lcopyin_l4
> +
> +.Lcopyin_bad1_loop4:
> +     mov     r4, ip, lsr #8
> +     ldrt    ip, [r0], #0x04
> +     subs    r2, r2, #0x04           /* Another whole word after this one? */
> +     orr     r4, r4, ip, lsl #24
> +     str     r4, [r1], #0x04
> +     bge     .Lcopyin_bad1_loop4
> +     sub     r0, r0, #0x03           /* Point r0 at the next uncopied byte */
> +     b       .Lcopyin_l4
> +
> +.Lcopyin_bad2_loop16:
> +     mov     r4, ip, lsr #16         /* Upper 16 bits of the previous word */
> +     ldrt    r5, [r0], #0x04
> +     pld     [r0, #0x018]
> +     ldrt    r6, [r0], #0x04
> +     ldrt    r7, [r0], #0x04
> +     ldrt    ip, [r0], #0x04         /* Carried over to the next round */
> +     orr     r4, r4, r5, lsl #16     /* Low 16 bits of the next word on top */
> +     mov     r5, r5, lsr #16
> +     orr     r5, r5, r6, lsl #16
> +     mov     r6, r6, lsr #16
> +     orr     r6, r6, r7, lsl #16
> +     mov     r7, r7, lsr #16
> +     orr     r7, r7, ip, lsl #16
> +     str     r4, [r1], #0x04
> +     str     r5, [r1], #0x04
> +     str     r6, [r1], #0x04
> +     str     r7, [r1], #0x04
> +.Lcopyin_bad2:
> +     subs    r2, r2, #0x10           /* At least 16 bytes left? */
> +     bge     .Lcopyin_bad2_loop16
> +
> +     adds    r2, r2, #0x10           /* Adjust for extra sub */
> +     popeq   {r4-r7}
> +     moveq   pc, lr                  /* Return now if done */
> +     subs    r2, r2, #0x04           /* At least one whole word left? */
> +     sublt   r0, r0, #0x02           /* Point r0 at the next uncopied byte */
> +     blt     .Lcopyin_l4
> +
> +.Lcopyin_bad2_loop4:
> +     mov     r4, ip, lsr #16
> +     ldrt    ip, [r0], #0x04
> +     subs    r2, r2, #0x04           /* Another whole word after this one? */
> +     orr     r4, r4, ip, lsl #16
> +     str     r4, [r1], #0x04
> +     bge     .Lcopyin_bad2_loop4
> +     sub     r0, r0, #0x02           /* Point r0 at the next uncopied byte */
> +     b       .Lcopyin_l4
> +
> +.Lcopyin_bad3_loop16:
> +     mov     r4, ip, lsr #24         /* Upper 8 bits of the previous word */
> +     ldrt    r5, [r0], #0x04
> +     pld     [r0, #0x018]
> +     ldrt    r6, [r0], #0x04
> +     ldrt    r7, [r0], #0x04
> +     ldrt    ip, [r0], #0x04         /* Carried over to the next round */
> +     orr     r4, r4, r5, lsl #8      /* Low 24 bits of the next word on top */
> +     mov     r5, r5, lsr #24
> +     orr     r5, r5, r6, lsl #8
> +     mov     r6, r6, lsr #24
> +     orr     r6, r6, r7, lsl #8
> +     mov     r7, r7, lsr #24
> +     orr     r7, r7, ip, lsl #8
> +     str     r4, [r1], #0x04
> +     str     r5, [r1], #0x04
> +     str     r6, [r1], #0x04
> +     str     r7, [r1], #0x04
> +.Lcopyin_bad3:
> +     subs    r2, r2, #0x10           /* At least 16 bytes left? */
> +     bge     .Lcopyin_bad3_loop16
> +
> +     adds    r2, r2, #0x10           /* Adjust for extra sub */
> +     popeq   {r4-r7}
> +     moveq   pc, lr                  /* Return now if done */
> +     subs    r2, r2, #0x04           /* At least one whole word left? */
> +     sublt   r0, r0, #0x01           /* Point r0 at the next uncopied byte */
> +     blt     .Lcopyin_l4
> +
> +.Lcopyin_bad3_loop4:
> +     mov     r4, ip, lsr #24
> +     ldrt    ip, [r0], #0x04
> +     subs    r2, r2, #0x04           /* Another whole word after this one? */
> +     orr     r4, r4, ip, lsl #8
> +     str     r4, [r1], #0x04
> +     bge     .Lcopyin_bad3_loop4
> +     sub     r0, r0, #0x01           /* Point r0 at the next uncopied byte */
> +
> +.Lcopyin_l4:
> +     pop     {r4-r7}
> +     mov     r3, #0x00               /* r4-r7 are no longer on the stack */
> +     adds    r2, r2, #0x04           /* Undo the last sub: r2 = bytes left */
> +     moveq   pc, lr
> +.Lcopyin_l4_2:
> +     rsbs    r2, r2, #0x03           /* r2 = ldrbt/strb pairs to skip */
> +     addne   pc, pc, r2, lsl #3      /* Each pair is 8 bytes of code */
> +     nop
> +     ldrbt   ip, [r0], #0x01
> +     strb    ip, [r1], #0x01
> +     ldrbt   ip, [r0], #0x01
> +     strb    ip, [r1], #0x01
> +     ldrbt   ip, [r0]
> +     strb    ip, [r1]
> +     RET
> +END(copyin)
> +
> +
> +/*
> + * copyout: copy r2 bytes from kernel space (r0) to user space (r1).
> + * Returns 0 in r0 on success or when r2 <= 0.  On a fault the stack is
> + * unwound at .Lcopyout_fault and r0 is returned as the fault path left
> + * it (presumably an errno from the abort handler -- confirm).
> + * While copying: r10 = pcb, r11 = saved onfault, r3 = push state.
> + */
> +ENTRY(copyout)
> +     cmp     r2, #0x00
> +     /*
> +      * The guts test the length with signed conditions (blt/bge); a
> +      * length with the top bit set would reach the computed jump at
> +      * .Lcopyout_l4_2 as a negative count, so reject it here too.
> +      */
> +     movle   r0, #0x00
> +     movle   pc, lr                  /* Bail early if length is <= 0 */
> +
> +     push    {r10-r11, lr}
> +
> +     /* Get curcpu from TPIDRPRW. */
> +     mrc     CP15_TPIDRPRW(r10)
> +     ldr     r10, [r10, #CI_CURPCB]  /* r10 = curcpu's CI_CURPCB pointer */
> +
> +     mov     r3, #0x00               /* Guts have nothing pushed yet */
> +     adr     ip, .Lcopyout_fault
> +     ldr     r11, [r10, #PCB_ONFAULT]        /* r11 = previous onfault value */
> +     str     ip, [r10, #PCB_ONFAULT] /* Install .Lcopyout_fault */
> +     bl      .Lcopyout_guts
> +     str     r11, [r10, #PCB_ONFAULT]        /* Put previous value back */
> +     mov     r0, #0x00               /* Copy completed: return 0 */
> +     pop     {r10-r11, pc}
> +
> +.Lcopyout_fault:
> +     str     r11, [r10, #PCB_ONFAULT]        /* Put previous value back */
> +     cmp     r3, #0x00               /* What do the guts still have pushed? */
> +     popgt   {r4-r7}         /* r3 > 0 Restore r4-r7 */
> +     poplt   {r4-r9}         /* r3 < 0 Restore r4-r9 */
> +     pop     {r10-r11, pc}           /* r0 is not written on this path */
> +
> +.Lcopyout_guts:
> +     pld     [r0]
> +     /* Word-align the destination buffer */
> +     ands    ip, r1, #0x03           /* Already word aligned? */
> +     beq     .Lcopyout_wordaligned   /* Yup */
> +     rsb     ip, ip, #0x04           /* ip = bytes needed to align r1 */
> +     cmp     r2, ip                  /* Enough bytes left to align it? */
> +     blt     .Lcopyout_l4_2          /* Nope. Just copy bytewise */
> +     sub     r2, r2, ip
> +     rsbs    ip, ip, #0x03           /* ip = ldrb/strbt pairs to skip */
> +     addne   pc, pc, ip, lsl #3      /* Each pair is 8 bytes of code */
> +     nop                             /* pc above reads as the insn after this */
> +     ldrb    ip, [r0], #0x01
> +     strbt   ip, [r1], #0x01
> +     ldrb    ip, [r0], #0x01
> +     strbt   ip, [r1], #0x01
> +     ldrb    ip, [r0], #0x01
> +     strbt   ip, [r1], #0x01
> +     cmp     r2, #0x00               /* All done? */
> +     moveq   pc, lr
> +
> +     /* Destination buffer is now word aligned */
> +.Lcopyout_wordaligned:
> +     ands    ip, r0, #0x03           /* Is src also word-aligned? */
> +     bne     .Lcopyout_bad_align     /* Nope. Things just got bad */
> +     cmp     r2, #0x08               /* Less than 8 bytes remaining? */
> +     blt     .Lcopyout_w_less_than8
> +
> +     /* Quad-align the destination buffer */
> +     tst     r1, #0x07               /* Already quad aligned? */
> +     ldrne   ip, [r0], #0x04
> +     push    {r4-r9}         /* Free up some registers */
> +     mov     r3, #-1                 /* Signal restore r4-r9 */
> +     tst     r1, #0x07               /* XXX: bug work-around */
> +     subne   r2, r2, #0x04
> +     strtne  ip, [r1], #0x04         /* r3 is already -1 if this faults */
> +
> +     /* Destination buffer quad aligned, source is word aligned */
> +     subs    r2, r2, #0x80           /* Pre-subtract: at least 128 bytes left? */
> +     blt     .Lcopyout_w_lessthan128
> +
> +     /* Copy 128 bytes at a time */
> +.Lcopyout_w_loop128:
> +     ldr     r4, [r0], #0x04         /* LD:00-03 */
> +     ldr     r5, [r0], #0x04         /* LD:04-07 */
> +     pld     [r0, #0x18]             /* Prefetch 0x20 */
> +     ldr     r6, [r0], #0x04         /* LD:08-0b */
> +     ldr     r7, [r0], #0x04         /* LD:0c-0f */
> +     ldr     r8, [r0], #0x04         /* LD:10-13 */
> +     ldr     r9, [r0], #0x04         /* LD:14-17 */
> +     strt    r4, [r1], #0x04         /* ST:00-03 */
> +     strt    r5, [r1], #0x04         /* ST:04-07 */
> +     ldr     r4, [r0], #0x04         /* LD:18-1b */
> +     ldr     r5, [r0], #0x04         /* LD:1c-1f */
> +     strt    r6, [r1], #0x04         /* ST:08-0b */
> +     strt    r7, [r1], #0x04         /* ST:0c-0f */
> +     ldr     r6, [r0], #0x04         /* LD:20-23 */
> +     ldr     r7, [r0], #0x04         /* LD:24-27 */
> +     pld     [r0, #0x18]             /* Prefetch 0x40 */
> +     strt    r8, [r1], #0x04         /* ST:10-13 */
> +     strt    r9, [r1], #0x04         /* ST:14-17 */
> +     ldr     r8, [r0], #0x04         /* LD:28-2b */
> +     ldr     r9, [r0], #0x04         /* LD:2c-2f */
> +     strt    r4, [r1], #0x04         /* ST:18-1b */
> +     strt    r5, [r1], #0x04         /* ST:1c-1f */
> +     ldr     r4, [r0], #0x04         /* LD:30-33 */
> +     ldr     r5, [r0], #0x04         /* LD:34-37 */
> +     strt    r6, [r1], #0x04         /* ST:20-23 */
> +     strt    r7, [r1], #0x04         /* ST:24-27 */
> +     ldr     r6, [r0], #0x04         /* LD:38-3b */
> +     ldr     r7, [r0], #0x04         /* LD:3c-3f */
> +     strt    r8, [r1], #0x04         /* ST:28-2b */
> +     strt    r9, [r1], #0x04         /* ST:2c-2f */
> +     ldr     r8, [r0], #0x04         /* LD:40-43 */
> +     ldr     r9, [r0], #0x04         /* LD:44-47 */
> +     pld     [r0, #0x18]             /* Prefetch 0x60 */
> +     strt    r4, [r1], #0x04         /* ST:30-33 */
> +     strt    r5, [r1], #0x04         /* ST:34-37 */
> +     ldr     r4, [r0], #0x04         /* LD:48-4b */
> +     ldr     r5, [r0], #0x04         /* LD:4c-4f */
> +     strt    r6, [r1], #0x04         /* ST:38-3b */
> +     strt    r7, [r1], #0x04         /* ST:3c-3f */
> +     ldr     r6, [r0], #0x04         /* LD:50-53 */
> +     ldr     r7, [r0], #0x04         /* LD:54-57 */
> +     strt    r8, [r1], #0x04         /* ST:40-43 */
> +     strt    r9, [r1], #0x04         /* ST:44-47 */
> +     ldr     r8, [r0], #0x04         /* LD:58-5b */
> +     ldr     r9, [r0], #0x04         /* LD:5c-5f */
> +     strt    r4, [r1], #0x04         /* ST:48-4b */
> +     strt    r5, [r1], #0x04         /* ST:4c-4f */
> +     ldr     r4, [r0], #0x04         /* LD:60-63 */
> +     ldr     r5, [r0], #0x04         /* LD:64-67 */
> +     pld     [r0, #0x18]             /* Prefetch 0x80 */
> +     strt    r6, [r1], #0x04         /* ST:50-53 */
> +     strt    r7, [r1], #0x04         /* ST:54-57 */
> +     ldr     r6, [r0], #0x04         /* LD:68-6b */
> +     ldr     r7, [r0], #0x04         /* LD:6c-6f */
> +     strt    r8, [r1], #0x04         /* ST:58-5b */
> +     strt    r9, [r1], #0x04         /* ST:5c-5f */
> +     ldr     r8, [r0], #0x04         /* LD:70-73 */
> +     ldr     r9, [r0], #0x04         /* LD:74-77 */
> +     strt    r4, [r1], #0x04         /* ST:60-63 */
> +     strt    r5, [r1], #0x04         /* ST:64-67 */
> +     ldr     r4, [r0], #0x04         /* LD:78-7b */
> +     ldr     r5, [r0], #0x04         /* LD:7c-7f */
> +     strt    r6, [r1], #0x04         /* ST:68-6b */
> +     strt    r7, [r1], #0x04         /* ST:6c-6f */
> +     strt    r8, [r1], #0x04         /* ST:70-73 */
> +     strt    r9, [r1], #0x04         /* ST:74-77 */
> +     subs    r2, r2, #0x80           /* Sets the flags for the bge below */
> +     strt    r4, [r1], #0x04         /* ST:78-7b */
> +     strt    r5, [r1], #0x04         /* ST:7c-7f */
> +     bge     .Lcopyout_w_loop128
> +
> +.Lcopyout_w_lessthan128:
> +     adds    r2, r2, #0x80           /* Adjust for extra sub */
> +     popeq   {r4-r9}
> +     moveq   pc, lr                  /* Return now if done */
> +     subs    r2, r2, #0x20           /* Pre-subtract: at least 32 bytes left? */
> +     blt     .Lcopyout_w_lessthan32
> +
> +     /* Copy 32 bytes at a time */
> +.Lcopyout_w_loop32:
> +     ldr     r4, [r0], #0x04
> +     ldr     r5, [r0], #0x04
> +     pld     [r0, #0x18]
> +     ldr     r6, [r0], #0x04
> +     ldr     r7, [r0], #0x04
> +     ldr     r8, [r0], #0x04
> +     ldr     r9, [r0], #0x04
> +     strt    r4, [r1], #0x04
> +     strt    r5, [r1], #0x04
> +     ldr     r4, [r0], #0x04
> +     ldr     r5, [r0], #0x04
> +     strt    r6, [r1], #0x04
> +     strt    r7, [r1], #0x04
> +     strt    r8, [r1], #0x04
> +     strt    r9, [r1], #0x04
> +     subs    r2, r2, #0x20           /* Sets the flags for the bge below */
> +     strt    r4, [r1], #0x04
> +     strt    r5, [r1], #0x04
> +     bge     .Lcopyout_w_loop32
> +
> +.Lcopyout_w_lessthan32:
> +     adds    r2, r2, #0x20           /* Adjust for extra sub */
> +     popeq   {r4-r9}
> +     moveq   pc, lr                  /* Return now if done */
> +
> +     and     r4, r2, #0x18           /* r4 = bytes to copy in 8-byte steps */
> +     rsb     r5, r4, #0x18           /* r5 = 8-byte steps not needed, in bytes */
> +     subs    r2, r2, r4              /* r2 = bytes left over; Z is used after the pop */
> +     add     pc, pc, r5, lsl #1      /* Skip one 16-byte code block per unused step */
> +     nop
> +
> +     /* At least 24 bytes remaining */
> +     ldr     r4, [r0], #0x04
> +     ldr     r5, [r0], #0x04
> +     strt    r4, [r1], #0x04
> +     strt    r5, [r1], #0x04
> +
> +     /* At least 16 bytes remaining */
> +     ldr     r4, [r0], #0x04
> +     ldr     r5, [r0], #0x04
> +     strt    r4, [r1], #0x04
> +     strt    r5, [r1], #0x04
> +
> +     /* At least 8 bytes remaining */
> +     ldr     r4, [r0], #0x04
> +     ldr     r5, [r0], #0x04
> +     strt    r4, [r1], #0x04
> +     strt    r5, [r1], #0x04
> +
> +     /* Less than 8 bytes remaining */
> +     pop     {r4-r9}
> +     moveq   pc, lr                  /* Return now if done */
> +     mov     r3, #0x00               /* r4-r9 are no longer on the stack */
> +
> +.Lcopyout_w_less_than8:
> +     subs    r2, r2, #0x04           /* At least one whole word left? */
> +     ldrge   ip, [r0], #0x04
> +     strtge  ip, [r1], #0x04
> +     moveq   pc, lr                  /* Return now if done */
> +     addlt   r2, r2, #0x04           /* No word copied: undo the sub */
> +     ldrb    ip, [r0], #0x01         /* 1st trailing byte, always copied */
> +     cmp     r2, #0x02
> +     ldrbge  r2, [r0], #0x01         /* 2nd byte if r2 >= 2 (r2 is reused) */
> +     strbt   ip, [r1], #0x01
> +     ldrbgt  ip, [r0]                /* 3rd byte if r2 > 2 */
> +     strbtge r2, [r1], #0x01
> +     strbtgt ip, [r1]
> +     RET
> +
> +/*
> + * At this point, it has not been possible to word align both buffers.
> + * The destination buffer (r1) is word aligned, but the source buffer
> + * (r0) is not.
> + */
> +.Lcopyout_bad_align:
> +     push    {r4-r7}
> +     mov     r3, #0x01               /* Signal restore r4-r7 */
> +     bic     r0, r0, #0x03           /* Round r0 down to a word boundary */
> +     cmp     ip, #2                  /* ip = r0 & 3 from .Lcopyout_wordaligned */
> +     ldr     ip, [r0], #0x04         /* Prime ip with the first source word */
> +     bgt     .Lcopyout_bad3          /* Source offset 3 */
> +     beq     .Lcopyout_bad2          /* Source offset 2 */
> +     b       .Lcopyout_bad1          /* Source offset 1 */
> +
> +.Lcopyout_bad1_loop16:
> +     mov     r4, ip, lsr #8          /* Upper 24 bits of the previous word */
> +     ldr     r5, [r0], #0x04
> +     pld     [r0, #0x018]
> +     ldr     r6, [r0], #0x04
> +     ldr     r7, [r0], #0x04
> +     ldr     ip, [r0], #0x04         /* Carried over to the next round */
> +     orr     r4, r4, r5, lsl #24     /* Low 8 bits of the next word on top */
> +     mov     r5, r5, lsr #8
> +     orr     r5, r5, r6, lsl #24
> +     mov     r6, r6, lsr #8
> +     orr     r6, r6, r7, lsl #24
> +     mov     r7, r7, lsr #8
> +     orr     r7, r7, ip, lsl #24
> +     strt    r4, [r1], #0x04
> +     strt    r5, [r1], #0x04
> +     strt    r6, [r1], #0x04
> +     strt    r7, [r1], #0x04
> +.Lcopyout_bad1:
> +     subs    r2, r2, #0x10           /* At least 16 bytes left? */
> +     bge     .Lcopyout_bad1_loop16
> +
> +     adds    r2, r2, #0x10           /* Adjust for extra sub */
> +     popeq   {r4-r7}
> +     moveq   pc, lr                  /* Return now if done */
> +     subs    r2, r2, #0x04           /* At least one whole word left? */
> +     sublt   r0, r0, #0x03           /* Point r0 at the next uncopied byte */
> +     blt     .Lcopyout_l4
> +
> +.Lcopyout_bad1_loop4:
> +     mov     r4, ip, lsr #8
> +     ldr     ip, [r0], #0x04
> +     subs    r2, r2, #0x04           /* Another whole word after this one? */
> +     orr     r4, r4, ip, lsl #24
> +     strt    r4, [r1], #0x04
> +     bge     .Lcopyout_bad1_loop4
> +     sub     r0, r0, #0x03           /* Point r0 at the next uncopied byte */
> +     b       .Lcopyout_l4
> +
> +.Lcopyout_bad2_loop16:
> +     mov     r4, ip, lsr #16         /* Upper 16 bits of the previous word */
> +     ldr     r5, [r0], #0x04
> +     pld     [r0, #0x018]
> +     ldr     r6, [r0], #0x04
> +     ldr     r7, [r0], #0x04
> +     ldr     ip, [r0], #0x04         /* Carried over to the next round */
> +     orr     r4, r4, r5, lsl #16     /* Low 16 bits of the next word on top */
> +     mov     r5, r5, lsr #16
> +     orr     r5, r5, r6, lsl #16
> +     mov     r6, r6, lsr #16
> +     orr     r6, r6, r7, lsl #16
> +     mov     r7, r7, lsr #16
> +     orr     r7, r7, ip, lsl #16
> +     strt    r4, [r1], #0x04
> +     strt    r5, [r1], #0x04
> +     strt    r6, [r1], #0x04
> +     strt    r7, [r1], #0x04
> +.Lcopyout_bad2:
> +     subs    r2, r2, #0x10           /* At least 16 bytes left? */
> +     bge     .Lcopyout_bad2_loop16
> +
> +     adds    r2, r2, #0x10           /* Adjust for extra sub */
> +     popeq   {r4-r7}
> +     moveq   pc, lr                  /* Return now if done */
> +     subs    r2, r2, #0x04           /* At least one whole word left? */
> +     sublt   r0, r0, #0x02           /* Point r0 at the next uncopied byte */
> +     blt     .Lcopyout_l4
> +
> +.Lcopyout_bad2_loop4:
> +     mov     r4, ip, lsr #16
> +     ldr     ip, [r0], #0x04
> +     subs    r2, r2, #0x04           /* Another whole word after this one? */
> +     orr     r4, r4, ip, lsl #16
> +     strt    r4, [r1], #0x04
> +     bge     .Lcopyout_bad2_loop4
> +     sub     r0, r0, #0x02           /* Point r0 at the next uncopied byte */
> +     b       .Lcopyout_l4
> +
> +.Lcopyout_bad3_loop16:
> +     mov     r4, ip, lsr #24         /* Upper 8 bits of the previous word */
> +     ldr     r5, [r0], #0x04
> +     pld     [r0, #0x018]
> +     ldr     r6, [r0], #0x04
> +     ldr     r7, [r0], #0x04
> +     ldr     ip, [r0], #0x04         /* Carried over to the next round */
> +     orr     r4, r4, r5, lsl #8      /* Low 24 bits of the next word on top */
> +     mov     r5, r5, lsr #24
> +     orr     r5, r5, r6, lsl #8
> +     mov     r6, r6, lsr #24
> +     orr     r6, r6, r7, lsl #8
> +     mov     r7, r7, lsr #24
> +     orr     r7, r7, ip, lsl #8
> +     strt    r4, [r1], #0x04
> +     strt    r5, [r1], #0x04
> +     strt    r6, [r1], #0x04
> +     strt    r7, [r1], #0x04
> +.Lcopyout_bad3:
> +     subs    r2, r2, #0x10           /* At least 16 bytes left? */
> +     bge     .Lcopyout_bad3_loop16
> +
> +     adds    r2, r2, #0x10           /* Adjust for extra sub */
> +     popeq   {r4-r7}
> +     moveq   pc, lr                  /* Return now if done */
> +     subs    r2, r2, #0x04           /* At least one whole word left? */
> +     sublt   r0, r0, #0x01           /* Point r0 at the next uncopied byte */
> +     blt     .Lcopyout_l4
> +
> +.Lcopyout_bad3_loop4:
> +     mov     r4, ip, lsr #24
> +     ldr     ip, [r0], #0x04
> +     subs    r2, r2, #0x04           /* Another whole word after this one? */
> +     orr     r4, r4, ip, lsl #8
> +     strt    r4, [r1], #0x04
> +     bge     .Lcopyout_bad3_loop4
> +     sub     r0, r0, #0x01           /* Point r0 at the next uncopied byte */
> +
> +.Lcopyout_l4:
> +     pop     {r4-r7}
> +     mov     r3, #0x00               /* r4-r7 are no longer on the stack */
> +     adds    r2, r2, #0x04           /* Undo the last sub: r2 = bytes left */
> +     moveq   pc, lr
> +.Lcopyout_l4_2:
> +     rsbs    r2, r2, #0x03           /* r2 = ldrb/strbt pairs to skip */
> +     addne   pc, pc, r2, lsl #3      /* Each pair is 8 bytes of code */
> +     nop
> +     ldrb    ip, [r0], #0x01
> +     strbt   ip, [r1], #0x01
> +     ldrb    ip, [r0], #0x01
> +     strbt   ip, [r1], #0x01
> +     ldrb    ip, [r0]
> +     strbt   ip, [r1]
> +     RET
> +END(copyout)
> +
> +/*
> + * r0 = kernel space source address
> + * r1 = kernel space destination address
> + * r2 = length
> + *
> + * Copies bytes from kernel space to kernel space, aborting on page fault
> + */
> +ENTRY(kcopy)
> +     cmp     r2, #0x00
> +#if /* XXX or <= 0 like below? */ 1
> +     moveq   r0, #0
> +     moveq   pc, lr
> +#else
> +     movle   r0, #0x00
> +     RETc(le)                        /* Bail early if length is <= 0 */
> +#endif
> +
> +     push    {r10-r11, lr}
> +
> +     /* Get curcpu from TPIDRPRW. */
> +     mrc     CP15_TPIDRPRW(r10)
> +     ldr     r10, [r10, #CI_CURPCB]
> +
> +     mov     r3, #0x00
> +     adr     ip, .Lkcopy_fault
> +     ldr     r11, [r10, #PCB_ONFAULT]
> +     str     ip, [r10, #PCB_ONFAULT]
> +     bl      .Lkcopy_guts
> +     str     r11, [r10, #PCB_ONFAULT]
> +     mov     r0, #0x00
> +     pop     {r10-r11, pc}
> +
> +.Lkcopy_fault:
> +     str     r11, [r10, #PCB_ONFAULT]
> +     cmp     r3, #0x00
> +     popgt   {r4-r7}         /* r3 > 0 Restore r4-r7 */
> +     poplt   {r4-r9}         /* r3 < 0 Restore r4-r9 */
> +     pop     {r10-r11, pc}
> +
> +.Lkcopy_guts:
> +     pld     [r0]
> +     /* Word-align the destination buffer */
> +     ands    ip, r1, #0x03           /* Already word aligned? */
> +     beq     .Lkcopy_wordaligned     /* Yup */
> +     rsb     ip, ip, #0x04
> +     cmp     r2, ip                  /* Enough bytes left to align it? */
> +     blt     .Lkcopy_bad_endgame2    /* Nope. Just copy bytewise */
> +     sub     r2, r2, ip
> +     rsbs    ip, ip, #0x03
> +     addne   pc, pc, ip, lsl #3
> +     nop
> +     ldrb    ip, [r0], #0x01
> +     strb    ip, [r1], #0x01
> +     ldrb    ip, [r0], #0x01
> +     strb    ip, [r1], #0x01
> +     ldrb    ip, [r0], #0x01
> +     strb    ip, [r1], #0x01
> +     cmp     r2, #0x00               /* All done? */
> +     RETc(eq)
> +
> +     /* Destination buffer is now word aligned */
> +.Lkcopy_wordaligned:
> +     ands    ip, r0, #0x03           /* Is src also word-aligned? */
> +     bne     .Lkcopy_bad_align       /* Nope. Things just got bad */
> +     cmp     r2, #0x08               /* Less than 8 bytes remaining? */
> +     blt     .Lkcopy_w_less_than8
> +
> +     /* Quad-align the destination buffer */
> +     tst     r1, #0x07               /* Already quad aligned? */
> +     ldrne   ip, [r0], #0x04
> +     push    {r4-r9}         /* Free up some registers */
> +     mov     r3, #-1                 /* Signal restore r4-r9 */
> +     subne   r2, r2, #0x04
> +     strne   ip, [r1], #0x04
> +
> +     /* Destination buffer quad aligned, source is word aligned */
> +     subs    r2, r2, #0x80
> +     blt     .Lkcopy_w_lessthan128
> +
> +     /* Copy 128 bytes at a time */
> +.Lkcopy_w_loop128:           /* Word loads, doubleword stores, interleaved; r4-r9 are scratch */
> +     ldr     r4, [r0], #0x04         /* LD:00-03 */
> +     ldr     r5, [r0], #0x04         /* LD:04-07 */
> +     pld     [r0, #0x18]             /* Prefetch 0x20 */
> +     ldr     r6, [r0], #0x04         /* LD:08-0b */
> +     ldr     r7, [r0], #0x04         /* LD:0c-0f */
> +     ldr     r8, [r0], #0x04         /* LD:10-13 */
> +     ldr     r9, [r0], #0x04         /* LD:14-17 */
> +     strd    r4, r5, [r1], #0x08     /* ST:00-07 */
> +     ldr     r4, [r0], #0x04         /* LD:18-1b */
> +     ldr     r5, [r0], #0x04         /* LD:1c-1f */
> +     strd    r6, r7, [r1], #0x08     /* ST:08-0f */
> +     ldr     r6, [r0], #0x04         /* LD:20-23 */
> +     ldr     r7, [r0], #0x04         /* LD:24-27 */
> +     pld     [r0, #0x18]             /* Prefetch 0x40 */
> +     strd    r8, r9, [r1], #0x08     /* ST:10-17 */
> +     ldr     r8, [r0], #0x04         /* LD:28-2b */
> +     ldr     r9, [r0], #0x04         /* LD:2c-2f */
> +     strd    r4, r5, [r1], #0x08     /* ST:18-1f */
> +     ldr     r4, [r0], #0x04         /* LD:30-33 */
> +     ldr     r5, [r0], #0x04         /* LD:34-37 */
> +     strd    r6, r7, [r1], #0x08     /* ST:20-27 */
> +     ldr     r6, [r0], #0x04         /* LD:38-3b */
> +     ldr     r7, [r0], #0x04         /* LD:3c-3f */
> +     strd    r8, r9, [r1], #0x08     /* ST:28-2f */
> +     ldr     r8, [r0], #0x04         /* LD:40-43 */
> +     ldr     r9, [r0], #0x04         /* LD:44-47 */
> +     pld     [r0, #0x18]             /* Prefetch 0x60 */
> +     strd    r4, r5, [r1], #0x08     /* ST:30-37 */
> +     ldr     r4, [r0], #0x04         /* LD:48-4b */
> +     ldr     r5, [r0], #0x04         /* LD:4c-4f */
> +     strd    r6, r7, [r1], #0x08     /* ST:38-3f */
> +     ldr     r6, [r0], #0x04         /* LD:50-53 */
> +     ldr     r7, [r0], #0x04         /* LD:54-57 */
> +     strd    r8, r9, [r1], #0x08     /* ST:40-47 */
> +     ldr     r8, [r0], #0x04         /* LD:58-5b */
> +     ldr     r9, [r0], #0x04         /* LD:5c-5f */
> +     strd    r4, r5, [r1], #0x08     /* ST:48-4f */
> +     ldr     r4, [r0], #0x04         /* LD:60-63 */
> +     ldr     r5, [r0], #0x04         /* LD:64-67 */
> +     pld     [r0, #0x18]             /* Prefetch 0x80 */
> +     strd    r6, r7, [r1], #0x08     /* ST:50-57 */
> +     ldr     r6, [r0], #0x04         /* LD:68-6b */
> +     ldr     r7, [r0], #0x04         /* LD:6c-6f */
> +     strd    r8, r9, [r1], #0x08     /* ST:58-5f */
> +     ldr     r8, [r0], #0x04         /* LD:70-73 */
> +     ldr     r9, [r0], #0x04         /* LD:74-77 */
> +     strd    r4, r5, [r1], #0x08     /* ST:60-67 */
> +     ldr     r4, [r0], #0x04         /* LD:78-7b */
> +     ldr     r5, [r0], #0x04         /* LD:7c-7f */
> +     strd    r6, r7, [r1], #0x08     /* ST:68-6f */
> +     strd    r8, r9, [r1], #0x08     /* ST:70-77 */
> +     subs    r2, r2, #0x80           /* Another full 128 bytes left? (strd keeps the flags) */
> +     strd    r4, r5, [r1], #0x08     /* ST:78-7f */
> +     bge     .Lkcopy_w_loop128       /* Yes: go round again */
> +
> +.Lkcopy_w_lessthan128:               /* Fewer than 128 bytes left; r4-r9 are still on the stack */
> +     adds    r2, r2, #0x80           /* Adjust for extra sub */
> +     popeq   {r4-r9}                 /* Nothing left: restore the scratch registers */
> +     moveq   pc, lr                  /* Return now if done */
> +     subs    r2, r2, #0x20           /* Pre-bias the count for the 32-byte loop test */
> +     blt     .Lkcopy_w_lessthan32
> +
> +     /* Copy 32 bytes at a time */
> +.Lkcopy_w_loop32:            /* Same load/store interleave as the 128-byte loop, 32 bytes per pass */
> +     ldr     r4, [r0], #0x04
> +     ldr     r5, [r0], #0x04
> +     pld     [r0, #0x18]             /* Prefetch 0x20 bytes past this pass's start */
> +     ldr     r6, [r0], #0x04
> +     ldr     r7, [r0], #0x04
> +     ldr     r8, [r0], #0x04
> +     ldr     r9, [r0], #0x04
> +     strd    r4, r5, [r1], #0x08
> +     ldr     r4, [r0], #0x04
> +     ldr     r5, [r0], #0x04
> +     strd    r6, r7, [r1], #0x08
> +     strd    r8, r9, [r1], #0x08
> +     subs    r2, r2, #0x20           /* Another full 32 bytes left? (strd keeps the flags) */
> +     strd    r4, r5, [r1], #0x08
> +     bge     .Lkcopy_w_loop32        /* Yes: go round again */
> +
> +.Lkcopy_w_lessthan32:                /* Fewer than 32 bytes left; r4-r9 are still on the stack */
> +     adds    r2, r2, #0x20           /* Adjust for extra sub */
> +     popeq   {r4-r9}                 /* Nothing left: restore the scratch registers */
> +     moveq   pc, lr                  /* Return now if done */
> +
> +     and     r4, r2, #0x18           /* r4 = bytes in whole 8-byte chunks (0, 8, 16 or 24) */
> +     rsb     r5, r4, #0x18           /* r5 = chunk bytes to skip in the ladder below */
> +     subs    r2, r2, r4              /* r2 = leftover 0-7; Z set when nothing remains */
> +     add     pc, pc, r5, lsl #1      /* Each 8-byte chunk below is 16 bytes of code */
> +     nop                             /* Pad: pc reads as two instructions ahead */
> +
> +     /* At least 24 bytes remaining */
> +     ldr     r4, [r0], #0x04
> +     ldr     r5, [r0], #0x04
> +     nop                             /* Keeps the chunk at exactly four instructions */
> +     strd    r4, r5, [r1], #0x08
> +
> +     /* At least 16 bytes remaining */
> +     ldr     r4, [r0], #0x04
> +     ldr     r5, [r0], #0x04
> +     nop                             /* Keeps the chunk at exactly four instructions */
> +     strd    r4, r5, [r1], #0x08
> +
> +     /* At least 8 bytes remaining */
> +     ldr     r4, [r0], #0x04
> +     ldr     r5, [r0], #0x04
> +     nop                             /* Keeps the chunk at exactly four instructions */
> +     strd    r4, r5, [r1], #0x08
> +
> +     /* Less than 8 bytes remaining */
> +     pop     {r4-r9}
> +     moveq   pc, lr                  /* Return now if done */
> +     mov     r3, #0x00               /* r4-r9 are restored; clear the flag set before the bulk loops */
> +
> +.Lkcopy_w_less_than8:                /* Both buffers word aligned, fewer than 8 bytes left */
> +     subs    r2, r2, #0x04           /* At least one whole word left? */
> +     ldrge   ip, [r0], #0x04         /* Yes: copy it */
> +     strge   ip, [r1], #0x04
> +     moveq   pc, lr                  /* Return now if done */
> +     addlt   r2, r2, #0x04           /* No word copied: restore the byte count */
> +     ldrb    ip, [r0], #0x01         /* First trailing byte */
> +     cmp     r2, #0x02               /* ge: two or more left, gt: three left */
> +     ldrbge  r2, [r0], #0x01         /* Second byte; r2 is reused now the count is in the flags */
> +     strb    ip, [r1], #0x01
> +     ldrbgt  ip, [r0]                /* Third byte */
> +     strbge  r2, [r1], #0x01
> +     strbgt  ip, [r1]
> +     RET
> +
> +/*
> + * At this point, it has not been possible to word align both buffers.
> + * The destination buffer (r1) is word aligned, but the source buffer
> + * (r0) is not.
> + */
> +.Lkcopy_bad_align:           /* Entered with ip = r0 & 3 (1-3), the source misalignment */
> +     push    {r4-r7}                 /* Scratch registers for the shift-and-merge loops */
> +     mov     r3, #0x01               /* Flag that r4-r7 are stacked (reader of r3 is outside this section) */
> +     bic     r0, r0, #0x03           /* Round the source down to a word boundary */
> +     cmp     ip, #2                  /* Classify the misalignment */
> +     ldr     ip, [r0], #0x04         /* First aligned word; ldr leaves the flags alone */
> +     bgt     .Lkcopy_bad3            /* Offset 3 */
> +     beq     .Lkcopy_bad2            /* Offset 2 */
> +     b       .Lkcopy_bad1            /* Offset 1 */
> +
> +.Lkcopy_bad1_loop16:         /* Source offset 1: 16 bytes per pass; ip carries the last word loaded */
> +     mov     r4, ip, lsr #8          /* Three not-yet-copied bytes of the carried word */
> +     ldr     r5, [r0], #0x04
> +     pld     [r0, #0x018]
> +     ldr     r6, [r0], #0x04
> +     ldr     r7, [r0], #0x04
> +     ldr     ip, [r0], #0x04         /* Carried into the next pass */
> +     orr     r4, r4, r5, lsl #24     /* Top byte comes from the following word */
> +     mov     r5, r5, lsr #8
> +     orr     r5, r5, r6, lsl #24
> +     mov     r6, r6, lsr #8
> +     orr     r6, r6, r7, lsl #24
> +     mov     r7, r7, lsr #8
> +     orr     r7, r7, ip, lsl #24
> +     str     r4, [r1], #0x04
> +     str     r5, [r1], #0x04
> +     str     r6, [r1], #0x04
> +     str     r7, [r1], #0x04
> +.Lkcopy_bad1:                        /* Loop entry point */
> +     subs    r2, r2, #0x10           /* At least 16 bytes left? */
> +     bge     .Lkcopy_bad1_loop16
> +
> +     adds    r2, r2, #0x10           /* Undo the extra subtract */
> +     popeq   {r4-r7}                 /* Nothing left: restore the scratch registers */
> +     moveq   pc, lr                  /* Return now if done */
> +     subs    r2, r2, #0x04           /* At least one whole word left? */
> +     sublt   r0, r0, #0x03           /* No: point r0 back at the first uncopied byte */
> +     blt     .Lkcopy_bad_endgame
> +
> +.Lkcopy_bad1_loop4:          /* Same merge, one word per pass */
> +     mov     r4, ip, lsr #8
> +     ldr     ip, [r0], #0x04
> +     subs    r2, r2, #0x04           /* Another whole word after this one? */
> +     orr     r4, r4, ip, lsl #24
> +     str     r4, [r1], #0x04
> +     bge     .Lkcopy_bad1_loop4
> +     sub     r0, r0, #0x03           /* Point r0 back at the first uncopied byte */
> +     b       .Lkcopy_bad_endgame
> +
> +.Lkcopy_bad2_loop16:         /* Source offset 2: 16 bytes per pass; ip carries the last word loaded */
> +     mov     r4, ip, lsr #16         /* Two not-yet-copied bytes of the carried word */
> +     ldr     r5, [r0], #0x04
> +     pld     [r0, #0x018]
> +     ldr     r6, [r0], #0x04
> +     ldr     r7, [r0], #0x04
> +     ldr     ip, [r0], #0x04         /* Carried into the next pass */
> +     orr     r4, r4, r5, lsl #16     /* Upper half comes from the following word */
> +     mov     r5, r5, lsr #16
> +     orr     r5, r5, r6, lsl #16
> +     mov     r6, r6, lsr #16
> +     orr     r6, r6, r7, lsl #16
> +     mov     r7, r7, lsr #16
> +     orr     r7, r7, ip, lsl #16
> +     str     r4, [r1], #0x04
> +     str     r5, [r1], #0x04
> +     str     r6, [r1], #0x04
> +     str     r7, [r1], #0x04
> +.Lkcopy_bad2:                        /* Loop entry point */
> +     subs    r2, r2, #0x10           /* At least 16 bytes left? */
> +     bge     .Lkcopy_bad2_loop16
> +
> +     adds    r2, r2, #0x10           /* Undo the extra subtract */
> +     popeq   {r4-r7}                 /* Nothing left: restore the scratch registers */
> +     moveq   pc, lr                  /* Return now if done */
> +     subs    r2, r2, #0x04           /* At least one whole word left? */
> +     sublt   r0, r0, #0x02           /* No: point r0 back at the first uncopied byte */
> +     blt     .Lkcopy_bad_endgame
> +
> +.Lkcopy_bad2_loop4:          /* Same merge, one word per pass */
> +     mov     r4, ip, lsr #16
> +     ldr     ip, [r0], #0x04
> +     subs    r2, r2, #0x04           /* Another whole word after this one? */
> +     orr     r4, r4, ip, lsl #16
> +     str     r4, [r1], #0x04
> +     bge     .Lkcopy_bad2_loop4
> +     sub     r0, r0, #0x02           /* Point r0 back at the first uncopied byte */
> +     b       .Lkcopy_bad_endgame
> +
> +.Lkcopy_bad3_loop16:         /* Source offset 3: 16 bytes per pass; ip carries the last word loaded */
> +     mov     r4, ip, lsr #24         /* One not-yet-copied byte of the carried word */
> +     ldr     r5, [r0], #0x04
> +     pld     [r0, #0x018]
> +     ldr     r6, [r0], #0x04
> +     ldr     r7, [r0], #0x04
> +     ldr     ip, [r0], #0x04         /* Carried into the next pass */
> +     orr     r4, r4, r5, lsl #8      /* Upper three bytes come from the following word */
> +     mov     r5, r5, lsr #24
> +     orr     r5, r5, r6, lsl #8
> +     mov     r6, r6, lsr #24
> +     orr     r6, r6, r7, lsl #8
> +     mov     r7, r7, lsr #24
> +     orr     r7, r7, ip, lsl #8
> +     str     r4, [r1], #0x04
> +     str     r5, [r1], #0x04
> +     str     r6, [r1], #0x04
> +     str     r7, [r1], #0x04
> +.Lkcopy_bad3:                        /* Loop entry point */
> +     subs    r2, r2, #0x10           /* At least 16 bytes left? */
> +     bge     .Lkcopy_bad3_loop16
> +
> +     adds    r2, r2, #0x10           /* Undo the extra subtract */
> +     popeq   {r4-r7}                 /* Nothing left: restore the scratch registers */
> +     moveq   pc, lr                  /* Return now if done */
> +     subs    r2, r2, #0x04           /* At least one whole word left? */
> +     sublt   r0, r0, #0x01           /* No: point r0 back at the first uncopied byte */
> +     blt     .Lkcopy_bad_endgame
> +
> +.Lkcopy_bad3_loop4:          /* Source offset 3: one word per pass; ip carries the last word loaded */
> +     mov     r4, ip, lsr #24         /* Low byte = the one uncopied byte of the carried word (lsr, as in .Lkcopy_bad3_loop16) */
> +     ldr     ip, [r0], #0x04         /* Next aligned source word, carried into the next pass */
> +     subs    r2, r2, #0x04           /* Another whole word after this one? */
> +     orr     r4, r4, ip, lsl #8      /* Upper three bytes come from the new word */
> +     str     r4, [r1], #0x04
> +     bge     .Lkcopy_bad3_loop4
> +     sub     r0, r0, #0x01           /* Point r0 back at the first uncopied byte */
> +
> +.Lkcopy_bad_endgame:         /* Misaligned-source tail: r0 is byte-exact again, r4-r7 still stacked */
> +     pop     {r4-r7}
> +     mov     r3, #0x00               /* r4-r7 are restored; clear the flag set at .Lkcopy_bad_align */
> +     adds    r2, r2, #0x04           /* Undo the extra subtract; r2 = bytes left (0-3) */
> +     moveq   pc, lr                  /* Return now if done */
> +.Lkcopy_bad_endgame2:                /* Bytewise copy of the last 1-3 bytes */
> +     rsbs    r2, r2, #0x03           /* r2 = byte copies to skip (0-2) */
> +     addne   pc, pc, r2, lsl #3      /* Jump into the ladder: 8 code bytes per ldrb/strb pair */
> +     nop                             /* Pad: pc reads as two instructions ahead */
> +     ldrb    ip, [r0], #0x01
> +     strb    ip, [r1], #0x01
> +     ldrb    ip, [r0], #0x01
> +     strb    ip, [r1], #0x01
> +     ldrb    ip, [r0]                /* Last byte: no need to advance the pointers */
> +     strb    ip, [r1]
> +     RET
> +END(kcopy)
> 
> 

Reply via email to