Hi — at worst, on ARMv5TE and above, the hint instructions (such as PLD) execute as NOPs if they are not implemented.
while the Q in link below, does mention Cortex-A8, i believe the "answer" they provide, does somewhat apply 'upwards' :) -Artturi Q: What is the fastest way to copy memory on a Cortex-A8? http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka13544.html diff --git sys/arch/arm/arm/bcopy_page.S sys/arch/arm/arm/bcopy_page.S index 3c8f1ae2ce6..e8ee69bd897 100644 --- sys/arch/arm/arm/bcopy_page.S +++ sys/arch/arm/arm/bcopy_page.S @@ -44,10 +44,6 @@ #include "assym.h" -#ifndef __XSCALE__ - -/* #define BIG_LOOPS */ - /* * bcopy_page(src, dest) * @@ -58,46 +54,24 @@ * r1 - dest address * * Requires: - * number of bytes per page (PAGE_SIZE) is a multiple of 512 (BIG_LOOPS), 128 - * otherwise. + * number of bytes per page (PAGE_SIZE) is a multiple of 512. */ -#define CHUNK_SIZE 32 - -#define PREFETCH_FIRST_CHUNK /* nothing */ -#define PREFETCH_NEXT_CHUNK /* nothing */ - -#ifndef COPY_CHUNK #define COPY_CHUNK \ - PREFETCH_NEXT_CHUNK ; \ + pld [r0, #0x18] ; \ ldmia r0!, {r3-r8,ip,lr} ; \ stmia r1!, {r3-r8,ip,lr} -#endif /* ! COPY_CHUNK */ - -#ifndef SAVE_REGS -#define SAVE_REGS stmfd sp!, {r4-r8, lr} -#define RESTORE_REGS ldmfd sp!, {r4-r8, pc} -#endif ENTRY(bcopy_page) - PREFETCH_FIRST_CHUNK - SAVE_REGS -#ifdef BIG_LOOPS + pld [r0] + stmfd sp!, {r4-r8, lr} mov r2, #(PAGE_SIZE >> 9) -#else - mov r2, #(PAGE_SIZE >> 7) -#endif - 1: COPY_CHUNK COPY_CHUNK COPY_CHUNK COPY_CHUNK -#ifdef BIG_LOOPS - /* There is little point making the loop any larger; unless we are - running with the cache off, the load/store overheads will - completely dominate this loop. */ COPY_CHUNK COPY_CHUNK COPY_CHUNK @@ -112,11 +86,14 @@ ENTRY(bcopy_page) COPY_CHUNK COPY_CHUNK COPY_CHUNK -#endif + + /* There is little point making the loop any larger; unless we are + running with the cache off, the load/store overheads will + completely dominate this loop. */ subs r2, r2, #1 bne 1b - RESTORE_REGS /* ...and return. */ + ldmfd sp!, {r4-r8, pc} /* ...and return. 
*/ /* * bzero_page(dest) @@ -127,17 +104,12 @@ ENTRY(bcopy_page) * r0 - dest address * * Requires: - * number of bytes per page (PAGE_SIZE) is a multiple of 512 (BIG_LOOPS), 128 - * otherwise + * number of bytes per page (PAGE_SIZE) is a multiple of 512. */ ENTRY(bzero_page) stmfd sp!, {r4-r8, lr} -#ifdef BIG_LOOPS mov r2, #(PAGE_SIZE >> 9) -#else - mov r2, #(PAGE_SIZE >> 7) -#endif mov r3, #0 mov r4, #0 mov r5, #0 @@ -153,10 +125,6 @@ ENTRY(bzero_page) stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} -#ifdef BIG_LOOPS - /* There is little point making the loop any larger; unless we are - running with the cache off, the load/store overheads will - completely dominate this loop. */ stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} @@ -172,105 +140,7 @@ ENTRY(bzero_page) stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} -#endif - subs r2, r2, #1 bne 1b ldmfd sp!, {r4-r8, pc} - -#else /* __XSCALE__ */ - -/* - * XSCALE version of bcopy_page - */ -ENTRY(bcopy_page) - pld [r0] - stmfd sp!, {r4, r5} - mov ip, #32 - ldr r2, [r0], #0x04 /* 0x00 */ - ldr r3, [r0], #0x04 /* 0x04 */ -1: pld [r0, #0x18] /* Prefetch 0x20 */ - ldr r4, [r0], #0x04 /* 0x08 */ - ldr r5, [r0], #0x04 /* 0x0c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x10 */ - ldr r3, [r0], #0x04 /* 0x14 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x18 */ - ldr r5, [r0], #0x04 /* 0x1c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x20 */ - ldr r3, [r0], #0x04 /* 0x24 */ - pld [r0, #0x18] /* Prefetch 0x40 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x28 */ - ldr r5, [r0], #0x04 /* 0x2c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x30 */ - ldr r3, [r0], #0x04 /* 0x34 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x38 */ - ldr r5, [r0], #0x04 /* 0x3c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x40 */ - ldr r3, [r0], #0x04 /* 0x44 */ - pld [r0, #0x18] /* Prefetch 0x60 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x48 */ - ldr r5, [r0], #0x04 /* 
0x4c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x50 */ - ldr r3, [r0], #0x04 /* 0x54 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x58 */ - ldr r5, [r0], #0x04 /* 0x5c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x60 */ - ldr r3, [r0], #0x04 /* 0x64 */ - pld [r0, #0x18] /* Prefetch 0x80 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x68 */ - ldr r5, [r0], #0x04 /* 0x6c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x70 */ - ldr r3, [r0], #0x04 /* 0x74 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x78 */ - ldr r5, [r0], #0x04 /* 0x7c */ - strd r2, [r1], #0x08 - subs ip, ip, #0x01 - ldrgt r2, [r0], #0x04 /* 0x80 */ - ldrgt r3, [r0], #0x04 /* 0x84 */ - strd r4, [r1], #0x08 - bgt 1b - ldmfd sp!, {r4, r5} - mov pc, lr - -/* - * XSCALE version of bzero_page - */ -ENTRY(bzero_page) - mov r1, #PAGE_SIZE - mov r2, #0 - mov r3, #0 -1: strd r2, [r0], #8 /* 32 */ - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 /* 64 */ - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 /* 96 */ - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 /* 128 */ - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 - subs r1, r1, #128 - bne 1b - mov pc, lr -#endif /* __XSCALE__ */