Hi — at worst, on ARMv5TE and above, the hint instructions (such as PLD) execute as NOPs if they are not implemented.
while the Q in link below, does mention Cortex-A8, i believe the "answer" they provide, does somewhat apply 'upwards' :) -Artturi Q: What is the fastest way to copy memory on a Cortex-A8? http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka13544.html diff --git sys/arch/arm/arm/bcopy_page.S sys/arch/arm/arm/bcopy_page.S index 3c8f1ae2ce6..e8ee69bd897 100644 --- sys/arch/arm/arm/bcopy_page.S +++ sys/arch/arm/arm/bcopy_page.S @@ -44,10 +44,6 @@ #include "assym.h" -#ifndef __XSCALE__ - -/* #define BIG_LOOPS */ - /* * bcopy_page(src, dest) * @@ -58,46 +54,24 @@ * r1 - dest address * * Requires: - * number of bytes per page (PAGE_SIZE) is a multiple of 512 (BIG_LOOPS), 128 - * otherwise. + * number of bytes per page (PAGE_SIZE) is a multiple of 512. */ -#define CHUNK_SIZE 32 - -#define PREFETCH_FIRST_CHUNK /* nothing */ -#define PREFETCH_NEXT_CHUNK /* nothing */ - -#ifndef COPY_CHUNK #define COPY_CHUNK \ - PREFETCH_NEXT_CHUNK ; \ + pld [r0, #0x18] ; \ ldmia r0!, {r3-r8,ip,lr} ; \ stmia r1!, {r3-r8,ip,lr} -#endif /* ! COPY_CHUNK */ - -#ifndef SAVE_REGS -#define SAVE_REGS stmfd sp!, {r4-r8, lr} -#define RESTORE_REGS ldmfd sp!, {r4-r8, pc} -#endif ENTRY(bcopy_page) - PREFETCH_FIRST_CHUNK - SAVE_REGS -#ifdef BIG_LOOPS + pld [r0] + stmfd sp!, {r4-r8, lr} mov r2, #(PAGE_SIZE >> 9) -#else - mov r2, #(PAGE_SIZE >> 7) -#endif - 1: COPY_CHUNK COPY_CHUNK COPY_CHUNK COPY_CHUNK -#ifdef BIG_LOOPS - /* There is little point making the loop any larger; unless we are - running with the cache off, the load/store overheads will - completely dominate this loop. */ COPY_CHUNK COPY_CHUNK COPY_CHUNK @@ -112,11 +86,14 @@ ENTRY(bcopy_page) COPY_CHUNK COPY_CHUNK COPY_CHUNK -#endif + + /* There is little point making the loop any larger; unless we are + running with the cache off, the load/store overheads will + completely dominate this loop. */ subs r2, r2, #1 bne 1b - RESTORE_REGS /* ...and return. */ + ldmfd sp!, {r4-r8, pc} /* ...and return. 
*/ /* * bzero_page(dest) @@ -127,17 +104,12 @@ ENTRY(bcopy_page) * r0 - dest address * * Requires: - * number of bytes per page (PAGE_SIZE) is a multiple of 512 (BIG_LOOPS), 128 - * otherwise + * number of bytes per page (PAGE_SIZE) is a multiple of 512. */ ENTRY(bzero_page) stmfd sp!, {r4-r8, lr} -#ifdef BIG_LOOPS mov r2, #(PAGE_SIZE >> 9) -#else - mov r2, #(PAGE_SIZE >> 7) -#endif mov r3, #0 mov r4, #0 mov r5, #0 @@ -153,10 +125,6 @@ ENTRY(bzero_page) stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} -#ifdef BIG_LOOPS - /* There is little point making the loop any larger; unless we are - running with the cache off, the load/store overheads will - completely dominate this loop. */ stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} @@ -172,105 +140,7 @@ ENTRY(bzero_page) stmia r0!, {r3-r8,ip,lr} stmia r0!, {r3-r8,ip,lr} -#endif - subs r2, r2, #1 bne 1b ldmfd sp!, {r4-r8, pc} - -#else /* __XSCALE__ */ - -/* - * XSCALE version of bcopy_page - */ -ENTRY(bcopy_page) - pld [r0] - stmfd sp!, {r4, r5} - mov ip, #32 - ldr r2, [r0], #0x04 /* 0x00 */ - ldr r3, [r0], #0x04 /* 0x04 */ -1: pld [r0, #0x18] /* Prefetch 0x20 */ - ldr r4, [r0], #0x04 /* 0x08 */ - ldr r5, [r0], #0x04 /* 0x0c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x10 */ - ldr r3, [r0], #0x04 /* 0x14 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x18 */ - ldr r5, [r0], #0x04 /* 0x1c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x20 */ - ldr r3, [r0], #0x04 /* 0x24 */ - pld [r0, #0x18] /* Prefetch 0x40 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x28 */ - ldr r5, [r0], #0x04 /* 0x2c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x30 */ - ldr r3, [r0], #0x04 /* 0x34 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x38 */ - ldr r5, [r0], #0x04 /* 0x3c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x40 */ - ldr r3, [r0], #0x04 /* 0x44 */ - pld [r0, #0x18] /* Prefetch 0x60 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x48 */ - ldr r5, [r0], #0x04 /* 
0x4c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x50 */ - ldr r3, [r0], #0x04 /* 0x54 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x58 */ - ldr r5, [r0], #0x04 /* 0x5c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x60 */ - ldr r3, [r0], #0x04 /* 0x64 */ - pld [r0, #0x18] /* Prefetch 0x80 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x68 */ - ldr r5, [r0], #0x04 /* 0x6c */ - strd r2, [r1], #0x08 - ldr r2, [r0], #0x04 /* 0x70 */ - ldr r3, [r0], #0x04 /* 0x74 */ - strd r4, [r1], #0x08 - ldr r4, [r0], #0x04 /* 0x78 */ - ldr r5, [r0], #0x04 /* 0x7c */ - strd r2, [r1], #0x08 - subs ip, ip, #0x01 - ldrgt r2, [r0], #0x04 /* 0x80 */ - ldrgt r3, [r0], #0x04 /* 0x84 */ - strd r4, [r1], #0x08 - bgt 1b - ldmfd sp!, {r4, r5} - mov pc, lr - -/* - * XSCALE version of bzero_page - */ -ENTRY(bzero_page) - mov r1, #PAGE_SIZE - mov r2, #0 - mov r3, #0 -1: strd r2, [r0], #8 /* 32 */ - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 /* 64 */ - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 /* 96 */ - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 /* 128 */ - strd r2, [r0], #8 - strd r2, [r0], #8 - strd r2, [r0], #8 - subs r1, r1, #128 - bne 1b - mov pc, lr -#endif /* __XSCALE__ */