Module Name:	src
Committed By:	matt
Date:		Wed Aug 29 18:37:14 UTC 2012
Modified Files:
	src/sys/arch/arm/arm: cpufunc.c cpufunc_asm_armv7.S

Log Message:
Recode armv7_dcache_wbinv_all in asm.  Add armv7_dcache_inv_all and
armv7_icache_inv_all as well.  Use dsb/dmb/isb instructions.


To generate a diff of this commit:
cvs rdiff -u -r1.112 -r1.113 src/sys/arch/arm/arm/cpufunc.c
cvs rdiff -u -r1.2 -r1.3 src/sys/arch/arm/arm/cpufunc_asm_armv7.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
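The recoded routines below walk each cache level by set/way, just as the
C routine being removed did.  For reference, the set/way operand that the
clean/invalidate-by-set/way MCRs take can be sketched in C from the CCSIDR
fields (a minimal sketch; the helper name is illustrative and not part of
this commit, and like the removed C code it assumes __builtin_clz is never
given 0, i.e. the cache is at least 2-way associative):

	#include <stdint.h>

	/*
	 * Illustrative only: build the operand for a clean/invalidate by
	 * set/way, e.g. mcr p15, 0, <sw>, c7, c14, 2 (DCCISW).
	 */
	static inline uint32_t
	armv7_setway(unsigned level, unsigned way, unsigned set, uint32_t ccsidr)
	{
		unsigned assoc = (ccsidr >> 3) & 0x3ff;		/* ways - 1 */
		unsigned wayshift = __builtin_clz(assoc);	/* way sits in the MSBs */
		unsigned setshift = (ccsidr & 7) + 4;		/* log2(line size in bytes) */

		return (way << wayshift) | (set << setshift) | (level << 1);
	}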
Modified files:

Index: src/sys/arch/arm/arm/cpufunc.c
diff -u src/sys/arch/arm/arm/cpufunc.c:1.112 src/sys/arch/arm/arm/cpufunc.c:1.113
--- src/sys/arch/arm/arm/cpufunc.c:1.112	Wed Aug 29 18:29:04 2012
+++ src/sys/arch/arm/arm/cpufunc.c	Wed Aug 29 18:37:14 2012
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpufunc.c,v 1.112 2012/08/29 18:29:04 matt Exp $	*/
+/*	$NetBSD: cpufunc.c,v 1.113 2012/08/29 18:37:14 matt Exp $	*/

 /*
  * arm7tdmi support code Copyright (c) 2001 John Fremlin
@@ -49,7 +49,7 @@
  */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: cpufunc.c,v 1.112 2012/08/29 18:29:04 matt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cpufunc.c,v 1.113 2012/08/29 18:37:14 matt Exp $");

 #include "opt_compat_netbsd.h"
 #include "opt_cpuoptions.h"
@@ -2835,53 +2835,6 @@ armv7_setup(char *args)
 	curcpu()->ci_ctrl = cpuctrl;
 	cpu_control(0xffffffff, cpuctrl);
 }
-
-/* Clean the data cache to the level of coherency. Slow. */
-void
-armv7_dcache_wbinv_all(void)
-{
-	u_int clidr, loc, level;
-
-	/* Cache Level ID Register */
-	__asm volatile("mrc\tp15, 1, %0, c0, c0, 1" : "=r" (clidr));
-
-	loc = (clidr >> 24) & 7;	/* Level of Coherency */
-
-	for (level = 0; level <= loc; level++) {
-		u_int ctype, csid;
-		int line_size, ways, nsets, wayshift, setshift;
-
-		ctype = (clidr >> (level * 3)) & 7;
-		/* We're supposed to stop when ctype == 0, but we
-		 * trust that loc isn't larger than necesssary. */
-		if (ctype < 2) continue;	/* no cache / only icache */
-
-		csid = get_cachesize_cp15(level << 1);
-		line_size = CPU_CSID_LEN(csid);
-		ways = CPU_CSID_ASSOC(csid);
-		nsets = (csid >> 13) & 0x7fff;
-
-		wayshift = __builtin_clz(ways);	/* leading zeros */
-		setshift = line_size + 4;
-
-		for (; nsets >= 0; nsets--) {
-			int way;
-
-			for (way = ways; way >= 0; way--) {
-				/* Clean by set/way */
-				const u_int sw = (way << wayshift)
-				    | (nsets << setshift)
-				    | (level << 1);
-
-				__asm volatile("mcr\tp15, 0, %0, c7, c10, 2"
-				    :: "r"(sw));
-			}
-		}
-	}
-
-	__asm volatile("dsb");
-	__asm volatile("isb");
-}
 #endif	/* CPU_CORTEX */

Index: src/sys/arch/arm/arm/cpufunc_asm_armv7.S
diff -u src/sys/arch/arm/arm/cpufunc_asm_armv7.S:1.2 src/sys/arch/arm/arm/cpufunc_asm_armv7.S:1.3
--- src/sys/arch/arm/arm/cpufunc_asm_armv7.S:1.2	Sat Jun 19 19:44:57 2010
+++ src/sys/arch/arm/arm/cpufunc_asm_armv7.S	Wed Aug 29 18:37:14 2012
@@ -31,50 +31,55 @@
 #include <machine/cpu.h>
 #include <machine/asm.h>

-#define entrysize #32
-
 	.arch	armv7a
-
 ENTRY(armv7_cpu_sleep)
-	tst	r0, #0x00000000		@shouldn't sleep 0
-	wfi
-	RET
+	tst	r0, #0x00000000		@ shouldn't sleep 0
+	wfene				@ this can cheaper when doing MP
+	bx	lr
 END(armv7_cpu_sleep)

 ENTRY(armv7_wait)
-	mrc	p15, 0, r0, c2, c0, 0	@arbitrary read of CP15
-	add	r0, r0, #0		@a stall
-	RET
+	mrc	p15, 0, r0, c2, c0, 0	@ arbitrary read of CP15
+	add	r0, r0, #0		@ a stall
+	bx	lr
 END(armv7_wait)

 ENTRY(armv7_context_switch)
-	mcr	p15, 0, r0, c7, c10, 4	@drain the write buffer
-	mcr	p15, 0, r0, c2, c0, 0	@set the new TTB
-	mcr	p15, 0, r0, c8, c7, 0	@flush the I+D
-	RET
+	dsb				@ data synchronization barrier
+	mcr	p15, 0, r0, c2, c0, 0	@ set the new TTB
+#ifdef MULTIPROCESSOR
+	mcr	p15, 0, r0, c8, c3, 0	@ flush I+D tlb single entry
+#else
+	mcr	p15, 0, r0, c8, c7, 0	@ flush the I+D
+#endif
+	dsb
+	isb
+	bx	lr
 END(armv7_context_switch)

 ENTRY(armv7_tlb_flushID_SE)
-	mcr	p15, 0, r0, c8, c7, 1	@flush I+D tlb single entry
-	mcr	p15, 0, r0, c7, c10, 4	@drain write buffer
-	RET
+#ifdef MULTIPROCESSOR
+	mcr	p15, 0, r0, c8, c3, 1	@ flush I+D tlb single entry
+#else
+	mcr	p15, 0, r0, c8, c7, 1	@ flush I+D tlb single entry
+#endif
+	dsb				@ data synchronization barrier
+	isb
+	bx	lr
 END(armv7_tlb_flushID_SE)

 ENTRY(armv7_setttb)
-/* Does this even exist on armv7? */
-#ifdef PMAP_CACHE_VIVT
-	stmdb	sp!, {r0, lr}
-	bl	_C_LABEL(armv7_idcache_wbinv_all) @clean the D cache
-	ldmia	sp!, {r0, lr}
+	mcr	p15, 0, r0, c2, c0, 0	@ load new TTB
+#ifdef MULTIPROCESSOR
+	mcr	p15, 0, r0, c8, c3, 0	@ invalidate all I+D TLBs
+#else
+	mcr	p15, 0, r0, c8, c7, 0	@ invalidate all I+D TLBs
 #endif
-
-	mcr	p15, 0, r0, c2, c0, 0	@load new TTB
-	mcr	p15, 0, r0, c8, c7, 0	@invalidate I+D TLBs
-	mcr	p15, 0, r0, c7, c10, 4	@drain the write buffer
-
-	RET
+	dsb				@ data synchronization barrier
+	isb
+	bx	lr
 END(armv7_setttb)

 /* Cache operations. */
@@ -82,14 +87,20 @@ END(armv7_setttb)

 /* LINTSTUB: void armv7_icache_sync_range(vaddr_t, vsize_t); */
 ENTRY_NP(armv7_icache_sync_range)
 1:
-	mcr	p15, 0, r0, c7, c5, 1	@invalidate the I-Cache line
-	mcr	p15, 0, r0, c7, c10, 1	@wb the D-Cache line
-	add	r0, r0, entrysize
-	subs	r1, r1, entrysize
+	mcr	p15, 0, r0, c7, c5, 1	@ invalidate the I-Cache line
+	mcr	p15, 0, r0, c7, c10, 1	@ wb the D-Cache line
+	mrc	p15, 1, r2, c0, c0, 0	@ read CCSIDR
+	and	r2, r2, #7		@ get line size (log2(size)-4)
+	add	r2, r2, #4		@ adjust
+	mov	ip, #1			@ make a bit mask
+	lsl	r2, ip, r2		@ and shift into position
+	add	r0, r0, r2
+	subs	r1, r1, r2
 	bhi	1b
-	mcr	p15, 0, r0, c7, c10, 4	@drain the write buffer, BSB
-	RET
+	dsb				@ data synchronization barrier
+	isb
+	bx	lr
 END(armv7_icache_sync_range)

 /* LINTSTUB: void armv7_icache_sync_all(void); */
@@ -102,56 +113,80 @@ ENTRY_NP(armv7_icache_sync_all)
 	stmdb	sp!, {r0, lr}
 	bl	_C_LABEL(armv7_idcache_wbinv_all) @clean the D cache
 	ldmia	sp!, {r0, lr}
-	mcr	p15, 0, r0, c7, c10, 4	@drain the write buffer, BSB
-	RET
+	dsb				@ data synchronization barrier
+	isb
+	bx	lr
 END(armv7_icache_sync_all)

 ENTRY(armv7_dcache_wb_range)
 1:
-	mcr	p15, 0, r0, c7, c10, 1	@wb the D-Cache
-	add	r0, r0, entrysize
-	subs	r1, r1, entrysize
+	mcr	p15, 0, r0, c7, c10, 1	@ wb the D-Cache
+	mrc	p15, 1, r2, c0, c0, 0	@ read CCSIDR
+	and	r2, r2, #7		@ get line size (log2(size)-4)
+	add	r2, r2, #4		@ adjust
+	mov	ip, #1			@ make a bit mask
+	lsl	r2, ip, r2		@ and shift into position
+	add	r0, r0, r2
+	subs	r1, r1, r2
 	bhi	1b
-	mcr	p15, 0, r0, c7, c10, 4	@drain the write buffer, BSB
-	RET
+	dsb				@ data synchronization barrier
+	bx	lr
 END(armv7_dcache_wb_range)

 /* LINTSTUB: void armv7_dcache_wbinv_range(vaddr_t, vsize_t); */
 ENTRY(armv7_dcache_wbinv_range)
 1:
-	mcr	p15, 0, r0, c7, c14, 1	@wb and inv the D-Cache line
-	add	r0, r0, entrysize
-	subs	r1, r1, entrysize
+	mcr	p15, 0, r0, c7, c14, 1	@ wb and inv the D-Cache line
+	mrc	p15, 1, r2, c0, c0, 0	@ read CCSIDR
+	and	r2, r2, #7		@ get line size (log2(size)-4)
+	add	r2, r2, #4		@ adjust
+	mov	ip, #1			@ make a bit mask
+	lsl	r2, ip, r2		@ and shift into position
+	add	r0, r0, r2
+	subs	r1, r1, r2
 	bhi	1b
-	mcr	p15, 0, r0, c7, c10, 4	@drain the write buffer, BSB
-	RET
+	dsb				@ data synchronization barrier
+	bx	lr
 END(armv7_dcache_wbinv_range)

 /*
  * LINTSTUB: void armv7_dcache_inv_range(vaddr_t, vsize_t);
  */
 ENTRY(armv7_dcache_inv_range)
 1:
-	mcr	p15, 0, r0, c7, c6, 1	@invalidate the D-Cache line
-	add	r0, r0, entrysize
-	subs	r1, r1, entrysize
+	mcr	p15, 0, r0, c7, c6, 1	@ invalidate the D-Cache line
+	mrc	p15, 1, r2, c0, c0, 0	@ read CCSIDR
+	and	r2, r2, #7		@ get line size (log2(size)-4)
+	add	r2, r2, #4		@ adjust
+	mov	ip, #1			@ make a bit mask
+	lsl	r2, ip, r2		@ and shift into position
+	add	r0, r0, r2
+	subs	r1, r1, r2
 	bhi	1b
-	mcr	p15, 0, r0, c7, c10, 4	@drain the write buffer, BSB
-	RET
+	dsb				@ data synchronization barrier
+	bx	lr
 END(armv7_dcache_inv_range)

+/*
+ * LINTSTUB: void armv7_idcache_wbinv_range(vaddr_t, vsize_t);
+ */
 ENTRY(armv7_idcache_wbinv_range)
 1:
-	mcr	p15, 0, r0, c7, c5, 1	@invalidate the I-Cache line
-	mcr	p15, 0, r0, c7, c14, 1	@wb and inv the D-Cache line
-	add	r0, r0, entrysize
-	subs	r1, r1, entrysize
+	mcr	p15, 0, r0, c7, c5, 1	@ invalidate the I-Cache line
+	mcr	p15, 0, r0, c7, c14, 1	@ wb and inv the D-Cache line
+	mrc	p15, 1, r2, c0, c0, 0	@ read CCSIDR
+	and	r2, r2, #7		@ get line size (log2(size)-4)
+	add	r2, r2, #4		@ adjust
+	mov	ip, #1			@ make a bit mask
+	lsl	r2, ip, r2		@ and shift into position
+	add	r0, r0, r2
+	subs	r1, r1, r2
 	bhi	1b
-	mcr	p15, 0, r0, c7, c10, 4	@drain the write buffer, BSB
-	RET
+	dsb				@ data synchronization barrier
+	isb
+	bx	lr
 END(armv7_idcache_wbinv_range)

+/*
+ * LINTSTUB: void armv7_idcache_wbinv_all(void);
+ */
 ENTRY_NP(armv7_idcache_wbinv_all)
 	/*
 	 * We assume that the code here can never be out of sync with the
@@ -164,6 +199,169 @@ ENTRY_NP(armv7_idcache_wbinv_all)
 END(armv7_idcache_wbinv_all)

 /*
- * armv7_dcache_wbinv_all is in cpufunc.c.  It's really too long to
- * write in assembler.
+ * These work very hard to not push registers onto the stack and to limit themselves
+ * to use r0-r3 and ip.
  */
+
+/*
+ * LINTSTUB: void armv7_icache_inv_all(void);
+ */
+ENTRY_NP(armv7_icache_inv_all)
+	mov	r0, #0
+	mcr	p15, 2, r0, c0, c0, 0	@ set cache level to L1
+	mrc	p15, 1, r0, c0, c0, 0	@ read CCSIDR
+
+	ubfx	r2, r0, #13, #15	@ get num sets - 1 from CCSIDR
+	ubfx	r3, r0, #3, #10		@ get numways - 1 from CCSIDR
+	clz	r1, r3			@ number of bits to MSB of way
+	lsl	r3, r3, r1		@ shift into position
+	mov	ip, #1			@
+	lsl	ip, ip, r1		@ ip now contains the way decr
+
+	ubfx	r0, r0, #0, #3		@ get linesize from CCSIDR
+	add	r0, r0, #4		@ apply bias
+	lsl	r2, r2, r0		@ shift sets by log2(linesize)
+	add	r3, r3, r2		@ merge numsets - 1 with numways - 1
+	sub	ip, ip, r2		@ subtract numsets - 1 from way decr
+	mov	r1, #1
+	lsl	r1, r1, r0		@ r1 now contains the set decr
+	mov	r2, ip			@ r2 now contains set way decr
+
+	/* r3 = ways/sets, r2 = way decr, r1 = set decr, r0 and ip are free */
+1:	mcr	p15, 0, r3, c7, c6, 2	@ invalidate line
+	movs	r0, r3			@ get current way/set
+	beq	2f			@ at 0 means we are done.
+	movs	r0, r0, lsl #10		@ clear way bits leaving only set bits
+	subne	r3, r3, r1		@ non-zero?, decrement set #
+	subeq	r3, r3, r2		@ zero?, decrement way # and restore set count
+	b	1b
+
+2:	dsb				@ wait for stores to finish
+	mov	r0, #0			@ and ...
+	mcr	p15, 0, r0, c7, c5, 0	@ invalidate L1 cache
+	isb				@ instruction sync barrier
+	bx	lr			@ return
+END(armv7_icache_inv_all)
+
+/*
+ * LINTSTUB: void armv7_dcache_inv_all(void);
+ */
+ENTRY_NP(armv7_dcache_inv_all)
+	mrc	p15, 1, r0, c0, c0, 1	@ read CLIDR
+	ands	r3, r0, #0x07000000
+	beq	.Ldone_inv
+	lsr	r3, r3, #23		@ left align loc (low 4 bits)
+
+	mov	r1, #0
+.Lstart_inv:
+	add	r2, r3, r3, lsr #1	@ r2 = level * 3 / 2
+	mov	r1, r0, lsr r2		@ r1 = cache type
+	and	r1, r1, #7
+	cmp	r1, #2			@ is it data or i&d?
+	blt	.Lnext_level_inv	@ nope, skip level
+
+	mcr	p15, 2, r3, c0, c0, 0	@ select cache level
+	isb
+	mrc	p15, 1, r0, c0, c0, 0	@ read CCSIDR
+
+	ubfx	ip, r0, #0, #3		@ get linesize from CCSIDR
+	add	ip, ip, #4		@ apply bias
+	ubfx	r2, r0, #13, #15	@ get numsets - 1 from CCSIDR
+	lsl	r2, r2, ip		@ shift to set position
+	orr	r3, r3, r2		@ merge set into way/set/level
+	mov	r1, #1
+	lsl	r1, r1, ip		@ r1 = set decr
+
+	ubfx	ip, r0, #3, #10		@ get numways - 1 from [to be discarded] CCSIDR
+	clz	r2, ip			@ number of bits to MSB of way
+	lsl	ip, ip, r2		@ shift by that into way position
+	mov	r0, #1			@
+	lsl	r2, r0, r2		@ r2 now contains the way decr
+	mov	r0, r3			@ get sets/level (no way yet)
+	orr	r3, r3, ip		@ merge way into way/set/level
+	bfc	r0, #0, #4		@ clear low 4 bits (level) to get numset - 1
+	sub	r2, r2, r0		@ subtract from way decr
+
+	/* r3 = ways/sets/level, r2 = way decr, r1 = set decr, r0 and ip are free */
+1:	mcr	p15, 0, r3, c7, c6, 2	@ invalidate line
+	cmp	r3, #15			@ are we done with this level (way/set == 0)
+	bls	.Lnext_level_inv	@ yes, go to next level
+	lsl	r0, r3, #10		@ clear way bits leaving only set/level bits
+	lsr	r0, r0, #4		@ clear level bits leaving only set bits
+	subne	r3, r3, r1		@ non-zero?, decrement set #
+	subeq	r3, r3, r2		@ zero?, decrement way # and restore set count
+	b	1b
+
+.Lnext_level_inv:
+	mrc	p15, 1, r0, c0, c0, 1	@ read CLIDR
+	and	ip, r0, #0x07000000	@ narrow to LoC
+	lsr	ip, ip, #23		@ left align LoC (low 4 bits)
+	add	r3, r3, #2		@ go to next level
+	cmp	r3, ip			@ compare
+	blt	.Lstart_inv		@ not done, next level (r0 == CLIDR)
+
+.Ldone_inv:
+	mov	r0, #0			@ default back to cache level 0
+	mcr	p15, 2, r0, c0, c0, 0	@ select cache level
+	dsb
+	isb
+	bx	lr
+END(armv7_dcache_inv_all)
+
+/*
+ * LINTSTUB: void armv7_dcache_wbinv_all(void);
+ */
+ENTRY_NP(armv7_dcache_wbinv_all)
+	mrc	p15, 1, r0, c0, c0, 1	@ read CLIDR
+	ands	r3, r0, #0x07000000
+	beq	.Ldone_wbinv
+	lsr	r3, r3, #23		@ left align loc (low 4 bits)
+
+	mov	r1, #0
+.Lstart_wbinv:
+	add	r2, r3, r3, lsr #1	@ r2 = level * 3 / 2
+	mov	r1, r0, lsr r2		@ r1 = cache type
+	bfc	r1, #3, #28
+	cmp	r1, #2			@ is it data or i&d?
+	blt	.Lnext_level_wbinv	@ nope, skip level
+
+	mcr	p15, 2, r3, c0, c0, 0	@ select cache level
+	isb
+	mrc	p15, 1, r0, c0, c0, 0	@ read CCSIDR
+
+	ubfx	ip, r0, #0, #3		@ get linesize from CCSIDR
+	add	ip, ip, #4		@ apply bias
+	ubfx	r2, r0, #13, #15	@ get numsets - 1 from CCSIDR
+	lsl	r2, r2, ip		@ shift to set position
+	orr	r3, r3, r2		@ merge set into way/set/level
+	mov	r1, #1
+	lsl	r1, r1, ip		@ r1 = set decr
+
+	ubfx	ip, r0, #3, #10		@ get numways - 1 from [to be discarded] CCSIDR
+	clz	r2, ip			@ number of bits to MSB of way
+	lsl	ip, ip, r2		@ shift by that into way position
+	mov	r0, #1			@
+	lsl	r2, r0, r2		@ r2 now contains the way decr
+	mov	r0, r3			@ get sets/level (no way yet)
+	orr	r3, r3, ip		@ merge way into way/set/level
+	bfc	r0, #0, #4		@ clear low 4 bits (level) to get numset - 1
+	sub	r2, r2, r0		@ subtract from way decr
+
+	/* r3 = ways/sets/level, r2 = way decr, r1 = set decr, r0 and ip are free */
+1:	mcr	p15, 0, r3, c7, c14, 2	@ writeback and invalidate line
+	cmp	r3, #15			@ are we done with this level (way/set == 0)
+	bls	.Lnext_level_wbinv	@ yes, go to next level
+	lsl	r0, r3, #10		@ clear way bits leaving only set/level bits
+	lsr	r0, r0, #4		@ clear level bits leaving only set bits
+	subne	r3, r3, r1		@ non-zero?, decrement set #
+	subeq	r3, r3, r2		@ zero?, decrement way # and restore set count
+	b	1b
+
+.Lnext_level_wbinv:
+	mrc	p15, 1, r0, c0, c0, 1	@ read CLIDR
+	and	ip, r0, #0x07000000	@ narrow to LoC
+	lsr	ip, ip, #23		@ left align LoC (low 4 bits)
+	add	r3, r3, #2		@ go to next level
+	cmp	r3, ip			@ compare
+	blt	.Lstart_wbinv		@ not done, next level (r0 == CLIDR)
+
+.Ldone_wbinv:
+	mov	r0, #0			@ default back to cache level 0
+	mcr	p15, 2, r0, c0, c0, 0	@ select cache level
+	dsb
+	isb
+	bx	lr
+END(armv7_dcache_wbinv_all)
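
For reference, the per-line stride that the recoded range operations now
recompute from CCSIDR on each loop iteration corresponds to this C
expression (a sketch only: it assumes CSSELR has already been written to
select the cache of interest, and the function name is illustrative, not
part of this commit):

	#include <stdint.h>

	/* CCSIDR[2:0] holds log2(line size in bytes) - 4. */
	static inline uint32_t
	armv7_cache_line_size(void)
	{
		uint32_t ccsidr;

		/* Read CCSIDR for the cache currently selected by CSSELR. */
		__asm volatile("mrc\tp15, 1, %0, c0, c0, 0" : "=r"(ccsidr));
		return 1U << ((ccsidr & 7) + 4);
	}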