Module Name:    src
Committed By:   matt
Date:           Wed Aug 29 18:37:14 UTC 2012

Modified Files:
        src/sys/arch/arm/arm: cpufunc.c cpufunc_asm_armv7.S

Log Message:
Recode armv7_dcache_wbinv_all in asm.  Add armv7_dcache_inv_all and
armv7_icache_inv_all as well.
Use dsb/dmb/isb barrier instructions.
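
The three barriers differ in strength: dmb orders memory accesses against
each other without stalling execution, dsb additionally waits for all
outstanding accesses (including cache and TLB maintenance) to complete,
and isb flushes the pipeline so later instructions see the effects of
earlier context-changing operations.  A minimal C sketch of wrappers for
these instructions, using GCC-style inline assembly; the names are
illustrative only, not an API this commit defines:

	/* Hedged sketch: barrier wrappers for ARMv7.  The names are
	 * hypothetical; the kernel emits the instructions directly. */
	static inline void arm_dmb(void) { __asm volatile("dmb" ::: "memory"); }
	static inline void arm_dsb(void) { __asm volatile("dsb" ::: "memory"); }
	static inline void arm_isb(void) { __asm volatile("isb" ::: "memory"); }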


To generate a diff of this commit:
cvs rdiff -u -r1.112 -r1.113 src/sys/arch/arm/arm/cpufunc.c
cvs rdiff -u -r1.2 -r1.3 src/sys/arch/arm/arm/cpufunc_asm_armv7.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Index: src/sys/arch/arm/arm/cpufunc.c
diff -u src/sys/arch/arm/arm/cpufunc.c:1.112 src/sys/arch/arm/arm/cpufunc.c:1.113
--- src/sys/arch/arm/arm/cpufunc.c:1.112	Wed Aug 29 18:29:04 2012
+++ src/sys/arch/arm/arm/cpufunc.c	Wed Aug 29 18:37:14 2012
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpufunc.c,v 1.112 2012/08/29 18:29:04 matt Exp $	*/
+/*	$NetBSD: cpufunc.c,v 1.113 2012/08/29 18:37:14 matt Exp $	*/
 
 /*
  * arm7tdmi support code Copyright (c) 2001 John Fremlin
@@ -49,7 +49,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: cpufunc.c,v 1.112 2012/08/29 18:29:04 matt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cpufunc.c,v 1.113 2012/08/29 18:37:14 matt Exp $");
 
 #include "opt_compat_netbsd.h"
 #include "opt_cpuoptions.h"
@@ -2835,53 +2835,6 @@ armv7_setup(char *args)
 	curcpu()->ci_ctrl = cpuctrl;
 	cpu_control(0xffffffff, cpuctrl);
 }
-
-/* Clean the data cache to the level of coherency. Slow. */
-void
-armv7_dcache_wbinv_all(void)
-{
-	u_int clidr, loc, level;
-
-	/* Cache Level ID Register */
-	__asm volatile("mrc\tp15, 1, %0, c0, c0, 1" : "=r" (clidr));
-
-	loc = (clidr >> 24) & 7; /* Level of Coherency */
-
-	for (level = 0; level <= loc; level++) {
-		u_int ctype, csid;
-		int line_size, ways, nsets, wayshift, setshift;
-
-		ctype = (clidr >> (level * 3)) & 7;
-		/* We're supposed to stop when ctype == 0, but we
-	 * trust that loc isn't larger than necessary. */
-		if (ctype < 2) continue; /* no cache / only icache */
-
-		csid = get_cachesize_cp15(level << 1);
-		line_size = CPU_CSID_LEN(csid);
-		ways = CPU_CSID_ASSOC(csid);
-		nsets = (csid >> 13) & 0x7fff;
-
-		wayshift = __builtin_clz(ways); /* leading zeros */
-		setshift = line_size + 4;
-
-		for (; nsets >= 0; nsets--) {
-			int way;
-
-			for (way = ways; way >= 0; way--) {
-				/* Clean by set/way */
-				const u_int sw = (way << wayshift)
-				    | (nsets << setshift)
-				    | (level << 1);
-
-				__asm volatile("mcr\tp15, 0, %0, c7, c10, 2"
-				    :: "r"(sw));
-			}
-		}
-	}
-
-	__asm volatile("dsb");
-	__asm volatile("isb");
-}
 #endif /* CPU_CORTEX */
 
 

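The C routine removed above walks every data/unified cache level up to
the Level of Coherency (CLIDR bits [26:24]) and cleans each line with a
"clean by set/way" (DCCSW) operation.  The operand packs the way index
into the top bits, the set index above the line-offset bits, and the
level into bits [3:1], which is why the code computes wayshift with
__builtin_clz() and setshift as log2 of the line length.  A hedged,
self-contained sketch of that encoding (field layout per the ARMv7
Architecture Reference Manual; the helper name is illustrative):

	#include <stdint.h>

	/*
	 * DC CSW operand: way in bits [31:32-A], set starting at bit L,
	 * level in bits [3:1], where A is the number of bits needed for
	 * the way index and L = log2(line length in bytes).  "assoc" and
	 * "linelen" are the raw CCSIDR fields (associativity - 1 and
	 * log2(line length) - 4).  Like the removed code, this assumes
	 * more than one way, since __builtin_clz(0) is undefined in C.
	 */
	static inline uint32_t
	dccsw_operand(uint32_t way, uint32_t set, uint32_t level,
	    uint32_t assoc, uint32_t linelen)
	{
		uint32_t wayshift = __builtin_clz(assoc);	/* 32 - A */
		uint32_t setshift = linelen + 4;	/* log2(bytes) */

		return (way << wayshift) | (set << setshift) | (level << 1);
	}
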
Index: src/sys/arch/arm/arm/cpufunc_asm_armv7.S
diff -u src/sys/arch/arm/arm/cpufunc_asm_armv7.S:1.2 src/sys/arch/arm/arm/cpufunc_asm_armv7.S:1.3
--- src/sys/arch/arm/arm/cpufunc_asm_armv7.S:1.2	Sat Jun 19 19:44:57 2010
+++ src/sys/arch/arm/arm/cpufunc_asm_armv7.S	Wed Aug 29 18:37:14 2012
@@ -31,50 +31,55 @@
 #include <machine/cpu.h>
 #include <machine/asm.h>
 
-#define entrysize		#32
-
 	.arch	armv7a
 
-
 ENTRY(armv7_cpu_sleep)
-	tst	r0, #0x00000000 	@shouldn't sleep 0
-	wfi
-	RET
+	tst	r0, #0x00000000 	@ shouldn't sleep 0
+	wfene				@ this can be cheaper when doing MP
+	bx	lr
 END(armv7_cpu_sleep)
 
 ENTRY(armv7_wait)
-	mrc	p15, 0, r0, c2, c0, 0	@arbitrary read of CP15
-	add	r0, r0, #0		@a stall
-	RET
+	mrc	p15, 0, r0, c2, c0, 0	@ arbitrary read of CP15
+	add	r0, r0, #0		@ a stall
+	bx	lr
 END(armv7_wait)
 
 ENTRY(armv7_context_switch)
-	mcr	p15, 0, r0, c7, c10, 4  @drain the write buffer
-	mcr	p15, 0, r0, c2, c0, 0 	@set the new TTB
-	mcr	p15, 0, r0, c8, c7, 0	@flush the I+D
-	RET
+	dsb				@ data synchronization barrier
+	mcr	p15, 0, r0, c2, c0, 0 	@ set the new TTB
+#ifdef MULTIPROCESSOR
+	mcr	p15, 0, r0, c8, c3, 0	@ flush I+D tlb single entry
+#else
+	mcr	p15, 0, r0, c8, c7, 0	@ flush the I+D
+#endif
+	dsb
+	isb
+	bx	lr
 END(armv7_context_switch)
 
 ENTRY(armv7_tlb_flushID_SE)
-	mcr	p15, 0, r0, c8, c7, 1	@flush I+D tlb single entry
-	mcr	p15, 0, r0, c7, c10, 4  @drain write buffer
-	RET
+#ifdef MULTIPROCESSOR
+	mcr	p15, 0, r0, c8, c3, 1	@ flush I+D tlb single entry
+#else
+	mcr	p15, 0, r0, c8, c7, 1	@ flush I+D tlb single entry
+#endif
+	dsb				@ data synchronization barrier
+	isb
+	bx	lr
 END(armv7_tlb_flushID_SE)
 
 
 ENTRY(armv7_setttb)
-/* Does this even exist on armv7? */
-#ifdef PMAP_CACHE_VIVT
-	stmdb	sp!, {r0, lr}
-	bl	_C_LABEL(armv7_idcache_wbinv_all) @clean the D cache
-	ldmia	sp!, {r0, lr}
+	mcr	p15, 0, r0, c2, c0, 0   @ load new TTB
+#ifdef MULTIPROCESSOR
+	mcr	p15, 0, r0, c8, c3, 0	@ invalidate all I+D TLBs
+#else
+	mcr	p15, 0, r0, c8, c7, 0   @ invalidate all I+D TLBs
 #endif
-
-	mcr	p15, 0, r0, c2, c0, 0   @load new TTB
-	mcr	p15, 0, r0, c8, c7, 0   @invalidate I+D TLBs
-	mcr	p15, 0, r0, c7, c10, 4  @drain the write buffer
-
-	RET
+	dsb				@ data synchronization barrier
+	isb
+	bx	lr
 END(armv7_setttb)
 
 /* Cache operations. */
@@ -82,14 +87,20 @@ END(armv7_setttb)
 /* LINTSTUB: void armv7_icache_sync_range(vaddr_t, vsize_t); */
 ENTRY_NP(armv7_icache_sync_range)
 1:
-	mcr	p15, 0, r0, c7, c5, 1	@invalidate the I-Cache line
-	mcr	p15, 0, r0, c7, c10, 1	@wb the D-Cache line
-	add	r0, r0, entrysize
-	subs	r1, r1, entrysize
+	mcr	p15, 0, r0, c7, c5, 1	@ invalidate the I-Cache line
+	mcr	p15, 0, r0, c7, c10, 1	@ wb the D-Cache line
+	mrc	p15, 1, r2, c0, c0, 0	@ read CCSIDR
+	and	r2, r2, #7		@ get line size (log2(size)-4)
+	add	r2, r2, #4		@ adjust
+	mov	ip, #1			@ make a bit mask
+	lsl	r2, ip, r2		@ and shift into position
+	add	r0, r0, r2
+	subs	r1, r1, r2
 	bhi	1b
 
-	mcr	p15, 0, r0, c7, c10, 4 	@drain the write buffer, BSB 
-	RET
+	dsb				@ data synchronization barrier
+	isb
+	bx	lr
 END(armv7_icache_sync_range)
 
 /* LINTSTUB: void armv7_icache_sync_all(void); */
@@ -102,56 +113,80 @@ ENTRY_NP(armv7_icache_sync_all)
 	stmdb	sp!, {r0, lr}
 	bl	_C_LABEL(armv7_idcache_wbinv_all) @clean the D cache
 	ldmia	sp!, {r0, lr}
-	mcr	p15, 0, r0, c7, c10, 4  @drain the write buffer, BSB
-	RET
+	dsb				@ data synchronization barrier
+	isb
+	bx	lr
 END(armv7_icache_sync_all)
 
 ENTRY(armv7_dcache_wb_range)
 1:
-	mcr	p15, 0, r0, c7, c10, 1	@wb the D-Cache
-	add	r0, r0, entrysize
-	subs	r1, r1, entrysize
+	mcr	p15, 0, r0, c7, c10, 1	@ wb the D-Cache
+	mrc	p15, 1, r2, c0, c0, 0	@ read CCSIDR
+	and	r2, r2, #7		@ get line size (log2(size)-4)
+	add	r2, r2, #4		@ adjust
+	mov	ip, #1			@ make a bit mask
+	lsl	r2, ip, r2		@ and shift into position
+	add	r0, r0, r2
+	subs	r1, r1, r2
 	bhi	1b
-	mcr	p15, 0, r0, c7, c10, 4  @drain the write buffer, BSB 
-	RET
+	dsb				@ data synchronization barrier
+	bx	lr
 END(armv7_dcache_wb_range)
 
 /* LINTSTUB: void armv7_dcache_wbinv_range(vaddr_t, vsize_t); */
 ENTRY(armv7_dcache_wbinv_range)
 1:
-	mcr	p15, 0, r0, c7, c14, 1	@wb and inv the D-Cache line
-	add	r0, r0, entrysize
-	subs	r1, r1, entrysize
+	mcr	p15, 0, r0, c7, c14, 1	@ wb and inv the D-Cache line
+	mrc	p15, 1, r2, c0, c0, 0	@ read CCSIDR
+	and	r2, r2, #7		@ get line size (log2(size)-4)
+	add	r2, r2, #4		@ adjust
+	mov	ip, #1			@ make a bit mask
+	lsl	r2, ip, r2		@ and shift into position
+	add	r0, r0, r2
+	subs	r1, r1, r2
 	bhi	1b
-	mcr	p15, 0, r0, c7, c10, 4  @drain the write buffer, BSB 
-	RET
+	dsb				@ data synchronization barrier
+	bx	lr
 END(armv7_dcache_wbinv_range)
 
 /* * LINTSTUB: void armv7_dcache_inv_range(vaddr_t, vsize_t); */
 ENTRY(armv7_dcache_inv_range)
 1:
-	mcr	p15, 0, r0, c7, c6, 1	@invalidate the D-Cache line  
-	add	r0, r0, entrysize 
-	subs	r1, r1, entrysize
+	mcr	p15, 0, r0, c7, c6, 1	@ invalidate the D-Cache line  
+	mrc	p15, 1, r2, c0, c0, 0	@ read CCSIDR
+	and	r2, r2, #7		@ get line size (log2(size)-4)
+	add	r2, r2, #4		@ adjust
+	mov	ip, #1			@ make a bit mask
+	lsl	r2, ip, r2		@ and shift into position
+	add	r0, r0, r2 
+	subs	r1, r1, r2
 	bhi	1b
 
-	mcr	p15, 0, r0, c7, c10, 4  @drain the write buffer, BSB 
-	RET
+	dsb				@ data synchronization barrier
+	bx	lr
 END(armv7_dcache_inv_range)
 
 
+/* * LINTSTUB: void armv7_idcache_wbinv_range(vaddr_t, vsize_t); */
 ENTRY(armv7_idcache_wbinv_range)
 1:
-	mcr	p15, 0, r0, c7, c5, 1	@invalidate the I-Cache line
-	mcr	p15, 0, r0, c7, c14, 1 	@wb and inv the D-Cache line
-	add	r0, r0, entrysize
-	subs	r1, r1, entrysize
+	mcr	p15, 0, r0, c7, c5, 1	@ invalidate the I-Cache line
+	mcr	p15, 0, r0, c7, c14, 1 	@ wb and inv the D-Cache line
+	mrc	p15, 1, r2, c0, c0, 0	@ read CCSIDR
+	and	r2, r2, #7		@ get line size (log2(size)-4)
+	add	r2, r2, #4		@ adjust
+	mov	ip, #1			@ make a bit mask
+	lsl	r2, ip, r2		@ and shift into position
+	add	r0, r0, r2
+	subs	r1, r1, r2
 	bhi	1b
 
-	mcr	p15, 0, r0, c7, c10, 4  @drain the write buffer, BSB 
-	RET
+	dsb				@ data synchronization barrier
+	isb
+	bx	lr
 END(armv7_idcache_wbinv_range)
 
+/* * LINTSTUB: void armv7_idcache_wbinv_all(void); */
 ENTRY_NP(armv7_idcache_wbinv_all)
 	/*
 	 * We assume that the code here can never be out of sync with the
@@ -164,6 +199,169 @@ ENTRY_NP(armv7_idcache_wbinv_all)
 END(armv7_idcache_wbinv_all)
 
 /*
- * armv7_dcache_wbinv_all is in cpufunc.c. It's really too long to
- * write in assembler.
+ * These work very hard to not push registers onto the stack and to limit themselves
+ * to use r0-r3 and ip.
  */
+/* * LINTSTUB: void armv7_icache_inv_all(void); */
+ENTRY_NP(armv7_icache_inv_all)
+	mov	r0, #0
+	mcr	p15, 2, r0, c0, c0, 0	@ set cache level to L1
+	mrc	p15, 1, r0, c0, c0, 0	@ read CCSIDR
+
+	ubfx	r2, r0, #13, #15	@ get num sets - 1 from CCSIDR
+	ubfx	r3, r0, #3, #10		@ get numways - 1 from CCSIDR
+	clz	r1, r3			@ number of bits to MSB of way
+	lsl	r3, r3, r1		@ shift into position
+	mov	ip, #1			@ 
+	lsl	ip, ip, r1		@ ip now contains the way decr
+
+	ubfx	r0, r0, #0, #3		@ get linesize from CCSIDR
+	add	r0, r0, #4		@ apply bias
+	lsl	r2, r2, r0		@ shift sets by log2(linesize)
+	add	r3, r3, r2		@ merge numsets - 1 with numways - 1
+	sub	ip, ip, r2		@ subtract numsets - 1 from way decr
+	mov	r1, #1
+	lsl	r1, r1, r0		@ r1 now contains the set decr
+	mov	r2, ip			@ r2 now contains set way decr
+
+	/* r3 = ways/sets, r2 = way decr, r1 = set decr, r0 and ip are free */
+1:	mcr	p15, 0, r3, c7, c6, 2	@ invalidate line
+	movs	r0, r3			@ get current way/set
+	beq	2f			@ at 0 means we are done.
+	movs	r0, r0, lsl #10		@ clear way bits leaving only set bits
+	subne	r3, r3, r1		@ non-zero?, decrement set #
+	subeq	r3, r3, r2		@ zero?, decrement way # and restore set count
+	b	1b
+
+2:	dsb				@ wait for stores to finish
+	mov	r0, #0			@ and ...
+	mcr	p15, 0, r0, c7, c5, 0	@ invalidate L1 cache
+	isb				@ instruction sync barrier
+	bx	lr			@ return
+END(armv7_icache_inv_all)
+
+/* * LINTSTUB: void armv7_dcache_inv_all(void); */
+ENTRY_NP(armv7_dcache_inv_all)
+	mrc	p15, 1, r0, c0, c0, 1	@ read CLIDR
+	ands	r3, r0, #0x07000000
+	beq	.Ldone_inv
+	lsr	r3, r3, #23		@ left align loc (low 4 bits)
+
+	mov	r1, #0
+.Lstart_inv:
+	add	r2, r3, r3, lsr #1	@ r2 = level * 3 / 2
+	mov	r1, r0, lsr r2		@ r1 = cache type
+	and	r1, r1, #7
+	cmp	r1, #2			@ is it data or i&d?
+	blt	.Lnext_level_inv	@ nope, skip level
+
+	mcr	p15, 2, r3, c0, c0, 0	@ select cache level
+	isb
+	mrc	p15, 1, r0, c0, c0, 0	@ read CCSIDR
+
+	ubfx	ip, r0, #0, #3		@ get linesize from CCSIDR
+	add	ip, ip, #4		@ apply bias
+	ubfx	r2, r0, #13, #15	@ get numsets - 1 from CCSIDR
+	lsl	r2, r2, ip		@ shift to set position
+	orr	r3, r3, r2		@ merge set into way/set/level 
+	mov	r1, #1
+	lsl	r1, r1, ip		@ r1 = set decr
+
+	ubfx	ip, r0, #3, #10		@ get numways - 1 from [to be discarded] CCSIDR
+	clz	r2, ip			@ number of bits to MSB of way
+	lsl	ip, ip, r2		@ shift by that into way position
+	mov	r0, #1			@ 
+	lsl	r2, r0, r2		@ r2 now contains the way decr
+	mov	r0, r3 			@ get sets/level (no way yet)
+	orr	r3, r3, ip		@ merge way into way/set/level
+	bfc	r0, #0, #4		@ clear low 4 bits (level) to get numset - 1
+	sub	r2, r2, r0		@ subtract from way decr
+
+	/* r3 = ways/sets/level, r2 = way decr, r1 = set decr, r0 and ip are free */
+1:	mcr	p15, 0, r3, c7, c6, 2	@ invalidate line
+	cmp	r3, #15			@ are we done with this level (way/set == 0) 
+	bls	.Lnext_level_inv	@ yes, go to next level
+	lsl	r0, r3, #10		@ clear way bits leaving only set/level bits
+	lsr	r0, r0, #4		@ clear level bits leaving only set bits
+	subne	r3, r3, r1		@ non-zero?, decrement set #
+	subeq	r3, r3, r2		@ zero?, decrement way # and restore set count
+	b	1b
+
+.Lnext_level_inv:
+	mrc	p15, 1, r0, c0, c0, 1	@ read CLIDR
+	and	ip, r0, #0x07000000	@ narrow to LoC
+	lsr	ip, ip, #23		@ left align LoC (low 4 bits)
+	add	r3, r3, #2		@ go to next level
+	cmp	r3, ip			@ compare
+	blt	.Lstart_inv		@ not done, next level (r0 == CLIDR)
+
+.Ldone_inv:
+	mov	r0, #0			@ default back to cache level 0
+	mcr	p15, 2, r0, c0, c0, 0	@ select cache level
+	dsb
+	isb
+	bx	lr
+END(armv7_dcache_inv_all)
+
+/* * LINTSTUB: void armv7_dcache_wbinv_all(void); */
+ENTRY_NP(armv7_dcache_wbinv_all)
+	mrc	p15, 1, r0, c0, c0, 1	@ read CLIDR
+	ands	r3, r0, #0x07000000
+	beq	.Ldone_wbinv
+	lsr	r3, r3, #23		@ left align loc (low 4 bits)
+
+	mov	r1, #0
+.Lstart_wbinv:
+	add	r2, r3, r3, lsr #1	@ r2 = level * 3 / 2
+	mov	r1, r0, lsr r2		@ r1 = cache type
+	bfc	r1, #3, #28
+	cmp	r1, #2			@ is it data or i&d?
+	blt	.Lnext_level_wbinv	@ nope, skip level
+
+	mcr	p15, 2, r3, c0, c0, 0	@ select cache level
+	isb
+	mrc	p15, 1, r0, c0, c0, 0	@ read CCSIDR
+
+	ubfx	ip, r0, #0, #3		@ get linesize from CCSIDR
+	add	ip, ip, #4		@ apply bias
+	ubfx	r2, r0, #13, #15	@ get numsets - 1 from CCSIDR
+	lsl	r2, r2, ip		@ shift to set position
+	orr	r3, r3, r2		@ merge set into way/set/level 
+	mov	r1, #1
+	lsl	r1, r1, ip		@ r1 = set decr
+
+	ubfx	ip, r0, #3, #10		@ get numways - 1 from [to be discarded] CCSIDR
+	clz	r2, ip			@ number of bits to MSB of way
+	lsl	ip, ip, r2		@ shift by that into way position
+	mov	r0, #1			@ 
+	lsl	r2, r0, r2		@ r2 now contains the way decr
+	mov	r0, r3 			@ get sets/level (no way yet)
+	orr	r3, r3, ip		@ merge way into way/set/level
+	bfc	r0, #0, #4		@ clear low 4 bits (level) to get numset - 1
+	sub	r2, r2, r0		@ subtract from way decr
+
+	/* r3 = ways/sets/level, r2 = way decr, r1 = set decr, r0 and ip are free */
+1:	mcr	p15, 0, r3, c7, c14, 2	@ writeback and invalidate line
+	cmp	r3, #15			@ are we done with this level (way/set == 0) 
+	bls	.Lnext_level_wbinv	@ yes, go to next level
+	lsl	r0, r3, #10		@ clear way bits leaving only set/level bits
+	lsr	r0, r0, #4		@ clear level bits leaving only set bits
+	subne	r3, r3, r1		@ non-zero?, decrement set #
+	subeq	r3, r3, r2		@ zero?, decrement way # and restore set count
+	b	1b
+
+.Lnext_level_wbinv:
+	mrc	p15, 1, r0, c0, c0, 1	@ read CLIDR
+	and	ip, r0, #0x07000000	@ narrow to LoC
+	lsr	ip, ip, #23		@ left align LoC (low 4 bits)
+	add	r3, r3, #2		@ go to next level
+	cmp	r3, ip			@ compare
+	blt	.Lstart_wbinv		@ not done, next level (r0 == CLIDR)
+
+.Ldone_wbinv:
+	mov	r0, #0			@ default back to cache level 0
+	mcr	p15, 2, r0, c0, c0, 0	@ select cache level
+	dsb
+	isb
+	bx	lr
+END(armv7_dcache_wbinv_all)
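
For reference, the new invalidate-all routines walk the cache hierarchy
the same way the deleted C code did, but keep the whole way/set/level
word packed in a single register and count it down, so they never touch
the stack (only r0-r3 and ip, as the comment in the file notes).  They
also derive the line size at run time from CCSIDR instead of the old
hard-coded 32-byte "entrysize".  A hedged C model of the walk the
assembly implements (illustrative only, with a hypothetical name; shown
for the invalidate case, c7, c6, 2 -- the writeback-and-invalidate
variant uses c7, c14, 2 instead):

	#include <stdint.h>

	static void
	armv7_dcache_inv_all_model(void)	/* hypothetical name */
	{
		uint32_t clidr, csid;

		__asm volatile("mrc p15, 1, %0, c0, c0, 1" : "=r"(clidr));
		unsigned loc = (clidr >> 24) & 7;	/* Level of Coherency */

		for (unsigned level = 0; level < loc; level++) {
			unsigned ctype = (clidr >> (3 * level)) & 7;
			if (ctype < 2)		/* no cache, or I-cache only */
				continue;

			/* Select the level, then read its geometry. */
			__asm volatile("mcr p15, 2, %0, c0, c0, 0\n\tisb"
			    :: "r"(level << 1));
			__asm volatile("mrc p15, 1, %0, c0, c0, 0" : "=r"(csid));

			unsigned setshift = (csid & 7) + 4;	/* log2(linelen) */
			unsigned nsets = (csid >> 13) & 0x7fff;	/* sets - 1 */
			unsigned nways = (csid >> 3) & 0x3ff;	/* ways - 1 */
			/* Assumes > 1 way; clz(0) is undefined in C,
			 * unlike the ARM CLZ instruction. */
			unsigned wayshift = __builtin_clz(nways);

			for (int way = nways; way >= 0; way--)
				for (int set = nsets; set >= 0; set--)
					__asm volatile(
					    "mcr p15, 0, %0, c7, c6, 2" ::
					    "r"(((uint32_t)way << wayshift) |
						((uint32_t)set << setshift) |
						(level << 1)));
		}

		__asm volatile("mcr p15, 2, %0, c0, c0, 0" :: "r"(0));
		__asm volatile("dsb\n\tisb" ::: "memory");
	}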
