On Sat, Aug 06, 2016 at 12:02:53PM +0200, Mark Kettenis wrote:
> The diff below puts the page tables on armv7 in cachable memory. This
> should speed things up a little bit, especially on processors that
> walk the page tables coherently, which is pretty much all of them
> except for Cortex-A8. We check whether the page table walk is
> coherent by checking the appropriate field in the ID_MMFR3 register.
> If the processor walks the page tables coherently, we disable flushing
> of page table entries.
>
> This is a necessary step for MULTIPROCESSOR support, and it should
> reduce the number of times we move pages from cachable to uncachable,
> a transition I'm not sure we handle correctly.
>
> I'd appreciate it if somebody could test this on a system with a
> Cortex-A8 processor.
The diff works fine on Cortex-A8 and brings a kernel build down from 49m
to 19m there!
> Index: arch/arm/arm/cpufunc_asm_armv7.S
> ===================================================================
> RCS file: /cvs/src/sys/arch/arm/arm/cpufunc_asm_armv7.S,v
> retrieving revision 1.12
> diff -u -p -r1.12 cpufunc_asm_armv7.S
> --- arch/arm/arm/cpufunc_asm_armv7.S 5 Aug 2016 19:56:52 -0000 1.12
> +++ arch/arm/arm/cpufunc_asm_armv7.S 6 Aug 2016 09:51:54 -0000
> @@ -35,6 +35,17 @@ ENTRY(armv7_periphbase)
> mrc CP15_CBAR(r0)
> mov pc, lr
>
> +#define TTBR_RGN_NC (0 << 3)
> +#define TTBR_RGN_WBWA (1 << 3)
> +#define TTBR_RGN_WT (2 << 3)
> +#define TTBR_RGN_WBNWA (3 << 3)
> +#define TTBR_IRGN_NC ((0 << 0) | (0 << 6))
> +#define TTBR_IRGN_WBWA ((0 << 0) | (1 << 6))
> +#define TTBR_IRGN_WT ((1 << 0) | (0 << 6))
> +#define TTBR_IRGN_WBNWA ((1 << 0) | (1 << 6))
> +
> +#define TTBR_DEFAULT (TTBR_IRGN_WBNWA | TTBR_RGN_WBNWA)
> +
> /*
> * Functions to set the MMU Translation Table Base register
> */
> @@ -44,6 +55,7 @@ ENTRY(armv7_setttb)
> dsb sy
> isb sy
>
> + orr r0, r0, #TTBR_DEFAULT
> mcr CP15_TTBR0(r0) /* load new TTB */
> mcr CP15_TLBIALL(r0) /* invalidate unified TLB */
> dsb sy
> @@ -232,6 +244,7 @@ ENTRY(armv7_context_switch)
> dsb sy
> isb sy
>
> + orr r0, r0, #TTBR_DEFAULT
> mcr CP15_TTBR0(r0) /* set the new TTB */
> mcr CP15_TLBIALL(r0) /* and flush the unified tlb */
> dsb sy
> Index: arch/arm/arm/pmap7.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/arm/arm/pmap7.c,v
> retrieving revision 1.32
> diff -u -p -r1.32 pmap7.c
> --- arch/arm/arm/pmap7.c 3 Aug 2016 11:52:43 -0000 1.32
> +++ arch/arm/arm/pmap7.c 6 Aug 2016 09:51:55 -0000
> @@ -3291,7 +3291,7 @@ pmap_pte_init_generic(void)
> void
> pmap_pte_init_armv7(void)
> {
> - uint32_t cachereg;
> + uint32_t id_mmfr3;
>
> /*
> * XXX
> @@ -3305,9 +3305,10 @@ pmap_pte_init_armv7(void)
> pte_l2_l_cache_mode = L2_C|L2_B;
> pte_l2_s_cache_mode = L2_C|L2_B;
>
> - pte_l1_s_cache_mode_pt = L1_S_C;
> - pte_l2_l_cache_mode_pt = L2_C;
> - pte_l2_s_cache_mode_pt = L2_C;
> + pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C;
> + pte_l2_l_cache_mode_pt = L2_B|L2_C;
> + pte_l2_s_cache_mode_pt = L2_B|L2_C;
> + pmap_needs_pte_sync = 1;
>
> pte_l1_s_cache_mask = L1_S_CACHE_MASK_v7;
> pte_l2_l_cache_mask = L2_L_CACHE_MASK_v7;
> @@ -3339,26 +3340,10 @@ pmap_pte_init_armv7(void)
> pte_l1_c_proto = L1_C_PROTO_v7;
> pte_l2_s_proto = L2_S_PROTO_v7;
>
> - /* probe L1 dcache */
> - __asm volatile("mcr p15, 2, %0, c0, c0, 0" :: "r" (0) );
> - __asm volatile("mrc p15, 1, %0, c0, c0, 0" : "=r" (cachereg) );
> - if ((cachereg & 0x80000000) == 0) {
> -#if 0
> - /*
> - * pmap_pte_init_generic() has defaulted to write-through
> - * settings for pte pages, but the cache does not support
> - * write-through.
> - */
> - pmap_needs_pte_sync = 1;
> - pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C;
> - pte_l2_l_cache_mode_pt = L2_B|L2_C;
> - pte_l2_s_cache_mode_pt = L2_B|L2_C;
> -#endif
> - /* XXX: Don't cache PTEs, until write-back is fixed. */
> - pte_l1_s_cache_mode_pt = L1_S_V7_TEX(1);
> - pte_l2_l_cache_mode_pt = L2_V7_L_TEX(1);
> - pte_l2_s_cache_mode_pt = L2_V7_S_TEX(1);
> - }
> + /* Check for coherent walk. */
> + __asm volatile("mrc p15, 0, %0, c0, c1, 7" : "=r"(id_mmfr3));
> + if ((id_mmfr3 & 0x00f00000) == 0x00100000)
> + pmap_needs_pte_sync = 0;
> }
>
> uint32_t pmap_alias_dist;