On Sat, Aug 06, 2016 at 12:02:53PM +0200, Mark Kettenis wrote:

> The diff below puts the page tables on armv7 in cachable memory.  This
> should speed things up a little bit, especially on processors that
> walk the page tables coherently, which is pretty much all of them
> except for Cortex-A8.  We check whether the page table walk is
> coherent by checking the appropriate field in the ID_MMFR3 register.
> If the processor walks the page tables coherently, we disable flushing
> of page table entries.
> 
> This is a necessary step for MULTIPROCESSOR support, and it should
> reduce the number of times we move pages from cachable to uncachable,
> a transition I'm not sure we handle correctly.
> 
> I'd appreciate it if somebody could test this on a system with a
> Cortex-A8 processor.

The diff works fine on Cortex-A8 and brings a kernel build down from 49m
to 19m there!
 
> Index: arch/arm/arm/cpufunc_asm_armv7.S
> ===================================================================
> RCS file: /cvs/src/sys/arch/arm/arm/cpufunc_asm_armv7.S,v
> retrieving revision 1.12
> diff -u -p -r1.12 cpufunc_asm_armv7.S
> --- arch/arm/arm/cpufunc_asm_armv7.S  5 Aug 2016 19:56:52 -0000       1.12
> +++ arch/arm/arm/cpufunc_asm_armv7.S  6 Aug 2016 09:51:54 -0000
> @@ -35,6 +35,17 @@ ENTRY(armv7_periphbase)
>       mrc     CP15_CBAR(r0)   
>       mov     pc, lr
>  
> +#define TTBR_RGN_NC  (0 << 3)
> +#define TTBR_RGN_WBWA        (1 << 3)
> +#define TTBR_RGN_WT  (2 << 3)
> +#define TTBR_RGN_WBNWA       (3 << 3)
> +#define TTBR_IRGN_NC ((0 << 0) | (0 << 6))
> +#define TTBR_IRGN_WBWA       ((0 << 0) | (1 << 6))
> +#define TTBR_IRGN_WT ((1 << 0) | (0 << 6))
> +#define TTBR_IRGN_WBNWA      ((1 << 0) | (1 << 6))
> +
> +#define TTBR_DEFAULT (TTBR_IRGN_WBNWA | TTBR_RGN_WBNWA)
> +
>  /*
>   * Functions to set the MMU Translation Table Base register
>   */
> @@ -44,6 +55,7 @@ ENTRY(armv7_setttb)
>       dsb     sy
>       isb     sy
>  
> +     orr     r0, r0, #TTBR_DEFAULT
>       mcr     CP15_TTBR0(r0)          /* load new TTB */
>       mcr     CP15_TLBIALL(r0)        /* invalidate unified TLB */
>       dsb     sy
> @@ -232,6 +244,7 @@ ENTRY(armv7_context_switch)
>       dsb     sy
>       isb     sy
>  
> +     orr     r0, r0, #TTBR_DEFAULT
>       mcr     CP15_TTBR0(r0)          /* set the new TTB */
>       mcr     CP15_TLBIALL(r0)        /* and flush the unified tlb */
>       dsb     sy
> Index: arch/arm/arm/pmap7.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/arm/arm/pmap7.c,v
> retrieving revision 1.32
> diff -u -p -r1.32 pmap7.c
> --- arch/arm/arm/pmap7.c      3 Aug 2016 11:52:43 -0000       1.32
> +++ arch/arm/arm/pmap7.c      6 Aug 2016 09:51:55 -0000
> @@ -3291,7 +3291,7 @@ pmap_pte_init_generic(void)
>  void
>  pmap_pte_init_armv7(void)
>  {
> -     uint32_t cachereg;
> +     uint32_t id_mmfr3;
>  
>       /*
>        * XXX 
> @@ -3305,9 +3305,10 @@ pmap_pte_init_armv7(void)
>       pte_l2_l_cache_mode = L2_C|L2_B;
>       pte_l2_s_cache_mode = L2_C|L2_B;
>  
> -     pte_l1_s_cache_mode_pt = L1_S_C;
> -     pte_l2_l_cache_mode_pt = L2_C;
> -     pte_l2_s_cache_mode_pt = L2_C;
> +     pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C;
> +     pte_l2_l_cache_mode_pt = L2_B|L2_C;
> +     pte_l2_s_cache_mode_pt = L2_B|L2_C;
> +     pmap_needs_pte_sync = 1;
>  
>       pte_l1_s_cache_mask = L1_S_CACHE_MASK_v7;
>       pte_l2_l_cache_mask = L2_L_CACHE_MASK_v7;
> @@ -3339,26 +3340,10 @@ pmap_pte_init_armv7(void)
>       pte_l1_c_proto = L1_C_PROTO_v7;
>       pte_l2_s_proto = L2_S_PROTO_v7;
>  
> -     /* probe L1 dcache */
> -     __asm volatile("mcr p15, 2, %0, c0, c0, 0" :: "r" (0) );
> -     __asm volatile("mrc p15, 1, %0, c0, c0, 0" : "=r" (cachereg) );
> -     if ((cachereg & 0x80000000) == 0) {
> -#if 0
> -             /*
> -              * pmap_pte_init_generic() has defaulted to write-through
> -              * settings for pte pages, but the cache does not support
> -              * write-through.
> -              */
> -             pmap_needs_pte_sync = 1;
> -             pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C;
> -             pte_l2_l_cache_mode_pt = L2_B|L2_C;
> -             pte_l2_s_cache_mode_pt = L2_B|L2_C;
> -#endif
> -             /* XXX: Don't cache PTEs, until write-back is fixed. */
> -             pte_l1_s_cache_mode_pt = L1_S_V7_TEX(1);
> -             pte_l2_l_cache_mode_pt = L2_V7_L_TEX(1);
> -             pte_l2_s_cache_mode_pt = L2_V7_S_TEX(1);
> -     }
> +     /* Check for coherent walk. */
> +     __asm volatile("mrc p15, 0, %0, c0, c1, 7" : "=r"(id_mmfr3));
> +     if ((id_mmfr3 & 0x00f00000) == 0x00100000)
> +             pmap_needs_pte_sync = 0;
>  }
>  
>  uint32_t pmap_alias_dist;

Reply via email to