OK, I have a Sun Blade 2500 (2x UltraSPARC III) I can use to test. I'll try
to get to this this weekend.

Patrick


On Wed, Dec 4, 2013 at 3:56 AM, Kirill Tkhai <tk...@yandex.ru> wrote:

> Hi,
>
> I'm looking for a person who has a sparc64 machine with NUMA. The patch
> below adds NUMA kernel text replication support. This should improve
> sparc64 kernel performance a little bit.
>
> I tested it on my machines, and it appears to work for me. But they are
> not standard Sun v9 machines, so I am looking for a person with a
> standard, vanilla-supported machine!
>
> Is anybody able to help me?
>
> It's necessary to 1) clone David Miller's git tree:
>
> git clone --depth=1 git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc.git
>
> 2) apply the patch, and 3) not forget to enable CONFIG_NUMA in
> xconfig/menuconfig.
>
> The next step is to do a boot test. If everything is OK, I'd be very
> thankful if you could also run any short performance test, both before
> the patch and with it.
>
> Thanks!
>
> Signed-off-by: Kirill Tkhai <tk...@yandex.ru>
> ---
>  arch/sparc/include/asm/page_64.h    |    3 +
>  arch/sparc/include/asm/pgtable_64.h |    6 ++
>  arch/sparc/include/asm/trap_block.h |   17 ++++++
>  arch/sparc/kernel/smp_64.c          |    8 ++-
>  arch/sparc/kernel/trampoline_64.S   |   46 ++++++++++++++---
>  arch/sparc/mm/init_64.c             |   94
> ++++++++++++++++++++++++++++++++++-
>  arch/sparc/mm/init_64.h             |    2 +-
>  7 files changed, 163 insertions(+), 13 deletions(-)
> diff --git a/arch/sparc/include/asm/page_64.h
> b/arch/sparc/include/asm/page_64.h
> index aac53fc..5a85352 100644
> --- a/arch/sparc/include/asm/page_64.h
> +++ b/arch/sparc/include/asm/page_64.h
> @@ -8,6 +8,9 @@
>  #define PAGE_SIZE    (_AC(1,UL) << PAGE_SHIFT)
>  #define PAGE_MASK    (~(PAGE_SIZE-1))
>
> +#define PAGE4MB_SHIFT          22
> +#define PAGE4MB_SIZE           (_AC(1,UL) << PAGE4MB_SHIFT)
> +
>  /* Flushing for D-cache alias handling is only needed if
>   * the page size is smaller than 16K.
>   */
> diff --git a/arch/sparc/include/asm/pgtable_64.h
> b/arch/sparc/include/asm/pgtable_64.h
> index 8358dc1..0b0495f 100644
> --- a/arch/sparc/include/asm/pgtable_64.h
> +++ b/arch/sparc/include/asm/pgtable_64.h
> @@ -884,6 +884,12 @@ extern pmd_t swapper_low_pmd_dir[PTRS_PER_PMD];
>  extern void paging_init(void);
>  extern unsigned long find_ecache_flush_span(unsigned long size);
>
> +#ifdef CONFIG_NUMA
> +extern void numa_copy_kernel_text(void);
> +#else
> +static inline void numa_copy_kernel_text(void) {}
> +#endif
> +
>  struct seq_file;
>  extern void mmu_info(struct seq_file *);
>
> diff --git a/arch/sparc/include/asm/trap_block.h
> b/arch/sparc/include/asm/trap_block.h
> index 7e26b2d..a2f0990 100644
> --- a/arch/sparc/include/asm/trap_block.h
> +++ b/arch/sparc/include/asm/trap_block.h
> @@ -138,6 +138,23 @@ extern struct sun4v_2insn_patch_entry
> __sun4v_2insn_patch,
>         nop;                                            \
>         .previous;
>
> +#ifdef CONFIG_NUMA
> +
> +#define __GET_NODEID(REG, TMP)                         \
> +       __GET_CPUID(REG)                                \
> +       sethi   %hi(numa_cpu_lookup_table), TMP;        \
> +       or      TMP, %lo(numa_cpu_lookup_table), TMP;   \
> +       sllx    REG, 2, REG;                            \
> +       add     TMP, REG, TMP;                          \
> +       lduw    [TMP], REG;
> +
> +#else /* !CONFIG_NUMA */
> +
> +#define __GET_NODEID(REG, TMP)                         \
> +       clr     REG
> +
> +#endif /* !CONFIG_NUMA */
> +
>  #ifdef CONFIG_SMP
>
>  #define TRAP_LOAD_TRAP_BLOCK(DEST, TMP)                \
> diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
> index b66a533..554a0c5 100644
> --- a/arch/sparc/kernel/smp_64.c
> +++ b/arch/sparc/kernel/smp_64.c
> @@ -285,7 +285,7 @@ static void ldom_startcpu_cpuid(unsigned int cpu,
> unsigned long thread_reg,
>                                 void **descrp)
>  {
>         extern unsigned long sparc64_ttable_tl0;
> -       extern unsigned long kern_locked_tte_data;
> +       extern unsigned long kern_locked_tte_data[MAX_NUMNODES];
>         struct hvtramp_descr *hdesc;
>         unsigned long trampoline_ra;
>         struct trap_per_cpu *tb;
> @@ -315,7 +315,7 @@ static void ldom_startcpu_cpuid(unsigned int cpu,
> unsigned long thread_reg,
>         hdesc->thread_reg = thread_reg;
>
>         tte_vaddr = (unsigned long) KERNBASE;
> -       tte_data = kern_locked_tte_data;
> +       tte_data = kern_locked_tte_data[0];
>
>         for (i = 0; i < hdesc->num_mappings; i++) {
>                 hdesc->maps[i].vaddr = tte_vaddr;
> @@ -1214,6 +1214,10 @@ int setup_profiling_timer(unsigned int multiplier)
>
>  void __init smp_prepare_cpus(unsigned int max_cpus)
>  {
> +       /* Duplicate kernel on every node. Do this after
> +        * all kernel patches are applied.
> +        */
> +       numa_copy_kernel_text();
>  }
>
>  void smp_prepare_boot_cpu(void)
> diff --git a/arch/sparc/kernel/trampoline_64.S
> b/arch/sparc/kernel/trampoline_64.S
> index ad4bde3..e5a4f85 100644
> --- a/arch/sparc/kernel/trampoline_64.S
> +++ b/arch/sparc/kernel/trampoline_64.S
> @@ -117,26 +117,42 @@ startup_continue:
>         flushw
>
>         /* Setup the loop variables:
> +        * %l1: Number of 4MB pages containing non-init kernel text
> +        * %l2: TTE base of node 0. Used for DTLB and for rest of __init
> text
> +        *      ITLB mappings. See numa_alloc_kernel_text() for details.
>          * %l3: VADDR base
> -        * %l4: TTE base
> +        * %l4: TTE base of current node. Used for ITLB.
>          * %l5: Loop iterator, iterates from 0 to
> 'num_kernel_image_mappings'
>          * %l6: Number of TTE entries to map
>          * %l7: Highest TTE entry number, we count down
>          */
>         sethi           %hi(KERNBASE), %l3
>         sethi           %hi(kern_locked_tte_data), %l4
> -       ldx             [%l4 + %lo(kern_locked_tte_data)], %l4
> +       or              %l4, %lo(kern_locked_tte_data), %l4
> +       ldx             [%l4], %l2      ! kern_locked_tte_data[0]
> +
> +       __GET_NODEID(%g2, %g1)
> +       sllx            %g2, 3, %g2
> +       add             %l4, %g2, %l4
> +       ldx             [%l4], %l4      ! kern_locked_tte_data[node]
> +
>         clr             %l5
>         sethi           %hi(num_kernel_image_mappings), %l6
>         lduw            [%l6 + %lo(num_kernel_image_mappings)], %l6
>
> +       sethi           %hi(num_node_copy_mappings), %l1
> +       lduw            [%l1 + %lo(num_node_copy_mappings)], %l1
> +
>         mov             15, %l7
>         BRANCH_IF_ANY_CHEETAH(g1,g5,2f)
>
>         mov             63, %l7
>  2:
> -
> -3:
> +       cmp             %l5, %l1        !__init section
> +       bne             4f
> +        nop
> +       mov             %l2, %l4        !use node 0 TTE
> +4:
>         /* Lock into I-MMU */
>         sethi           %hi(call_method), %g2
>         or              %g2, %lo(call_method), %g2
> @@ -190,7 +206,7 @@ startup_continue:
>
>         add             %l3, %g1, %g2
>         stx             %g2, [%sp + 2047 + 128 + 0x28]  ! VADDR
> -       add             %l4, %g1, %g2
> +       add             %l2, %g1, %g2
>         stx             %g2, [%sp + 2047 + 128 + 0x30]  ! TTE
>
>         /* TTE index is highest minus loop index.  */
> @@ -205,7 +221,7 @@ startup_continue:
>
>         add             %l5, 1, %l5
>         cmp             %l5, %l6
> -       bne,pt          %xcc, 3b
> +       bne,pt          %xcc, 2b
>          nop
>
>         sethi           %hi(prom_entry_lock), %g2
> @@ -217,12 +233,26 @@ startup_continue:
>  niagara_lock_tlb:
>         sethi           %hi(KERNBASE), %l3
>         sethi           %hi(kern_locked_tte_data), %l4
> -       ldx             [%l4 + %lo(kern_locked_tte_data)], %l4
> +       or              %l4, %lo(kern_locked_tte_data), %l4
> +       ldx             [%l4], %l2      ! kern_locked_tte_data[0]
> +
> +       __GET_NODEID(%g2, %g1)
> +       sllx            %g2, 3, %g2
> +       add             %l4, %g2, %l4
> +       ldx             [%l4], %l4      ! kern_locked_tte_data[node]
> +
>         clr             %l5
>         sethi           %hi(num_kernel_image_mappings), %l6
>         lduw            [%l6 + %lo(num_kernel_image_mappings)], %l6
>
> +       sethi           %hi(num_node_copy_mappings), %l1
> +       lduw            [%l1 + %lo(num_node_copy_mappings)], %l1
>  1:
> +       cmp             %l5, %l1        !__init section
> +       bne             4f
> +        nop
> +       mov             %l2, %l4        !use node 0 TTE
> +4:
>         mov             HV_FAST_MMU_MAP_PERM_ADDR, %o5
>         sllx            %l5, 22, %g2
>         add             %l3, %g2, %o0
> @@ -235,7 +265,7 @@ niagara_lock_tlb:
>         sllx            %l5, 22, %g2
>         add             %l3, %g2, %o0
>         clr             %o1
> -       add             %l4, %g2, %o2
> +       add             %l2, %g2, %o2
>         mov             HV_MMU_DMMU, %o3
>         ta              HV_FAST_TRAP
>
> diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
> index 5322e53..0183213 100644
> --- a/arch/sparc/mm/init_64.c
> +++ b/arch/sparc/mm/init_64.c
> @@ -186,6 +186,7 @@ unsigned long sparc64_kern_pri_nuc_bits __read_mostly;
>  unsigned long sparc64_kern_sec_context __read_mostly;
>
>  int num_kernel_image_mappings;
> +int num_node_copy_mappings;
>
>  #ifdef CONFIG_DEBUG_DCFLUSH
>  atomic_t dcpage_flushes = ATOMIC_INIT(0);
> @@ -477,7 +478,7 @@ void mmu_info(struct seq_file *m)
>  struct linux_prom_translation prom_trans[512] __read_mostly;
>  unsigned int prom_trans_ents __read_mostly;
>
> -unsigned long kern_locked_tte_data;
> +unsigned long kern_locked_tte_data[MAX_NUMNODES];
>
>  /* The obp translations are saved based on 8k pagesize, since obp can
>   * use a mixture of pagesizes. Misses to the LOW_OBP_ADDRESS ->
> @@ -591,7 +592,7 @@ static void __init remap_kernel(void)
>         phys_page = (prom_boot_mapping_phys_low >> 22UL) << 22UL;
>         tte_data = kern_large_tte(phys_page);
>
> -       kern_locked_tte_data = tte_data;
> +       kern_locked_tte_data[0] = tte_data;
>
>         /* Now lock us into the TLBs via Hypervisor or OBP. */
>         if (tlb_type == hypervisor) {
> @@ -1330,6 +1331,79 @@ static void __init bootmem_init_nonnuma(void)
>         node_set_online(0);
>  }
>
> +#ifdef CONFIG_NUMA
> +
> +/* Allocate memory for per-node copy of kernel text.
> + * The copying itself will be made after all kernel
> + * patches are applied.
> + */
> +static void __init numa_alloc_kernel_text(void)
> +{
> +       unsigned long init_start = (unsigned long)__init_begin;
> +       unsigned int size, node;
> +
> +       /* The rest of the init text will be mapped from the original image.
> +        */
> +       size = round_up(init_start - KERNBASE, PAGE4MB_SIZE);
> +       num_node_copy_mappings = size >> PAGE4MB_SHIFT;
> +
> +       for (node = 1; node < num_node_masks; node++) {
> +               unsigned long tte_data;
> +               phys_addr_t new_base_pa;
> +
> +               new_base_pa = memblock_alloc_nid(size, PAGE4MB_SIZE, node);
> +
> +               if (new_base_pa) {
> +                       pr_info("node %d: Allocated memory for copy of "
> +                               "kernel text: [%016llx, %016llx]\n",
> +                                node, new_base_pa, new_base_pa + size);
> +                       tte_data = kern_large_tte(new_base_pa);
> +               } else {
> +                       pr_err("node %d: Can't allocate memory for kernel "
> +                              "text duplicate\n", node);
> +                       tte_data = kern_locked_tte_data[0];
> +               }
> +
> +               kern_locked_tte_data[node] = tte_data;
> +       }
> +}
> +
> +/* Duplicate kernel text on every NUMA node.
> + * Do not copy pages which contain only init text,
> + * because they are mapped from original kernel.
> + */
> +void numa_copy_kernel_text(void)
> +{
> +       unsigned int size, node;
> +       unsigned long tte_data0;
> +
> +       size = num_node_copy_mappings << PAGE4MB_SHIFT;
> +       tte_data0 = kern_locked_tte_data[0];
> +
> +       for (node = 1; node < num_node_masks; node++) {
> +               unsigned long tte_data, phys_addr;
> +
> +               tte_data = kern_locked_tte_data[node];
> +
> +               if (tte_data == tte_data0)
> +                       continue;
> +
> +               /* PA is [42:13] range */
> +               phys_addr = (((tte_data << 21) >> 21) >> 13) << 13;
> +
> +               memcpy(__va(phys_addr), (void *)KERNBASE, size);
> +       }
> +}
> +
> +#else /* CONFIG_NUMA */
> +
> +static void __init numa_alloc_kernel_text(void)
> +{
> +}
> +
> +#endif /* CONFIG_NUMA */
> +
> +
>  static unsigned long __init bootmem_init(unsigned long phys_base)
>  {
>         unsigned long end_pfn;
> @@ -1341,6 +1415,8 @@ static unsigned long __init bootmem_init(unsigned
> long phys_base)
>         if (bootmem_init_numa() < 0)
>                 bootmem_init_nonnuma();
>
> +       numa_alloc_kernel_text();
> +
>         /* Dump memblock with node info. */
>         memblock_dump_all();
>
> @@ -1922,6 +1998,9 @@ void __init paging_init(void)
>                 memblock_add(pavail[i].phys_addr, pavail[i].reg_size);
>         }
>
> +#ifdef CONFIG_NUMA
> +       kern_size = round_up(kern_size, PAGE4MB_SIZE);
> +#endif
>         memblock_reserve(kern_base, kern_size);
>
>         find_ramdisk(phys_base);
> @@ -2188,6 +2267,17 @@ void free_initmem(void)
>          * The init section is aligned to 8k in vmlinux.lds. Page align
> for >8k pagesizes.
>          */
>         addr = PAGE_ALIGN((unsigned long)(__init_begin));
> +
> +#ifdef CONFIG_NUMA
> +       if (num_node_masks > 1) {
> +               /* Do not free 4KB pages which lie in the same 4MB page
> +                * as normal kernel text. Their addresses
> +                * are forbidden forever.
> +                */
> +               addr = round_up(addr, PAGE4MB_SIZE);
> +       }
> +#endif
> +
>         initend = (unsigned long)(__init_end) & PAGE_MASK;
>         for (; addr < initend; addr += PAGE_SIZE) {
>                 unsigned long page;
> diff --git a/arch/sparc/mm/init_64.h b/arch/sparc/mm/init_64.h
> index 5d3782de..a14c8d8 100644
> --- a/arch/sparc/mm/init_64.h
> +++ b/arch/sparc/mm/init_64.h
> @@ -34,7 +34,7 @@ extern struct linux_prom_translation prom_trans[512];
>  extern unsigned int prom_trans_ents;
>
>  /* Exported for SMP bootup purposes. */
> -extern unsigned long kern_locked_tte_data;
> +extern unsigned long kern_locked_tte_data[MAX_NUMNODES];
>
>  extern void prom_world(int enter);
>
>
>
> --
> To UNSUBSCRIBE, email to debian-sparc-requ...@lists.debian.org
> with a subject of "unsubscribe". Trouble? Contact
> listmas...@lists.debian.org
> Archive: http://lists.debian.org/176311386150...@web5m.yandex.ru
>
>

Reply via email to