Hi, I'm looking for somebody who has a sparc64 machine with NUMA. The patch below adds NUMA kernel text replication support, which should improve sparc64 kernel performance a little.
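Before the diff, here is the rough idea in plain C. This is only an illustrative
sketch, not code from the patch: the MAX_NUMNODES value and lock_kernel_text_itlb()
are made-up placeholders for the hypervisor/OBP TLB locking that the patch really
does in trampoline_64.S, while the other names mirror the kernel symbols the patch
touches.

/* Sketch: one locked kernel-text TTE per NUMA node instead of a single
 * global one.
 */
#define MAX_NUMNODES	16			/* placeholder value */

extern int numa_cpu_lookup_table[];		/* cpu -> node, as in the kernel */
unsigned long kern_locked_tte_data[MAX_NUMNODES];

static void lock_kernel_text_itlb(unsigned long tte)
{
	/* Placeholder: the patch does this via HV_FAST_MMU_MAP_PERM_ADDR
	 * or OBP's call_method in trampoline_64.S.
	 */
	(void)tte;
}

void sketch_map_kernel_text(int cpu)
{
	int node = numa_cpu_lookup_table[cpu];	/* what __GET_NODEID computes */

	/* Each secondary CPU locks its ITLB entries for the kernel image
	 * using the TTE of the copy placed in its own node's memory.
	 */
	lock_kernel_text_itlb(kern_locked_tte_data[node]);
}

The DTLB and the trailing __init text always keep the node 0 mapping, so only
the kernel text proper is duplicated per node.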
I tested it on my machines and it appears to work for me, but they are not
standard Sun V9 boxes, so I'm looking for somebody with a standard,
vanilla-supported machine. Is anybody able to help me? The steps are:

1) clone David Miller's git tree:
   git clone --depth=1 git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc.git
2) apply the patch;
3) don't forget to enable CONFIG_NUMA in xconfig/menuconfig.

Then do a boot test. If everything is OK, I'll be very thankful if you can also
run some short performance test, once without the patch and once with it.

Thanks!

Signed-off-by: Kirill Tkhai <[email protected]>
---
 arch/sparc/include/asm/page_64.h    |  3 +
 arch/sparc/include/asm/pgtable_64.h |  6 ++
 arch/sparc/include/asm/trap_block.h | 17 ++++++
 arch/sparc/kernel/smp_64.c          |  8 ++-
 arch/sparc/kernel/trampoline_64.S   | 46 ++++++++++++++---
 arch/sparc/mm/init_64.c             | 94 ++++++++++++++++++++++++++++++++++-
 arch/sparc/mm/init_64.h             |  2 +-
 7 files changed, 163 insertions(+), 13 deletions(-)

diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
index aac53fc..5a85352 100644
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -8,6 +8,9 @@
 #define PAGE_SIZE    (_AC(1,UL) << PAGE_SHIFT)
 #define PAGE_MASK    (~(PAGE_SIZE-1))
 
+#define PAGE4MB_SHIFT		22
+#define PAGE4MB_SIZE		(_AC(1,UL) << PAGE4MB_SHIFT)
+
 /* Flushing for D-cache alias handling is only needed if
  * the page size is smaller than 16K.
  */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 8358dc1..0b0495f 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -884,6 +884,12 @@ extern pmd_t swapper_low_pmd_dir[PTRS_PER_PMD];
 extern void paging_init(void);
 extern unsigned long find_ecache_flush_span(unsigned long size);
 
+#ifdef CONFIG_NUMA
+extern void numa_copy_kernel_text(void);
+#else
+static inline void numa_copy_kernel_text(void) {}
+#endif
+
 struct seq_file;
 extern void mmu_info(struct seq_file *);
diff --git a/arch/sparc/include/asm/trap_block.h b/arch/sparc/include/asm/trap_block.h
index 7e26b2d..a2f0990 100644
--- a/arch/sparc/include/asm/trap_block.h
+++ b/arch/sparc/include/asm/trap_block.h
@@ -138,6 +138,23 @@ extern struct sun4v_2insn_patch_entry __sun4v_2insn_patch,
	nop;						\
	.previous;
 
+#ifdef CONFIG_NUMA
+
+#define __GET_NODEID(REG, TMP)				\
+	__GET_CPUID(REG)				\
+	sethi	%hi(numa_cpu_lookup_table), TMP;	\
+	or	TMP, %lo(numa_cpu_lookup_table), TMP;	\
+	sllx	REG, 2, REG;				\
+	add	TMP, REG, TMP;				\
+	lduw	[TMP], REG;
+
+#else /* !CONFIG_NUMA */
+
+#define __GET_NODEID(REG, TMP)				\
+	clr	REG
+
+#endif /* !CONFIG_NUMA */
+
 #ifdef CONFIG_SMP
 
 #define TRAP_LOAD_TRAP_BLOCK(DEST, TMP)			\
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index b66a533..554a0c5 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -285,7 +285,7 @@ static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg,
			       void **descrp)
 {
	extern unsigned long sparc64_ttable_tl0;
-	extern unsigned long kern_locked_tte_data;
+	extern unsigned long kern_locked_tte_data[MAX_NUMNODES];
	struct hvtramp_descr *hdesc;
	unsigned long trampoline_ra;
	struct trap_per_cpu *tb;
@@ -315,7 +315,7 @@ static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg,
	hdesc->thread_reg = thread_reg;
 
	tte_vaddr = (unsigned long) KERNBASE;
-	tte_data = kern_locked_tte_data;
+	tte_data = kern_locked_tte_data[0];
 
	for (i = 0; i < hdesc->num_mappings; i++) {
		hdesc->maps[i].vaddr = tte_vaddr;
@@ -1214,6 +1214,10 @@ int setup_profiling_timer(unsigned int multiplier)
 
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
+	/* Duplicate the kernel on every node. Do this after
+	 * all kernel patches are applied.
+	 */
+	numa_copy_kernel_text();
 }
 
 void smp_prepare_boot_cpu(void)
diff --git a/arch/sparc/kernel/trampoline_64.S b/arch/sparc/kernel/trampoline_64.S
index ad4bde3..e5a4f85 100644
--- a/arch/sparc/kernel/trampoline_64.S
+++ b/arch/sparc/kernel/trampoline_64.S
@@ -117,26 +117,42 @@ startup_continue:
	flushw
 
	/* Setup the loop variables:
+	 * %l1: Number of 4MB pages containing non-init kernel text
+	 * %l2: TTE base of node 0. Used for the DTLB and for the ITLB mappings
+	 *      of the trailing __init text. See numa_alloc_kernel_text() for details.
	 * %l3: VADDR base
-	 * %l4: TTE base
+	 * %l4: TTE base of the current node. Used for the ITLB.
	 * %l5: Loop iterator, iterates from 0 to 'num_kernel_image_mappings'
	 * %l6: Number of TTE entries to map
	 * %l7: Highest TTE entry number, we count down
	 */
	sethi		%hi(KERNBASE), %l3
	sethi		%hi(kern_locked_tte_data), %l4
-	ldx		[%l4 + %lo(kern_locked_tte_data)], %l4
+	or		%l4, %lo(kern_locked_tte_data), %l4
+	ldx		[%l4], %l2		! kern_locked_tte_data[0]
+
+	__GET_NODEID(%g2, %g1)
+	sllx		%g2, 3, %g2
+	add		%l4, %g2, %l4
+	ldx		[%l4], %l4		! kern_locked_tte_data[node]
+
	clr		%l5
	sethi		%hi(num_kernel_image_mappings), %l6
	lduw		[%l6 + %lo(num_kernel_image_mappings)], %l6
+	sethi		%hi(num_node_copy_mappings), %l1
+	lduw		[%l1 + %lo(num_node_copy_mappings)], %l1
+
	mov		15, %l7
	BRANCH_IF_ANY_CHEETAH(g1,g5,2f)
	mov		63, %l7
 2:
-
-3:
+	cmp		%l5, %l1		!__init section
+	bne		4f
+	 nop
+	mov		%l2, %l4		!use node 0 TTE
+4:
	/* Lock into I-MMU */
	sethi		%hi(call_method), %g2
	or		%g2, %lo(call_method), %g2
@@ -190,7 +206,7 @@ startup_continue:
	add		%l3, %g1, %g2
	stx		%g2, [%sp + 2047 + 128 + 0x28]	! VADDR
-	add		%l4, %g1, %g2
+	add		%l2, %g1, %g2
	stx		%g2, [%sp + 2047 + 128 + 0x30]	! TTE
 
	/* TTE index is highest minus loop index. */
@@ -205,7 +221,7 @@ startup_continue:
	add		%l5, 1, %l5
	cmp		%l5, %l6
-	bne,pt		%xcc, 3b
+	bne,pt		%xcc, 2b
	 nop
 
	sethi		%hi(prom_entry_lock), %g2
@@ -217,12 +233,26 @@ niagara_lock_tlb:
	sethi		%hi(KERNBASE), %l3
	sethi		%hi(kern_locked_tte_data), %l4
-	ldx		[%l4 + %lo(kern_locked_tte_data)], %l4
+	or		%l4, %lo(kern_locked_tte_data), %l4
+	ldx		[%l4], %l2		! kern_locked_tte_data[0]
+
+	__GET_NODEID(%g2, %g1)
+	sllx		%g2, 3, %g2
+	add		%l4, %g2, %l4
+	ldx		[%l4], %l4		! kern_locked_tte_data[node]
+
	clr		%l5
	sethi		%hi(num_kernel_image_mappings), %l6
	lduw		[%l6 + %lo(num_kernel_image_mappings)], %l6
+	sethi		%hi(num_node_copy_mappings), %l1
+	lduw		[%l1 + %lo(num_node_copy_mappings)], %l1
 
 1:
+	cmp		%l5, %l1		!__init section
+	bne		4f
+	 nop
+	mov		%l2, %l4		!use node 0 TTE
+4:
	mov		HV_FAST_MMU_MAP_PERM_ADDR, %o5
	sllx		%l5, 22, %g2
	add		%l3, %g2, %o0
@@ -235,7 +265,7 @@ niagara_lock_tlb:
	sllx		%l5, 22, %g2
	add		%l3, %g2, %o0
	clr		%o1
-	add		%l4, %g2, %o2
+	add		%l2, %g2, %o2
	mov		HV_MMU_DMMU, %o3
	ta		HV_FAST_TRAP
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 5322e53..0183213 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -186,6 +186,7 @@ unsigned long sparc64_kern_pri_nuc_bits __read_mostly;
 unsigned long sparc64_kern_sec_context __read_mostly;
 
 int num_kernel_image_mappings;
+int num_node_copy_mappings;
 
 #ifdef CONFIG_DEBUG_DCFLUSH
 atomic_t dcpage_flushes = ATOMIC_INIT(0);
@@ -477,7 +478,7 @@ void mmu_info(struct seq_file *m)
 struct linux_prom_translation prom_trans[512] __read_mostly;
 unsigned int prom_trans_ents __read_mostly;
 
-unsigned long kern_locked_tte_data;
+unsigned long kern_locked_tte_data[MAX_NUMNODES];
 
 /* The obp translations are saved based on 8k pagesize, since obp can
  * use a mixture of pagesizes. Misses to the LOW_OBP_ADDRESS ->
@@ -591,7 +592,7 @@ static void __init remap_kernel(void)
	phys_page = (prom_boot_mapping_phys_low >> 22UL) << 22UL;
	tte_data = kern_large_tte(phys_page);
 
-	kern_locked_tte_data = tte_data;
+	kern_locked_tte_data[0] = tte_data;
 
	/* Now lock us into the TLBs via Hypervisor or OBP. */
	if (tlb_type == hypervisor) {
@@ -1330,6 +1331,79 @@ static void __init bootmem_init_nonnuma(void)
	node_set_online(0);
 }
 
+#ifdef CONFIG_NUMA
+
+/* Allocate memory for a per-node copy of the kernel text.
+ * The copying itself is done after all kernel
+ * patches are applied.
+ */
+static void __init numa_alloc_kernel_text(void)
+{
+	unsigned long init_start = (unsigned long)__init_begin;
+	unsigned int size, node;
+
+	/* The rest of the init text will be mapped from the original image.
+	 */
+	size = round_up(init_start - KERNBASE, PAGE4MB_SIZE);
+	num_node_copy_mappings = size >> PAGE4MB_SHIFT;
+
+	for (node = 1; node < num_node_masks; node++) {
+		unsigned long tte_data;
+		phys_addr_t new_base_pa;
+
+		new_base_pa = memblock_alloc_nid(size, PAGE4MB_SIZE, node);
+
+		if (new_base_pa) {
+			pr_info("node %d: Allocated memory for copy of "
+				"kernel text: [%016llx, %016llx]\n",
+				node, new_base_pa, new_base_pa + size);
+			tte_data = kern_large_tte(new_base_pa);
+		} else {
+			pr_err("node %d: Can't allocate memory for kernel "
+			       "text duplicate\n", node);
+			tte_data = kern_locked_tte_data[0];
+		}
+
+		kern_locked_tte_data[node] = tte_data;
+	}
+}
+
+/* Duplicate the kernel text on every NUMA node.
+ * Do not copy pages which contain only init text,
+ * because they are mapped from the original kernel.
+ */
+void numa_copy_kernel_text(void)
+{
+	unsigned int size, node;
+	unsigned long tte_data0;
+
+	size = num_node_copy_mappings << PAGE4MB_SHIFT;
+	tte_data0 = kern_locked_tte_data[0];
+
+	for (node = 1; node < num_node_masks; node++) {
+		unsigned long tte_data, phys_addr;
+
+		tte_data = kern_locked_tte_data[node];
+
+		if (tte_data == tte_data0)
+			continue;
+
+		/* PA is the [42:12] bit range */
+		phys_addr = (((tte_data << 21) >> 21) >> 13) << 13;
+
+		memcpy(__va(phys_addr), (void *)KERNBASE, size);
+	}
+}
+
+#else /* CONFIG_NUMA */
+
+static void __init numa_alloc_kernel_text(void)
+{
+}
+
+#endif /* CONFIG_NUMA */
+
+
 static unsigned long __init bootmem_init(unsigned long phys_base)
 {
	unsigned long end_pfn;
@@ -1341,6 +1415,8 @@ static unsigned long __init bootmem_init(unsigned long phys_base)
	if (bootmem_init_numa() < 0)
		bootmem_init_nonnuma();
 
+	numa_alloc_kernel_text();
+
	/* Dump memblock with node info. */
	memblock_dump_all();
@@ -1922,6 +1998,9 @@ void __init paging_init(void)
		memblock_add(pavail[i].phys_addr, pavail[i].reg_size);
	}
 
+#ifdef CONFIG_NUMA
+	kern_size = round_up(kern_size, PAGE4MB_SIZE);
+#endif
	memblock_reserve(kern_base, kern_size);
 
	find_ramdisk(phys_base);
@@ -2188,6 +2267,17 @@ void free_initmem(void)
	 * The init section is aligned to 8k in vmlinux.lds. Page align for >8k pagesizes.
	 */
	addr = PAGE_ALIGN((unsigned long)(__init_begin));
+
+#ifdef CONFIG_NUMA
+	if (num_node_masks > 1) {
+		/* Do not free the 4KB pages which lie in a 4MB page
+		 * together with normal kernel text. Their addresses
+		 * are forbidden forever.
+		 */
+		addr = round_up(addr, PAGE4MB_SIZE);
+	}
+#endif
+
	initend = (unsigned long)(__init_end) & PAGE_MASK;
	for (; addr < initend; addr += PAGE_SIZE) {
		unsigned long page;
diff --git a/arch/sparc/mm/init_64.h b/arch/sparc/mm/init_64.h
index 5d3782de..a14c8d8 100644
--- a/arch/sparc/mm/init_64.h
+++ b/arch/sparc/mm/init_64.h
@@ -34,7 +34,7 @@ extern struct linux_prom_translation prom_trans[512];
 extern unsigned int prom_trans_ents;
 
 /* Exported for SMP bootup purposes. */
-extern unsigned long kern_locked_tte_data;
+extern unsigned long kern_locked_tte_data[MAX_NUMNODES];
 
 extern void prom_world(int enter);
-- 

