Tony, Please don't apply this yet. I just noticed the CONFIG_FLATMEM configs did not work. I will look at this more this evening.
Sorry for the confusion, Robin On Mon, Feb 11, 2008 at 12:09:02PM -0600, Robin Holt wrote: > > This attached patch significantly shrinks boot memory allocation on ia64. > It does this by not allocating per_cpu areas for cpus that can never > exist. > > In the case where acpi does not have any numa node description of the > cpus, I defaulted to assigning the first 32 round-robin on the known > nodes. For the !CONFIG_ACPI I used for_each_possible_cpu(). > > > Signed-off-by: Robin Holt <[EMAIL PROTECTED]> > > --- > > I tested all the different config options. allyesconfig fails with > or without this patch so that was the one exception. Otherwise, > allnoconfig, allmodconfig, defconfig, and configs/* all compiled. > Additionally, I booted the sn2- and defconfig both on altix and the > defconfig on a zx2000 with 2 cpus. I would like it if somebody with > access to a simulator could build and boot this. That is a different > code path which I have no means of checking. > > Version 5: > > I went too quickly. Shortly after I sent the last email, I got a reply > from HP saying 16 was their largest non-numa box. I will therefore go > back to the 32 Tony and I discussed last Friday. > > Version 4: > > Changed the reservation of additional per_cpu space to round-robin on > the known nodes. > > Cleaned up a couple other loops to use for_each_possible_early_cpu(). > > Changed the default number of cpus to 256 and also changed the lower > threshold to only apply when no early boot cpus are found. This change > was prompted by a note from HP that they support 256 cpus. They did > mention this is on a NUMA box, but I have not currently received a reply > as to whether the cpu locations are described in the ACPI tables. > > Version 3: > > I reworked this patch to use a cpumask to track the cpus we have seen. > It still initializes the .nid to NUMA_NO_NODE (-1). The introduction of > a bitmask makes the scans much cleaner. 
> > This patch could be using the cpu_possible_map instead of our own. > I was reluctant to do that, but there is nothing that prevents it. > Does anybody have an opinion? > > > Version 2 fixed a port bug. It also introduces NUMA_NO_NODE for ia64. > This is a direct copy from x86. > > One comment I have received is the hard-coded 4 described above should > probably be 8 or 16 to handle larger non-NUMA machines. I originally > set it to 4 because my recollection was that, at most, you could have > four processors per FSB, but maybe that is just an SGI limitation. > > How should this be set? Should I be using a PAL call? processor model? > Limit by current FSB spec and adjust as new processors come along? > > > Using a patched SuSE SLES10 kernel with both the mca patch that Jack/Russ > submitted a couple days ago and the attached. > > On a 2 cpu, 6GB system, NR_CPUS=4096: > Before the patch: > Memory: 5687728k/6234784k available (5777k code, 579632k reserved, 10450k > data, > 672k init) > After both patches: > Memory: 6211984k/6235040k available (5552k code, 55376k reserved, 10418k > data, 656k init) > 90% savings on reserved. > > On a 1 cpu, 1GB system, NR_CPUS=4096 before 572,464K, after 37,456k for > a 93% savings. > > > Index: per_cpu_v4/arch/ia64/kernel/setup.c > =================================================================== > --- per_cpu_v4.orig/arch/ia64/kernel/setup.c 2008-02-11 06:22:41.586019474 > -0600 > +++ per_cpu_v4/arch/ia64/kernel/setup.c 2008-02-11 12:05:29.030432470 > -0600 > @@ -45,6 +45,7 @@ > #include <linux/cpufreq.h> > #include <linux/kexec.h> > #include <linux/crash_dump.h> > +#include <linux/numa.h> > > #include <asm/ia32.h> > #include <asm/machvec.h> > @@ -494,9 +495,12 @@ setup_arch (char **cmdline_p) > # ifdef CONFIG_ACPI_NUMA > acpi_numa_init(); > # endif > + per_cpu_scan_finalize((cpus_weight(early_cpu_possible_map) == 0 ? 
> + 32 : cpus_weight(early_cpu_possible_map)), additional_cpus); > #else > # ifdef CONFIG_SMP > smp_build_cpu_map(); /* happens, e.g., with the Ski simulator */ > + per_cpu_scan_finalize(num_possible_cpus(), additional_cpus); > # endif > #endif /* CONFIG_APCI_BOOT */ > > Index: per_cpu_v4/arch/ia64/mm/discontig.c > =================================================================== > --- per_cpu_v4.orig/arch/ia64/mm/discontig.c 2008-02-11 06:22:41.610022488 > -0600 > +++ per_cpu_v4/arch/ia64/mm/discontig.c 2008-02-11 06:24:46.513705386 > -0600 > @@ -104,7 +104,7 @@ static int __meminit early_nr_cpus_node( > { > int cpu, n = 0; > > - for (cpu = 0; cpu < NR_CPUS; cpu++) > + for_each_possible_early_cpu(cpu) > if (node == node_cpuid[cpu].nid) > n++; > > @@ -142,7 +142,7 @@ static void *per_cpu_node_setup(void *cp > #ifdef CONFIG_SMP > int cpu; > > - for (cpu = 0; cpu < NR_CPUS; cpu++) { > + for_each_possible_early_cpu(cpu) { > if (node == node_cpuid[cpu].nid) { > memcpy(__va(cpu_data), __phys_per_cpu_start, > __per_cpu_end - __per_cpu_start); > @@ -345,7 +345,7 @@ static void __init initialize_pernode_da > > #ifdef CONFIG_SMP > /* Set the node_data pointer for each per-cpu struct */ > - for (cpu = 0; cpu < NR_CPUS; cpu++) { > + for_each_possible_early_cpu(cpu) { > node = node_cpuid[cpu].nid; > per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data; > } > @@ -493,13 +493,9 @@ void __cpuinit *per_cpu_init(void) > int cpu; > static int first_time = 1; > > - > - if (smp_processor_id() != 0) > - return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; > - > if (first_time) { > first_time = 0; > - for (cpu = 0; cpu < NR_CPUS; cpu++) > + for_each_possible_early_cpu(cpu) > per_cpu(local_per_cpu_offset, cpu) = > __per_cpu_offset[cpu]; > } > > Index: per_cpu_v4/arch/ia64/kernel/acpi.c > =================================================================== > --- per_cpu_v4.orig/arch/ia64/kernel/acpi.c 2008-02-11 06:22:41.538013446 > -0600 > +++ 
per_cpu_v4/arch/ia64/kernel/acpi.c 2008-02-11 09:10:49.016485958 > -0600 > @@ -482,6 +482,7 @@ acpi_numa_processor_affinity_init(struct > (pa->apic_id << 8) | (pa->local_sapic_eid); > /* nid should be overridden as logical node id later */ > node_cpuid[srat_num_cpus].nid = pxm; > + cpu_set(srat_num_cpus, early_cpu_possible_map); > srat_num_cpus++; > } > > @@ -559,7 +560,7 @@ void __init acpi_numa_arch_fixup(void) > } > > /* set logical node id in cpu structure */ > - for (i = 0; i < srat_num_cpus; i++) > + for_each_possible_early_cpu(i) > node_cpuid[i].nid = pxm_to_node(node_cpuid[i].nid); > > printk(KERN_INFO "Number of logical nodes in system = %d\n", > Index: per_cpu_v4/arch/ia64/kernel/numa.c > =================================================================== > --- per_cpu_v4.orig/arch/ia64/kernel/numa.c 2008-02-11 06:22:41.578018469 > -0600 > +++ per_cpu_v4/arch/ia64/kernel/numa.c 2008-02-11 06:24:46.549709906 > -0600 > @@ -73,7 +73,7 @@ void __init build_cpu_to_node_map(void) > for(node=0; node < MAX_NUMNODES; node++) > cpus_clear(node_to_cpu_mask[node]); > > - for(cpu = 0; cpu < NR_CPUS; ++cpu) { > + for_each_possible_early_cpu(cpu) { > node = -1; > for (i = 0; i < NR_CPUS; ++i) > if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) { > Index: per_cpu_v4/include/asm-ia64/acpi.h > =================================================================== > --- per_cpu_v4.orig/include/asm-ia64/acpi.h 2008-02-11 06:22:51.167222639 > -0600 > +++ per_cpu_v4/include/asm-ia64/acpi.h 2008-02-11 06:24:46.569712417 > -0600 > @@ -115,7 +115,11 @@ extern unsigned int is_cpu_cpei_target(u > extern void set_cpei_target_cpu(unsigned int cpu); > extern unsigned int get_cpei_target_cpu(void); > extern void prefill_possible_map(void); > +#ifdef CONFIG_ACPI_HOTPLUG_CPU > extern int additional_cpus; > +#else > +#define additional_cpus 0 > +#endif > > #ifdef CONFIG_ACPI_NUMA > #if MAX_NUMNODES > 256 > Index: per_cpu_v4/include/asm-ia64/numa.h > 
=================================================================== > --- per_cpu_v4.orig/include/asm-ia64/numa.h 2008-02-11 06:22:51.183224648 > -0600 > +++ per_cpu_v4/include/asm-ia64/numa.h 2008-02-11 11:39:05.266138236 > -0600 > @@ -22,6 +22,8 @@ > > #include <asm/mmzone.h> > > +#define NUMA_NO_NODE -1 > + > extern u16 cpu_to_node_map[NR_CPUS] __cacheline_aligned; > extern cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; > extern pg_data_t *pgdat_list[MAX_NUMNODES]; > @@ -68,6 +70,31 @@ extern int paddr_to_nid(unsigned long pa > extern void map_cpu_to_node(int cpu, int nid); > extern void unmap_cpu_from_node(int cpu, int nid); > > +extern cpumask_t early_cpu_possible_map; > +#define for_each_possible_early_cpu(cpu) \ > + for_each_cpu_mask((cpu), early_cpu_possible_map) > + > +static inline void per_cpu_scan_finalize(int min_cpus, int reserve_cpus) > +{ > + int low_cpu, high_cpu; > + int cpu; > + int next_nid = 0; > + > + low_cpu = cpus_weight(early_cpu_possible_map); > + > + high_cpu = max(low_cpu, min_cpus); > + high_cpu = min(high_cpu + reserve_cpus, NR_CPUS); > + > + for (cpu = low_cpu; cpu <= high_cpu; cpu++) { > + cpu_set(cpu, early_cpu_possible_map); > + if (node_cpuid[cpu].nid == NUMA_NO_NODE) { > + node_cpuid[cpu].nid = next_nid; > + next_nid++; > + if (next_nid >= num_online_nodes()) > + next_nid = 0; > + } > + } > +} > > #else /* !CONFIG_NUMA */ > #define map_cpu_to_node(cpu, nid) do{}while(0) > @@ -75,6 +102,7 @@ extern void unmap_cpu_from_node(int cpu, > > #define paddr_to_nid(addr) 0 > > +static inline void per_cpu_scan_finalize(int min_cpus, int reserve_cpus) { } > #endif /* CONFIG_NUMA */ > > #endif /* _ASM_IA64_NUMA_H */ > Index: per_cpu_v4/arch/ia64/mm/numa.c > =================================================================== > --- per_cpu_v4.orig/arch/ia64/mm/numa.c 2008-02-11 06:22:41.610022488 > -0600 > +++ per_cpu_v4/arch/ia64/mm/numa.c 2008-02-11 06:24:46.629719951 -0600 > @@ -27,7 +27,10 @@ > */ > int num_node_memblks; > 
struct node_memblk_s node_memblk[NR_NODE_MEMBLKS]; > -struct node_cpuid_s node_cpuid[NR_CPUS]; > +struct node_cpuid_s node_cpuid[NR_CPUS] = > + { [0 ... NR_CPUS-1] = { .phys_id = 0, .nid = NUMA_NO_NODE } }; > +cpumask_t early_cpu_possible_map = CPU_MASK_NONE; > + > /* > * This is a matrix with "distances" between nodes, they should be > * proportional to the memory access latency ratios. > - > To unsubscribe from this list: send the line "unsubscribe linux-ia64" in > the body of a message to [EMAIL PROTECTED] > More majordomo info at http://vger.kernel.org/majordomo-info.html - To unsubscribe from this list: send the line "unsubscribe linux-ia64" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html