[Patch] Allocate sparse vmemmap block above 4G
Resend the patch for more people to review On some single node x64 system with huge amount of physical memory e.g > 64G. the memmap size maybe very big. If the memmap is allocated from low pages, it may occupies too much memory below 4G. then swiotlb could fail to reserve bounce buffer under 4G which will lead to boot failure. This patch will first try to allocate memmap memory above 4G in sparse vmemmap code. If it failed, it will allocate memmap above MAX_DMA_ADDRESS. This patch is against 2.6.24-rc1-git14 Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> Signed-off-by: Suresh Siddha <[EMAIL PROTECTED]> diff -Nraup a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c --- a/arch/x86/mm/init_64.c 2007-11-06 15:16:12.0 +0800 +++ b/arch/x86/mm/init_64.c 2007-11-06 15:55:50.0 +0800 @@ -448,6 +448,13 @@ void online_page(struct page *page) num_physpages++; } +void * __meminit alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size, +unsigned long align) +{ +return __alloc_bootmem_core(pgdat->bdata, size, +align, (4UL*1024*1024*1024), 0, 1); +} + #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory is added always to NORMAL zone. 
This means you will never get diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h --- a/include/linux/bootmem.h 2007-11-06 16:06:31.0 +0800 +++ b/include/linux/bootmem.h 2007-11-06 15:50:36.0 +0800 @@ -61,6 +61,10 @@ extern void *__alloc_bootmem_core(struct unsigned long limit, int strict_goal); +extern void *alloc_bootmem_high_node(pg_data_t *pgdat, +unsigned long size, +unsigned long align); + #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE extern void reserve_bootmem(unsigned long addr, unsigned long size); #define alloc_bootmem(x) \ diff -Nraup a/mm/bootmem.c b/mm/bootmem.c --- a/mm/bootmem.c 2007-11-06 16:06:31.0 +0800 +++ b/mm/bootmem.c 2007-11-06 15:49:20.0 +0800 @@ -492,3 +492,11 @@ void * __init __alloc_bootmem_low_node(p return __alloc_bootmem_core(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0); } + +__attribute__((weak)) __meminit +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size, +unsigned long align) +{ +return NULL; +} + diff -Nraup a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c --- a/mm/sparse-vmemmap.c 2007-11-06 15:16:12.0 +0800 +++ b/mm/sparse-vmemmap.c 2007-11-06 16:08:52.0 +0800 @@ -43,9 +43,13 @@ void * __meminit vmemmap_alloc_block(uns if (page) return page_address(page); return NULL; - } else + } else { + void *p = alloc_bootmem_high_node(NODE_DATA(node), size, size); + if (p) + return p; return __alloc_bootmem_node(NODE_DATA(node), size, size, __pa(MAX_DMA_ADDRESS)); + } } void __meminit vmemmap_verify(pte_t *pte, int node, - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] Add strict_goal parameter to __alloc_bootmem_core
Resend the patch for more people to review. If __alloc_bootmem_core was given a goal, it will first try to allocate memory above that goal. If failed, it will try from the low pages. Sometimes we don't want this behavior, we want the goal to be strict. This patch introduce a strict_goal parameter to __alloc_bootmem_core, If strict_goal is set, __alloc_bootmem_core will return NULL to indicate it can't allocate memory above that goal. Note we do not scan from last_success if strict_goal is set, it will scan from the beginning of the goal instead We skip this optimization to keep the code simple because strict_goal is not supposed to be used in hot path. Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> Signed-off-by: Suresh Siddha <[EMAIL PROTECTED]> diff -Nraup a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c --- a/arch/x86/mm/numa_64.c 2007-10-24 11:50:57.0 +0800 +++ b/arch/x86/mm/numa_64.c 2007-11-07 13:06:50.0 +0800 @@ -247,7 +247,7 @@ void __init setup_node_zones(int nodeid) __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, memmapsize, SMP_CACHE_BYTES, round_down(limit - memmapsize, PAGE_SIZE), - limit); + limit, 1); #endif } diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h --- a/include/linux/bootmem.h 2007-11-07 13:06:35.0 +0800 +++ b/include/linux/bootmem.h 2007-11-07 13:06:04.0 +0800 @@ -58,7 +58,8 @@ extern void *__alloc_bootmem_core(struct unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit); + unsigned long limit, + int strict_goal); #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE extern void reserve_bootmem(unsigned long addr, unsigned long size); diff -Nraup a/mm/bootmem.c b/mm/bootmem.c --- a/mm/bootmem.c 2007-11-07 13:06:35.0 +0800 +++ b/mm/bootmem.c 2007-11-07 13:06:18.0 +0800 @@ -179,7 +179,7 @@ static void __init free_bootmem_core(boo */ void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, - unsigned long align, unsigned long goal, unsigned long limit) + unsigned long align, unsigned long 
goal, unsigned long limit, int strict_goal) { unsigned long offset, remaining_size, areasize, preferred; unsigned long i, start = 0, incr, eidx, end_pfn; @@ -212,15 +212,20 @@ __alloc_bootmem_core(struct bootmem_data /* * We try to allocate bootmem pages above 'goal' * first, then we try to allocate lower pages. -*/ - if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) { - preferred = goal - bdata->node_boot_start; +* if the goal is not strict. + */ + + preferred = 0; + if (goal) { + if (goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) { + preferred = goal - bdata->node_boot_start; if (bdata->last_success >= preferred) - if (!limit || (limit && limit > bdata->last_success)) + if (!strict_goal && (!limit || (limit && limit > bdata->last_success))) preferred = bdata->last_success; - } else - preferred = 0; + } else if (strict_goal) +return NULL; + } preferred = PFN_DOWN(ALIGN(preferred, align)) + offset; areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; @@ -247,7 +252,7 @@ restart_scan: i = ALIGN(j, incr); } - if (preferred > offset) { + if (preferred > offset && !strict_goal) { preferred = offset; goto restart_scan; } @@ -421,7 +426,7 @@ void * __init __alloc_bootmem_nopanic(un void *ptr; list_for_each_entry(bdata, _list, list) { - ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); + ptr = __alloc_bootmem_core(bdata, size, align, goal, 0, 0); if (ptr) return ptr; } @@ -449,7 +454,7 @@ void * __init __alloc_bootmem_node(pg_da { void *ptr; - ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0, 0); if (ptr) return ptr; @@ -468,7 +473,7 @@ void * __init __alloc_bootmem_low(unsign list_for_each_entry(bdata, _list, list) { ptr = __alloc_bootmem_core(bdata, size, align, goal, -
[Patch] Allocate sparse vmemmap block above 4G
Resend the patch for more people to review On some single node x64 system with huge amount of physical memory e.g > 64G. the memmap size maybe very big. If the memmap is allocated from low pages, it may occupies too much memory below 4G. then swiotlb could fail to reserve bounce buffer under 4G which will lead to boot failure. This patch will first try to allocate memmap memory above 4G in sparse vmemmap code. If it failed, it will allocate memmap above MAX_DMA_ADDRESS. This patch is against 2.6.24-rc1-git14 Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> Signed-off-by: Suresh Siddha <[EMAIL PROTECTED]> diff -Nraup a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c --- a/arch/x86/mm/init_64.c 2007-11-06 15:16:12.0 +0800 +++ b/arch/x86/mm/init_64.c 2007-11-06 15:55:50.0 +0800 @@ -448,6 +448,13 @@ void online_page(struct page *page) num_physpages++; } +void * __meminit alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size, +unsigned long align) +{ +return __alloc_bootmem_core(pgdat->bdata, size, +align, (4UL*1024*1024*1024), 0, 1); +} + #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory is added always to NORMAL zone. 
This means you will never get diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h --- a/include/linux/bootmem.h 2007-11-06 16:06:31.0 +0800 +++ b/include/linux/bootmem.h 2007-11-06 15:50:36.0 +0800 @@ -61,6 +61,10 @@ extern void *__alloc_bootmem_core(struct unsigned long limit, int strict_goal); +extern void *alloc_bootmem_high_node(pg_data_t *pgdat, +unsigned long size, +unsigned long align); + #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE extern void reserve_bootmem(unsigned long addr, unsigned long size); #define alloc_bootmem(x) \ diff -Nraup a/mm/bootmem.c b/mm/bootmem.c --- a/mm/bootmem.c 2007-11-06 16:06:31.0 +0800 +++ b/mm/bootmem.c 2007-11-06 15:49:20.0 +0800 @@ -492,3 +492,11 @@ void * __init __alloc_bootmem_low_node(p return __alloc_bootmem_core(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0); } + +__attribute__((weak)) __meminit +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size, +unsigned long align) +{ +return NULL; +} + diff -Nraup a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c --- a/mm/sparse-vmemmap.c 2007-11-06 15:16:12.0 +0800 +++ b/mm/sparse-vmemmap.c 2007-11-06 16:08:52.0 +0800 @@ -43,9 +43,13 @@ void * __meminit vmemmap_alloc_block(uns if (page) return page_address(page); return NULL; - } else + } else { + void *p = alloc_bootmem_high_node(NODE_DATA(node), size, size); + if (p) + return p; return __alloc_bootmem_node(NODE_DATA(node), size, size, __pa(MAX_DMA_ADDRESS)); + } } void __meminit vmemmap_verify(pte_t *pte, int node, - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] Add strict_goal parameter to __alloc_bootmem_core
Resend the patch for more people to review. If __alloc_bootmem_core was given a goal, it will first try to allocate memory above that goal. If failed, it will try from the low pages. Sometimes we don't want this behavior, we want the goal to be strict. This patch introduce a strict_goal parameter to __alloc_bootmem_core, If strict_goal is set, __alloc_bootmem_core will return NULL to indicate it can't allocate memory above that goal. Note we do not scan from last_success if strict_goal is set, it will scan from the beginning of the goal instead We skip this optimization to keep the code simple because strict_goal is not supposed to be used in hot path. Signed-off-by: Zou Nan hai [EMAIL PROTECTED] Signed-off-by: Suresh Siddha [EMAIL PROTECTED] diff -Nraup a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c --- a/arch/x86/mm/numa_64.c 2007-10-24 11:50:57.0 +0800 +++ b/arch/x86/mm/numa_64.c 2007-11-07 13:06:50.0 +0800 @@ -247,7 +247,7 @@ void __init setup_node_zones(int nodeid) __alloc_bootmem_core(NODE_DATA(nodeid)-bdata, memmapsize, SMP_CACHE_BYTES, round_down(limit - memmapsize, PAGE_SIZE), - limit); + limit, 1); #endif } diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h --- a/include/linux/bootmem.h 2007-11-07 13:06:35.0 +0800 +++ b/include/linux/bootmem.h 2007-11-07 13:06:04.0 +0800 @@ -58,7 +58,8 @@ extern void *__alloc_bootmem_core(struct unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit); + unsigned long limit, + int strict_goal); #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE extern void reserve_bootmem(unsigned long addr, unsigned long size); diff -Nraup a/mm/bootmem.c b/mm/bootmem.c --- a/mm/bootmem.c 2007-11-07 13:06:35.0 +0800 +++ b/mm/bootmem.c 2007-11-07 13:06:18.0 +0800 @@ -179,7 +179,7 @@ static void __init free_bootmem_core(boo */ void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, - unsigned long align, unsigned long goal, unsigned long limit) + unsigned long align, unsigned long goal, 
unsigned long limit, int strict_goal) { unsigned long offset, remaining_size, areasize, preferred; unsigned long i, start = 0, incr, eidx, end_pfn; @@ -212,15 +212,20 @@ __alloc_bootmem_core(struct bootmem_data /* * We try to allocate bootmem pages above 'goal' * first, then we try to allocate lower pages. -*/ - if (goal goal = bdata-node_boot_start PFN_DOWN(goal) end_pfn) { - preferred = goal - bdata-node_boot_start; +* if the goal is not strict. + */ + + preferred = 0; + if (goal) { + if (goal = bdata-node_boot_start PFN_DOWN(goal) end_pfn) { + preferred = goal - bdata-node_boot_start; if (bdata-last_success = preferred) - if (!limit || (limit limit bdata-last_success)) + if (!strict_goal (!limit || (limit limit bdata-last_success))) preferred = bdata-last_success; - } else - preferred = 0; + } else if (strict_goal) +return NULL; + } preferred = PFN_DOWN(ALIGN(preferred, align)) + offset; areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; @@ -247,7 +252,7 @@ restart_scan: i = ALIGN(j, incr); } - if (preferred offset) { + if (preferred offset !strict_goal) { preferred = offset; goto restart_scan; } @@ -421,7 +426,7 @@ void * __init __alloc_bootmem_nopanic(un void *ptr; list_for_each_entry(bdata, bdata_list, list) { - ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); + ptr = __alloc_bootmem_core(bdata, size, align, goal, 0, 0); if (ptr) return ptr; } @@ -449,7 +454,7 @@ void * __init __alloc_bootmem_node(pg_da { void *ptr; - ptr = __alloc_bootmem_core(pgdat-bdata, size, align, goal, 0); + ptr = __alloc_bootmem_core(pgdat-bdata, size, align, goal, 0, 0); if (ptr) return ptr; @@ -468,7 +473,7 @@ void * __init __alloc_bootmem_low(unsign list_for_each_entry(bdata, bdata_list, list) { ptr = __alloc_bootmem_core(bdata, size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); + ARCH_LOW_ADDRESS_LIMIT, 0); if (ptr) return
[Patch] Add strict_goal parameter to __alloc_bootmem_core
If __alloc_bootmem_core was given a goal, it will first try to allocate memory above that goal. If failed, it will try from the low pages. Sometimes we don't want this behavior, we want the goal to be strict. This patch introduce a strict_goal parameter to __alloc_bootmem_core, If strict_goal is set, __alloc_bootmem_core will return NULL to indicate it can't allocate memory above that goal. Note we do not scan from last_success if strict_goal is set, it will scan from the beginning of the goal instead We skip this optimization to keep the code simple because strict_goal is not supposed to be used in hot path. Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> Signed-off-by: Suresh Siddha <[EMAIL PROTECTED]> diff -Nraup a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c --- a/arch/x86/mm/numa_64.c 2007-10-24 11:50:57.0 +0800 +++ b/arch/x86/mm/numa_64.c 2007-11-07 13:06:50.0 +0800 @@ -247,7 +247,7 @@ void __init setup_node_zones(int nodeid) __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, memmapsize, SMP_CACHE_BYTES, round_down(limit - memmapsize, PAGE_SIZE), - limit); + limit, 1); #endif } diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h --- a/include/linux/bootmem.h 2007-11-07 13:06:35.0 +0800 +++ b/include/linux/bootmem.h 2007-11-07 13:06:04.0 +0800 @@ -58,7 +58,8 @@ extern void *__alloc_bootmem_core(struct unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit); + unsigned long limit, + int strict_goal); #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE extern void reserve_bootmem(unsigned long addr, unsigned long size); diff -Nraup a/mm/bootmem.c b/mm/bootmem.c --- a/mm/bootmem.c 2007-11-07 13:06:35.0 +0800 +++ b/mm/bootmem.c 2007-11-07 13:06:18.0 +0800 @@ -179,7 +179,7 @@ static void __init free_bootmem_core(boo */ void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, - unsigned long align, unsigned long goal, unsigned long limit) + unsigned long align, unsigned long goal, unsigned long limit, int strict_goal) 
{ unsigned long offset, remaining_size, areasize, preferred; unsigned long i, start = 0, incr, eidx, end_pfn; @@ -212,15 +212,20 @@ __alloc_bootmem_core(struct bootmem_data /* * We try to allocate bootmem pages above 'goal' * first, then we try to allocate lower pages. -*/ - if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) { - preferred = goal - bdata->node_boot_start; +* if the goal is not strict. + */ + + preferred = 0; + if (goal) { + if (goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) { + preferred = goal - bdata->node_boot_start; if (bdata->last_success >= preferred) - if (!limit || (limit && limit > bdata->last_success)) + if (!strict_goal && (!limit || (limit && limit > bdata->last_success))) preferred = bdata->last_success; - } else - preferred = 0; + } else if (strict_goal) +return NULL; + } preferred = PFN_DOWN(ALIGN(preferred, align)) + offset; areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; @@ -247,7 +252,7 @@ restart_scan: i = ALIGN(j, incr); } - if (preferred > offset) { + if (preferred > offset && !strict_goal) { preferred = offset; goto restart_scan; } @@ -421,7 +426,7 @@ void * __init __alloc_bootmem_nopanic(un void *ptr; list_for_each_entry(bdata, _list, list) { - ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); + ptr = __alloc_bootmem_core(bdata, size, align, goal, 0, 0); if (ptr) return ptr; } @@ -449,7 +454,7 @@ void * __init __alloc_bootmem_node(pg_da { void *ptr; - ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0, 0); if (ptr) return ptr; @@ -468,7 +473,7 @@ void * __init __alloc_bootmem_low(unsign list_for_each_entry(bdata, _list, list) { ptr = __alloc_bootmem_core(bdata, size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); +
[Patch] Allocate sparse vmemmap block above 4G
Try to allocate sparse vmemmap block above 4G on x64 system. On some single node x64 system with huge amount of physical memory e.g > 64G. the memmap size maybe very big. If the memmap is allocated from low pages, it may occupies too much memory below 4G. then swiotlb could fail to reserve bounce buffer under 4G which will lead to boot failure. This patch will first try to allocate memmap memory above 4G in sparse vmemmap code. If it failed, it will allocate memmap above MAX_DMA_ADDRESS. This patch is against 2.6.24-rc1-git14 Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> Signed-off-by: Suresh Siddha <[EMAIL PROTECTED]> diff -Nraup a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c --- a/arch/x86/mm/init_64.c 2007-11-06 15:16:12.0 +0800 +++ b/arch/x86/mm/init_64.c 2007-11-06 15:55:50.0 +0800 @@ -448,6 +448,13 @@ void online_page(struct page *page) num_physpages++; } +void * __meminit alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size, +unsigned long align) +{ +return __alloc_bootmem_core(pgdat->bdata, size, +align, (4UL*1024*1024*1024), 0, 1); +} + #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory is added always to NORMAL zone. 
This means you will never get diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h --- a/include/linux/bootmem.h 2007-11-06 16:06:31.0 +0800 +++ b/include/linux/bootmem.h 2007-11-06 15:50:36.0 +0800 @@ -61,6 +61,10 @@ extern void *__alloc_bootmem_core(struct unsigned long limit, int strict_goal); +extern void *alloc_bootmem_high_node(pg_data_t *pgdat, +unsigned long size, +unsigned long align); + #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE extern void reserve_bootmem(unsigned long addr, unsigned long size); #define alloc_bootmem(x) \ diff -Nraup a/mm/bootmem.c b/mm/bootmem.c --- a/mm/bootmem.c 2007-11-06 16:06:31.0 +0800 +++ b/mm/bootmem.c 2007-11-06 15:49:20.0 +0800 @@ -492,3 +492,11 @@ void * __init __alloc_bootmem_low_node(p return __alloc_bootmem_core(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0); } + +__attribute__((weak)) __meminit +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size, +unsigned long align) +{ +return NULL; +} + diff -Nraup a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c --- a/mm/sparse-vmemmap.c 2007-11-06 15:16:12.0 +0800 +++ b/mm/sparse-vmemmap.c 2007-11-06 16:08:52.0 +0800 @@ -43,9 +43,13 @@ void * __meminit vmemmap_alloc_block(uns if (page) return page_address(page); return NULL; - } else + } else { + void *p = alloc_bootmem_high_node(NODE_DATA(node), size, size); + if (p) + return p; return __alloc_bootmem_node(NODE_DATA(node), size, size, __pa(MAX_DMA_ADDRESS)); + } } void __meminit vmemmap_verify(pte_t *pte, int node, - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch2/2] fix wrong proc cpuinfo on x64
in 2.6.24-rc1 kernel, The /proc/cpuinfo display is wrong. Another issue is that it will display bogus cpus with wrong information if the kernel is compiled with a big CONFIG_NR_CPU. That is because before a cpu in cpu_present_map is up, c->cpu_index of that cpu is 0. thus the cpu_online(c->cpu_index) check in show_cpuinfo is invalid. This patch will let cpuinfo_op use cpu_online_map instead of cpu_present_map to iterate cpus. Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> --- linux-2.6.24-rc1/arch/x86/kernel/setup_64.c 2007-10-29 22:03:05.0 -0400 +++ b/arch/x86/kernel/setup_64.c2007-11-05 23:09:06.0 -0500 @@ -1078,8 +1078,6 @@ static int show_cpuinfo(struct seq_file #ifdef CONFIG_SMP - if (!cpu_online(c->cpu_index)) - return 0; cpu = c->cpu_index; #endif @@ -1171,15 +1169,15 @@ static int show_cpuinfo(struct seq_file static void *c_start(struct seq_file *m, loff_t *pos) { if (*pos == 0) /* just in case, cpu 0 is not the first */ - *pos = first_cpu(cpu_possible_map); - if ((*pos) < NR_CPUS && cpu_possible(*pos)) + *pos = first_cpu(cpu_online_map); + if ((*pos) < NR_CPUS && cpu_online(*pos)) return _data(*pos); return NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { - *pos = next_cpu(*pos, cpu_possible_map); + *pos = next_cpu(*pos, cpu_online_map); return c_start(m, pos); } - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch2/2] fix wrong proc cpuinfo on x64
in 2.6.24-rc1 kernel, The /proc/cpuinfo display is wrong. Another issue is that it will display bogus cpus with wrong information if the kernel is compiled with a big CONFIG_NR_CPU. That is because before a cpu in cpu_present_map is up, c->cpu_index of that cpu is 0. thus the cpu_online(c->cpu_index) check in show_cpuinfo is invalid. This patch will let cpuinfo_op use cpu_online_map instead of cpu_present_map to iterate cpus. Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> --- linux-2.6.24-rc1/arch/x86/kernel/setup_64.c 2007-10-29 22:03:05.0 -0400 +++ b/arch/x86/kernel/setup_64.c2007-11-05 23:09:06.0 -0500 @@ -1078,8 +1078,6 @@ static int show_cpuinfo(struct seq_file #ifdef CONFIG_SMP - if (!cpu_online(c->cpu_index)) - return 0; cpu = c->cpu_index; #endif @@ -1171,15 +1169,15 @@ static int show_cpuinfo(struct seq_file static void *c_start(struct seq_file *m, loff_t *pos) { if (*pos == 0) /* just in case, cpu 0 is not the first */ - *pos = first_cpu(cpu_possible_map); - if ((*pos) < NR_CPUS && cpu_possible(*pos)) + *pos = first_cpu(cpu_online_map); + if ((*pos) < NR_CPUS && cpu_online(*pos)) return &cpu_data(*pos); return NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { - *pos = next_cpu(*pos, cpu_possible_map); + *pos = next_cpu(*pos, cpu_online_map); return c_start(m, pos); } - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] Add strict_goal parameter to __alloc_bootmem_core
If __alloc_bootmem_core was given a goal, it will first try to allocate memory above that goal. If failed, it will try from the low pages. Sometimes we don't want this behavior, we want the goal to be strict. This patch introduce a strict_goal parameter to __alloc_bootmem_core, If strict_goal is set, __alloc_bootmem_core will return NULL to indicate it can't allocate memory above that goal. Note we do not scan from last_success if strict_goal is set, it will scan from the beginning of the goal instead We skip this optimization to keep the code simple because strict_goal is not supposed to be used in hot path. Signed-off-by: Zou Nan hai [EMAIL PROTECTED] Signed-off-by: Suresh Siddha [EMAIL PROTECTED] diff -Nraup a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c --- a/arch/x86/mm/numa_64.c 2007-10-24 11:50:57.0 +0800 +++ b/arch/x86/mm/numa_64.c 2007-11-07 13:06:50.0 +0800 @@ -247,7 +247,7 @@ void __init setup_node_zones(int nodeid) __alloc_bootmem_core(NODE_DATA(nodeid)-bdata, memmapsize, SMP_CACHE_BYTES, round_down(limit - memmapsize, PAGE_SIZE), - limit); + limit, 1); #endif } diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h --- a/include/linux/bootmem.h 2007-11-07 13:06:35.0 +0800 +++ b/include/linux/bootmem.h 2007-11-07 13:06:04.0 +0800 @@ -58,7 +58,8 @@ extern void *__alloc_bootmem_core(struct unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit); + unsigned long limit, + int strict_goal); #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE extern void reserve_bootmem(unsigned long addr, unsigned long size); diff -Nraup a/mm/bootmem.c b/mm/bootmem.c --- a/mm/bootmem.c 2007-11-07 13:06:35.0 +0800 +++ b/mm/bootmem.c 2007-11-07 13:06:18.0 +0800 @@ -179,7 +179,7 @@ static void __init free_bootmem_core(boo */ void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, - unsigned long align, unsigned long goal, unsigned long limit) + unsigned long align, unsigned long goal, unsigned long limit, int strict_goal) { 
unsigned long offset, remaining_size, areasize, preferred; unsigned long i, start = 0, incr, eidx, end_pfn; @@ -212,15 +212,20 @@ __alloc_bootmem_core(struct bootmem_data /* * We try to allocate bootmem pages above 'goal' * first, then we try to allocate lower pages. -*/ - if (goal goal = bdata-node_boot_start PFN_DOWN(goal) end_pfn) { - preferred = goal - bdata-node_boot_start; +* if the goal is not strict. + */ + + preferred = 0; + if (goal) { + if (goal = bdata-node_boot_start PFN_DOWN(goal) end_pfn) { + preferred = goal - bdata-node_boot_start; if (bdata-last_success = preferred) - if (!limit || (limit limit bdata-last_success)) + if (!strict_goal (!limit || (limit limit bdata-last_success))) preferred = bdata-last_success; - } else - preferred = 0; + } else if (strict_goal) +return NULL; + } preferred = PFN_DOWN(ALIGN(preferred, align)) + offset; areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; @@ -247,7 +252,7 @@ restart_scan: i = ALIGN(j, incr); } - if (preferred offset) { + if (preferred offset !strict_goal) { preferred = offset; goto restart_scan; } @@ -421,7 +426,7 @@ void * __init __alloc_bootmem_nopanic(un void *ptr; list_for_each_entry(bdata, bdata_list, list) { - ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); + ptr = __alloc_bootmem_core(bdata, size, align, goal, 0, 0); if (ptr) return ptr; } @@ -449,7 +454,7 @@ void * __init __alloc_bootmem_node(pg_da { void *ptr; - ptr = __alloc_bootmem_core(pgdat-bdata, size, align, goal, 0); + ptr = __alloc_bootmem_core(pgdat-bdata, size, align, goal, 0, 0); if (ptr) return ptr; @@ -468,7 +473,7 @@ void * __init __alloc_bootmem_low(unsign list_for_each_entry(bdata, bdata_list, list) { ptr = __alloc_bootmem_core(bdata, size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); + ARCH_LOW_ADDRESS_LIMIT, 0); if (ptr) return ptr; } @@ -485,5 +490,5 @@ void
[Patch] Allocate sparse vmemmap block above 4G
Try to allocate sparse vmemmap block above 4G on x64 system. On some single node x64 system with huge amount of physical memory e.g 64G. the memmap size maybe very big. If the memmap is allocated from low pages, it may occupies too much memory below 4G. then swiotlb could fail to reserve bounce buffer under 4G which will lead to boot failure. This patch will first try to allocate memmap memory above 4G in sparse vmemmap code. If it failed, it will allocate memmap above MAX_DMA_ADDRESS. This patch is against 2.6.24-rc1-git14 Signed-off-by: Zou Nan hai [EMAIL PROTECTED] Signed-off-by: Suresh Siddha [EMAIL PROTECTED] diff -Nraup a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c --- a/arch/x86/mm/init_64.c 2007-11-06 15:16:12.0 +0800 +++ b/arch/x86/mm/init_64.c 2007-11-06 15:55:50.0 +0800 @@ -448,6 +448,13 @@ void online_page(struct page *page) num_physpages++; } +void * __meminit alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size, +unsigned long align) +{ +return __alloc_bootmem_core(pgdat-bdata, size, +align, (4UL*1024*1024*1024), 0, 1); +} + #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory is added always to NORMAL zone. 
This means you will never get diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h --- a/include/linux/bootmem.h 2007-11-06 16:06:31.0 +0800 +++ b/include/linux/bootmem.h 2007-11-06 15:50:36.0 +0800 @@ -61,6 +61,10 @@ extern void *__alloc_bootmem_core(struct unsigned long limit, int strict_goal); +extern void *alloc_bootmem_high_node(pg_data_t *pgdat, +unsigned long size, +unsigned long align); + #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE extern void reserve_bootmem(unsigned long addr, unsigned long size); #define alloc_bootmem(x) \ diff -Nraup a/mm/bootmem.c b/mm/bootmem.c --- a/mm/bootmem.c 2007-11-06 16:06:31.0 +0800 +++ b/mm/bootmem.c 2007-11-06 15:49:20.0 +0800 @@ -492,3 +492,11 @@ void * __init __alloc_bootmem_low_node(p return __alloc_bootmem_core(pgdat-bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0); } + +__attribute__((weak)) __meminit +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size, +unsigned long align) +{ +return NULL; +} + diff -Nraup a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c --- a/mm/sparse-vmemmap.c 2007-11-06 15:16:12.0 +0800 +++ b/mm/sparse-vmemmap.c 2007-11-06 16:08:52.0 +0800 @@ -43,9 +43,13 @@ void * __meminit vmemmap_alloc_block(uns if (page) return page_address(page); return NULL; - } else + } else { + void *p = alloc_bootmem_high_node(NODE_DATA(node), size, size); + if (p) + return p; return __alloc_bootmem_node(NODE_DATA(node), size, size, __pa(MAX_DMA_ADDRESS)); + } } void __meminit vmemmap_verify(pte_t *pte, int node, - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch1/2] fix wrong proc cpuinfo on x64
in 2.6.24-rc1 kernel, The /proc/cpuinfo display is wrong. One issue is every processor id appears to be 0. That is because smp_store_cpu_info will set cpuinfo_x86->cpu_index to cpu id then call identify_cpu identify_cpu will call early_identify_cpu which set c->cpu_index back to 0. This patch set cpu_index after identify_cpu to fix the issue. Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> --- linux-2.6.24-rc1/arch/x86/kernel/smpboot_64.c 2007-10-29 22:03:05.0 -0400 +++ b/arch/x86/kernel/smpboot_64.c 2007-11-05 22:12:57.0 -0500 @@ -141,8 +141,8 @@ static void __cpuinit smp_store_cpu_info struct cpuinfo_x86 *c = _data(id); *c = boot_cpu_data; - c->cpu_index = id; identify_cpu(c); + c->cpu_index = id; print_cpu_info(c); } - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch1/2] fix wrong proc cpuinfo on x64
in 2.6.24-rc1 kernel, The /proc/cpuinfo display is wrong. One issue is every processor id appears to be 0. That is because smp_store_cpu_info will set cpuinfo_x86->cpu_index to cpu id then call identify_cpu identify_cpu will call early_identify_cpu which set c->cpu_index back to 0. This patch set cpu_index after identify_cpu to fix the issue. Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> --- linux-2.6.24-rc1/arch/x86/kernel/smpboot_64.c 2007-10-29 22:03:05.0 -0400 +++ b/arch/x86/kernel/smpboot_64.c 2007-11-05 22:12:57.0 -0500 @@ -141,8 +141,8 @@ static void __cpuinit smp_store_cpu_info struct cpuinfo_x86 *c = &cpu_data(id); *c = boot_cpu_data; - c->cpu_index = id; identify_cpu(c); + c->cpu_index = id; print_cpu_info(c); } - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.6.23 boot failures on x86-64.
On Wed, 2007-10-31 at 14:04, Zou Nan hai wrote: > On Tue, 2007-10-30 at 05:21, Martin Ebourne wrote: > > On Mon, 2007-10-29 at 15:43 -0400, Dave Jones wrote: > > > On Mon, Oct 29, 2007 at 08:03:09PM +0100, Andi Kleen wrote: > > > > > > But if allocating bootmem >4G doesn't work on these systems > > > > > > most likely they have more problems anyways. It might be better > > > > > > to find out what goes wrong exactly. > > > > > Any ideas on what to instrument ? > > > > > > > > See what address the bootmem_alloc_high returns; check if it overlaps > > > > with something etc. > > > > > > > > Fill the memory on the system and see if it can access all of its > > > memory. > > > > > > Martin, as you have one of the affected systems, do you feel up to this? > > > > Faking a node at -1fff > > Bootmem setup node 0 -1fff > > sparse_early_mem_map_alloc: returned address 8170b000 > > > > My box has 512MB of RAM. > > > > Cheers, > > > > Martin. > > Oops, sorry, > seem to be a mistake of me. > I forget to exclude the DMA range. > > Does the following patch fix the issue? > > Thanks > Zou Nan hai > > --- a/arch/x86/mm/init_64.c 2007-10-31 11:24:11.0 +0800 > +++ b/arch/x86/mm/init_64.c 2007-10-31 12:31:02.0 +0800 > @@ -731,7 +731,7 @@ int in_gate_area_no_task(unsigned long a > void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) > { > return __alloc_bootmem_core(pgdat->bdata, size, > - SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); > + SMP_CACHE_BYTES, (4UL*1024*1024*1024), > __pa(MAX_DMA_ADDRESS)); > } > > const char *arch_vma_name(struct vm_area_struct *vma) > > > > Please ignore the patch, the patch is wrong. However I think the root cause is when __alloc_bootmem_core fail to allocate a memory above 4G it will fall back to allocate from the lowest page. Then happens to be allocated in DMA region sometimes... Since this code path is dead, I am OK to revert the patch. Suresh and I will check the CONFIG_SPARSE_VMEMMAP path. 
Thanks Zou Nan hai - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.6.23 boot failures on x86-64.
On Tue, 2007-10-30 at 05:21, Martin Ebourne wrote: > On Mon, 2007-10-29 at 15:43 -0400, Dave Jones wrote: > > On Mon, Oct 29, 2007 at 08:03:09PM +0100, Andi Kleen wrote: > > > > > But if allocating bootmem >4G doesn't work on these systems > > > > > most likely they have more problems anyways. It might be better > > > > > to find out what goes wrong exactly. > > > > Any ideas on what to instrument ? > > > > > > See what address the bootmem_alloc_high returns; check if it overlaps > > > with something etc. > > > > > > Fill the memory on the system and see if it can access all of its memory. > > > > Martin, as you have one of the affected systems, do you feel up to this? > > Faking a node at -1fff > Bootmem setup node 0 -1fff > sparse_early_mem_map_alloc: returned address 8170b000 > > My box has 512MB of RAM. > > Cheers, > > Martin. Oops, sorry, seem to be a mistake of me. I forget to exclude the DMA range. Does the following patch fix the issue? Thanks Zou Nan hai --- a/arch/x86/mm/init_64.c 2007-10-31 11:24:11.0 +0800 +++ b/arch/x86/mm/init_64.c 2007-10-31 12:31:02.0 +0800 @@ -731,7 +731,7 @@ int in_gate_area_no_task(unsigned long a void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) { return __alloc_bootmem_core(pgdat->bdata, size, - SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); + SMP_CACHE_BYTES, (4UL*1024*1024*1024), __pa(MAX_DMA_ADDRESS)); } const char *arch_vma_name(struct vm_area_struct *vma) - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.6.23 boot failures on x86-64.
On Tue, 2007-10-30 at 05:21, Martin Ebourne wrote: On Mon, 2007-10-29 at 15:43 -0400, Dave Jones wrote: On Mon, Oct 29, 2007 at 08:03:09PM +0100, Andi Kleen wrote: But if allocating bootmem 4G doesn't work on these systems most likely they have more problems anyways. It might be better to find out what goes wrong exactly. Any ideas on what to instrument ? See what address the bootmem_alloc_high returns; check if it overlaps with something etc. Fill the memory on the system and see if it can access all of its memory. Martin, as you have one of the affected systems, do you feel up to this? Faking a node at -1fff Bootmem setup node 0 -1fff sparse_early_mem_map_alloc: returned address 8170b000 My box has 512MB of RAM. Cheers, Martin. Oops, sorry, seem to be a mistake of me. I forget to exclude the DMA range. Does the following patch fix the issue? Thanks Zou Nan hai --- a/arch/x86/mm/init_64.c 2007-10-31 11:24:11.0 +0800 +++ b/arch/x86/mm/init_64.c 2007-10-31 12:31:02.0 +0800 @@ -731,7 +731,7 @@ int in_gate_area_no_task(unsigned long a void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) { return __alloc_bootmem_core(pgdat-bdata, size, - SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); + SMP_CACHE_BYTES, (4UL*1024*1024*1024), __pa(MAX_DMA_ADDRESS)); } const char *arch_vma_name(struct vm_area_struct *vma) - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.6.23 boot failures on x86-64.
On Wed, 2007-10-31 at 14:04, Zou Nan hai wrote: On Tue, 2007-10-30 at 05:21, Martin Ebourne wrote: On Mon, 2007-10-29 at 15:43 -0400, Dave Jones wrote: On Mon, Oct 29, 2007 at 08:03:09PM +0100, Andi Kleen wrote: But if allocating bootmem 4G doesn't work on these systems most likely they have more problems anyways. It might be better to find out what goes wrong exactly. Any ideas on what to instrument ? See what address the bootmem_alloc_high returns; check if it overlaps with something etc. Fill the memory on the system and see if it can access all of its memory. Martin, as you have one of the affected systems, do you feel up to this? Faking a node at -1fff Bootmem setup node 0 -1fff sparse_early_mem_map_alloc: returned address 8170b000 My box has 512MB of RAM. Cheers, Martin. Oops, sorry, seem to be a mistake of me. I forget to exclude the DMA range. Does the following patch fix the issue? Thanks Zou Nan hai --- a/arch/x86/mm/init_64.c 2007-10-31 11:24:11.0 +0800 +++ b/arch/x86/mm/init_64.c 2007-10-31 12:31:02.0 +0800 @@ -731,7 +731,7 @@ int in_gate_area_no_task(unsigned long a void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) { return __alloc_bootmem_core(pgdat-bdata, size, - SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); + SMP_CACHE_BYTES, (4UL*1024*1024*1024), __pa(MAX_DMA_ADDRESS)); } const char *arch_vma_name(struct vm_area_struct *vma) Please ignore the patch, the patch is wrong. However I think the root cause is when __alloc_bootmem_core fail to allocate a memory above 4G it will fall back to allocate from the lowest page. Then happens to be allocated in DMA region sometimes... Since this code path is dead, I am OK to revert the patch. Suresh and I will check the CONFIG_SPARSE_VMEMMAP path. Thanks Zou Nan hai - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] some proc entries are missing in sched_domain sysctl debug code.
cache_nice_tries and flags entry do not appear in proc fs sched_domain directory, because ctl_table entry is skipped. This patch fix the issue. Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> --- linux-2.6.23-rc6/kernel/sched.c 2007-09-18 23:47:07.0 -0400 +++ b/kernel/sched.c2007-09-18 23:47:20.0 -0400 @@ -5304,7 +5304,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table = sd_alloc_ctl_entry(12); set_table_entry([0], "min_interval", >min_interval, sizeof(long), 0644, proc_doulongvec_minmax); @@ -5324,10 +5324,10 @@ sd_alloc_ctl_domain_table(struct sched_d sizeof(int), 0644, proc_dointvec_minmax); set_table_entry([8], "imbalance_pct", >imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry([10], "cache_nice_tries", + set_table_entry([9], "cache_nice_tries", >cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry([12], "flags", >flags, + set_table_entry([10], "flags", >flags, sizeof(int), 0644, proc_dointvec_minmax); return table; - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] some proc entries are missing in sched_domain sysctl debug code.
cache_nice_tries and flags entry do not appear in proc fs sched_domain directory, because ctl_table entry is skipped. This patch fix the issue. Signed-off-by: Zou Nan hai [EMAIL PROTECTED] --- linux-2.6.23-rc6/kernel/sched.c 2007-09-18 23:47:07.0 -0400 +++ b/kernel/sched.c2007-09-18 23:47:20.0 -0400 @@ -5304,7 +5304,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table = sd_alloc_ctl_entry(12); set_table_entry(table[0], min_interval, sd-min_interval, sizeof(long), 0644, proc_doulongvec_minmax); @@ -5324,10 +5324,10 @@ sd_alloc_ctl_domain_table(struct sched_d sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(table[8], imbalance_pct, sd-imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(table[10], cache_nice_tries, + set_table_entry(table[9], cache_nice_tries, sd-cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(table[12], flags, sd-flags, + set_table_entry(table[10], flags, sd-flags, sizeof(int), 0644, proc_dointvec_minmax); return table; - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Patch] Allocate sparsemem memmap above 4G on X86_64
On Fri, 2007-05-18 at 03:32, Andrew Morton wrote: > On 17 May 2007 10:40:07 +0800 > Zou Nan hai <[EMAIL PROTECTED]> wrote: > > > > Please always prefer to use static inline functions rather than macros. > They are more readable, they are more likely to have comments attached to > them and they provide typechecking. > > Please prefer to uninline functions by default. One reason for this is > that adding inlines to headers increases include complexity. This code is > all __init anyway, so the possible few bytes of text will get removed. > > > Try to avoid using the ARCH_HAS_FOO thing. We have two alternatives: > > a) use __attribute__((weak)) > > b) do: > > extern void foo(void); > #define foo foo > >then, elsewhere, > > #ifndef foo > #define foo() bar() > #endif > > Both tricks avoid the introduction of two new symbols into the global > namespace to solve a single problem. On systems with huge amount of physical memory, VFS cache and memory memmap may eat all available system memory under 4G, then the system may fail to allocate swiotlb bounce buffer. There was a fix for this issue in arch/x86_64/mm/numa.c, but that fix dose not cover sparsemem model. This patch add fix to sparsemem model by first try to allocate memmap above 4G. 
Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> Acked-by: Suresh Siddha <[EMAIL PROTECTED]> --- arch/x86_64/mm/init.c |6 ++ mm/sparse.c | 11 +++ 2 files changed, 17 insertions(+) diff -Nraup a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c --- a/arch/x86_64/mm/init.c 2007-05-19 16:54:46.0 +0800 +++ b/arch/x86_64/mm/init.c 2007-05-19 17:43:47.0 +0800 @@ -761,3 +761,9 @@ int in_gate_area_no_task(unsigned long a { return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); } + +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) +{ + return __alloc_bootmem_core(pgdat->bdata, size, + SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); +} diff -Nraup a/mm/sparse.c b/mm/sparse.c --- a/mm/sparse.c 2007-05-19 16:54:48.0 +0800 +++ b/mm/sparse.c 2007-05-19 17:44:01.0 +0800 @@ -209,6 +209,12 @@ static int __meminit sparse_init_one_sec return 1; } +__attribute__((weak)) +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) +{ + return NULL; +} + static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; @@ -219,6 +225,11 @@ static struct page __init *sparse_early_ if (map) return map; + map = alloc_bootmem_high_node(NODE_DATA(nid), + sizeof(struct page) * PAGES_PER_SECTION); + if (map) + return map; + map = alloc_bootmem_node(NODE_DATA(nid), sizeof(struct page) * PAGES_PER_SECTION); if (map) - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Patch] Allocate sparsemem memmap above 4G on X86_64
On Fri, 2007-05-18 at 03:32, Andrew Morton wrote: On 17 May 2007 10:40:07 +0800 Zou Nan hai [EMAIL PROTECTED] wrote: Please always prefer to use static inline functions rather than macros. They are more readable, they are more likely to have comments attached to them and they provide typechecking. Please prefer to uninline functions by default. One reason for this is that adding inlines to headers increases include complexity. This code is all __init anyway, so the possible few bytes of text will get removed. Try to avoid using the ARCH_HAS_FOO thing. We have two alternatives: a) use __attribute__((weak)) b) do: extern void foo(void); #define foo foo then, elsewhere, #ifndef foo #define foo() bar() #endif Both tricks avoid the introduction of two new symbols into the global namespace to solve a single problem. On systems with huge amount of physical memory, VFS cache and memory memmap may eat all available system memory under 4G, then the system may fail to allocate swiotlb bounce buffer. There was a fix for this issue in arch/x86_64/mm/numa.c, but that fix dose not cover sparsemem model. This patch add fix to sparsemem model by first try to allocate memmap above 4G. 
Signed-off-by: Zou Nan hai [EMAIL PROTECTED] Acked-by: Suresh Siddha [EMAIL PROTECTED] --- arch/x86_64/mm/init.c |6 ++ mm/sparse.c | 11 +++ 2 files changed, 17 insertions(+) diff -Nraup a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c --- a/arch/x86_64/mm/init.c 2007-05-19 16:54:46.0 +0800 +++ b/arch/x86_64/mm/init.c 2007-05-19 17:43:47.0 +0800 @@ -761,3 +761,9 @@ int in_gate_area_no_task(unsigned long a { return (addr = VSYSCALL_START) (addr VSYSCALL_END); } + +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) +{ + return __alloc_bootmem_core(pgdat-bdata, size, + SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); +} diff -Nraup a/mm/sparse.c b/mm/sparse.c --- a/mm/sparse.c 2007-05-19 16:54:48.0 +0800 +++ b/mm/sparse.c 2007-05-19 17:44:01.0 +0800 @@ -209,6 +209,12 @@ static int __meminit sparse_init_one_sec return 1; } +__attribute__((weak)) +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) +{ + return NULL; +} + static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; @@ -219,6 +225,11 @@ static struct page __init *sparse_early_ if (map) return map; + map = alloc_bootmem_high_node(NODE_DATA(nid), + sizeof(struct page) * PAGES_PER_SECTION); + if (map) + return map; + map = alloc_bootmem_node(NODE_DATA(nid), sizeof(struct page) * PAGES_PER_SECTION); if (map) - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] Allocate sparsemem memmap above 4G on X86_64
On system with huge amount of physical memory. VFS cache and memory memmap may eat all available system memory under 4G, then system may fail to allocated swiotlb bounce buffer. There was a fix in arch/x86_64/mm/numa.c, but that fix does not cover sparsemem model. This patch add fix to sparsemem model. Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> Acked-by: Siddha, Suresh <[EMAIL PROTECTED]> --- include/asm-x86_64/mmzone.h |5 + include/linux/bootmem.h |3 +++ mm/sparse.c |5 + 3 files changed, 13 insertions(+) diff -Nraup a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h --- a/include/asm-x86_64/mmzone.h 2007-05-17 09:38:02.0 +0800 +++ b/include/asm-x86_64/mmzone.h 2007-05-17 09:54:10.0 +0800 @@ -52,5 +52,10 @@ extern int pfn_valid(unsigned long pfn); #define FAKE_NODE_MIN_HASH_MASK(~(FAKE_NODE_MIN_SIZE - 1uL)) #endif +#define ARCH_HAS_ALLOC_BOOTMEM_HIGH_NODE 1 +#define alloc_bootmem_high_node(pgdat,size) \ +({__alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);}) + + #endif #endif diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h --- a/include/linux/bootmem.h 2007-05-17 09:38:02.0 +0800 +++ b/include/linux/bootmem.h 2007-05-17 09:37:00.0 +0800 @@ -131,5 +131,8 @@ extern void *alloc_large_system_hash(con #endif extern int hashdist; /* Distribute hashes across NUMA nodes? 
*/ +#ifndef ARCH_HAS_ALLOC_BOOTMEM_HIGH_NODE +#define alloc_bootmem_high_node(pgdat, size) ({NULL;}) +#endif #endif /* _LINUX_BOOTMEM_H */ diff -Nraup a/mm/sparse.c b/mm/sparse.c --- a/mm/sparse.c 2007-05-17 09:38:03.0 +0800 +++ b/mm/sparse.c 2007-05-17 09:54:27.0 +0800 @@ -219,6 +219,11 @@ static struct page __init *sparse_early_ if (map) return map; + map = alloc_bootmem_high_node(NODE_DATA(nid), + sizeof(struct page) * PAGES_PER_SECTION); +if (map) +return map; + map = alloc_bootmem_node(NODE_DATA(nid), sizeof(struct page) * PAGES_PER_SECTION); if (map) - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] Allocate sparsemem memmap above 4G on X86_64
On system with huge amount of physical memory. VFS cache and memory memmap may eat all available system memory under 4G, then system may fail to allocated swiotlb bounce buffer. There was a fix in arch/x86_64/mm/numa.c, but that fix does not cover sparsemem model. This patch add fix to sparsemem model. Signed-off-by: Zou Nan hai [EMAIL PROTECTED] Acked-by: Siddha, Suresh [EMAIL PROTECTED] --- include/asm-x86_64/mmzone.h |5 + include/linux/bootmem.h |3 +++ mm/sparse.c |5 + 3 files changed, 13 insertions(+) diff -Nraup a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h --- a/include/asm-x86_64/mmzone.h 2007-05-17 09:38:02.0 +0800 +++ b/include/asm-x86_64/mmzone.h 2007-05-17 09:54:10.0 +0800 @@ -52,5 +52,10 @@ extern int pfn_valid(unsigned long pfn); #define FAKE_NODE_MIN_HASH_MASK(~(FAKE_NODE_MIN_SIZE - 1uL)) #endif +#define ARCH_HAS_ALLOC_BOOTMEM_HIGH_NODE 1 +#define alloc_bootmem_high_node(pgdat,size) \ +({__alloc_bootmem_core(pgdat-bdata, size, SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);}) + + #endif #endif diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h --- a/include/linux/bootmem.h 2007-05-17 09:38:02.0 +0800 +++ b/include/linux/bootmem.h 2007-05-17 09:37:00.0 +0800 @@ -131,5 +131,8 @@ extern void *alloc_large_system_hash(con #endif extern int hashdist; /* Distribute hashes across NUMA nodes? 
*/ +#ifndef ARCH_HAS_ALLOC_BOOTMEM_HIGH_NODE +#define alloc_bootmem_high_node(pgdat, size) ({NULL;}) +#endif #endif /* _LINUX_BOOTMEM_H */ diff -Nraup a/mm/sparse.c b/mm/sparse.c --- a/mm/sparse.c 2007-05-17 09:38:03.0 +0800 +++ b/mm/sparse.c 2007-05-17 09:54:27.0 +0800 @@ -219,6 +219,11 @@ static struct page __init *sparse_early_ if (map) return map; + map = alloc_bootmem_high_node(NODE_DATA(nid), + sizeof(struct page) * PAGES_PER_SECTION); +if (map) +return map; + map = alloc_bootmem_node(NODE_DATA(nid), sizeof(struct page) * PAGES_PER_SECTION); if (map) - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] fix an error in /proc/slabinfo print
There is an obvious error in the header of /proc/slabinfo Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> --- linux-2.6.11-rc3/mm/slab.c 2005-02-03 13:29:33.0 +0800 +++ linux-2.6.11-rc3-fix/mm/slab.c 2005-02-03 13:32:42.318821400 +0800 @@ -2860,7 +2860,7 @@ static void *s_start(struct seq_file *m, seq_puts(m, "slabinfo - version: 2.1\n"); #endif seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); - seq_puts(m, " : tunables <batchcount> <limit> <sharedfactor>"); + seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); #if STATS seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch] fix an error in /proc/slabinfo print
There is an obvious error in the header of /proc/slabinfo Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> --- linux-2.6.11-rc3/mm/slab.c 2005-02-03 13:29:33.0 +0800 +++ linux-2.6.11-rc3-fix/mm/slab.c 2005-02-03 13:32:42.318821400 +0800 @@ -2860,7 +2860,7 @@ static void *s_start(struct seq_file *m, seq_puts(m, "slabinfo - version: 2.1\n"); #endif seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); - seq_puts(m, " : tunables <batchcount> <limit> <sharedfactor>"); + seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); #if STATS seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
possible performance issue in 4-level page tables
There is a performance regression in the lmbench lat_proc fork result on ia64. In 2.6.10 I got Process fork+exit: 164.8438 microseconds. In 2.6.11-rc2 Process fork+exit: 183.8621 microseconds. I believe this regression was caused by the 4-level page tables change, since most of the kernel time spent in lat_proc fork is copy_page_range in the fork path and clear_page_range in the exit path. Now they are one level deeper. Though pud and pgd are the same on IA64, there is still some overhead introduced, I think. Are any other architectures seeing the same sort of results? Zou Nan hai - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
possible performance issue in 4-level page tables
There is a performance regression in the lmbench lat_proc fork result on ia64. In 2.6.10 I got Process fork+exit: 164.8438 microseconds. In 2.6.11-rc2 Process fork+exit: 183.8621 microseconds. I believe this regression was caused by the 4-level page tables change, since most of the kernel time spent in lat_proc fork is copy_page_range in the fork path and clear_page_range in the exit path. Now they are one level deeper. Though pud and pgd are the same on IA64, there is still some overhead introduced, I think. Are any other architectures seeing the same sort of results? Zou Nan hai - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch]Fix an error in copy_page_range
Hi, There is a bug in copy_page_range in current 2.6.11-rc1 with 4 level page table change. copy_page_range do a continue without adding pgds and addr when pgd_none(*src_pgd) or pgd_bad(*src_pgd). I think it's wrong in logic, copy_page_range will run into infinite loop when when pgd_none(*src_pgd) or pgd_bad(*src_pgd). Although maybe this bug does not break anything currently..., Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]> --- a/mm/memory.c 2005-01-21 01:21:18.0 +0800 +++ b/mm/memory.c 2005-01-21 04:49:13.0 +0800 @@ -442,17 +442,18 @@ int copy_page_range(struct mm_struct *ds if (next > end || next <= addr) next = end; if (pgd_none(*src_pgd)) - continue; + goto next_pgd; if (pgd_bad(*src_pgd)) { pgd_ERROR(*src_pgd); pgd_clear(src_pgd); - continue; + goto next_pgd; } err = copy_pud_range(dst, src, dst_pgd, src_pgd, vma, addr, next); if (err) break; +next_pgd: src_pgd++; dst_pgd++; addr = next; - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[Patch]Fix an error in copy_page_range
Hi, There is a bug in copy_page_range in current 2.6.11-rc1 with 4 level page table change. copy_page_range do a continue without adding pgds and addr when pgd_none(*src_pgd) or pgd_bad(*src_pgd). I think it's wrong in logic, copy_page_range will run into infinite loop when when pgd_none(*src_pgd) or pgd_bad(*src_pgd). Although maybe this bug does not break anything currently..., Signed-off-by: Zou Nan hai [EMAIL PROTECTED] --- a/mm/memory.c 2005-01-21 01:21:18.0 +0800 +++ b/mm/memory.c 2005-01-21 04:49:13.0 +0800 @@ -442,17 +442,18 @@ int copy_page_range(struct mm_struct *ds if (next end || next = addr) next = end; if (pgd_none(*src_pgd)) - continue; + goto next_pgd; if (pgd_bad(*src_pgd)) { pgd_ERROR(*src_pgd); pgd_clear(src_pgd); - continue; + goto next_pgd; } err = copy_pud_range(dst, src, dst_pgd, src_pgd, vma, addr, next); if (err) break; +next_pgd: src_pgd++; dst_pgd++; addr = next; - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/