[Patch] Allocate sparse vmemmap block above 4G

2007-11-07 Thread Zou Nan hai
Resend the patch for more people to review

On some single node x64 system with a huge amount of physical memory, e.g. > 64G,
the memmap size may be very big.

If the memmap is allocated from low pages, it may occupy too much
memory below 4G.
Then swiotlb could fail to reserve its bounce buffer under 4G, which will
lead to boot failure.

This patch will first try to allocate memmap memory above 4G in sparse
vmemmap code. 
If it failed, it will allocate memmap above MAX_DMA_ADDRESS. 
This patch is against 2.6.24-rc1-git14

Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]>
Signed-off-by: Suresh Siddha <[EMAIL PROTECTED]>

diff -Nraup a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c 2007-11-06 15:16:12.0 +0800
+++ b/arch/x86/mm/init_64.c 2007-11-06 15:55:50.0 +0800
@@ -448,6 +448,13 @@ void online_page(struct page *page)
num_physpages++;
 }
 
+void * __meminit alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
+unsigned long align)
+{
+return __alloc_bootmem_core(pgdat->bdata, size,
+align, (4UL*1024*1024*1024), 0, 1);
+}
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Memory is added always to NORMAL zone. This means you will never get
diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
--- a/include/linux/bootmem.h   2007-11-06 16:06:31.0 +0800
+++ b/include/linux/bootmem.h   2007-11-06 15:50:36.0 +0800
@@ -61,6 +61,10 @@ extern void *__alloc_bootmem_core(struct
  unsigned long limit,
  int strict_goal);
 
+extern void *alloc_bootmem_high_node(pg_data_t *pgdat,
+unsigned long size,
+unsigned long align);
+
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem(unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
diff -Nraup a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c  2007-11-06 16:06:31.0 +0800
+++ b/mm/bootmem.c  2007-11-06 15:49:20.0 +0800
@@ -492,3 +492,11 @@ void * __init __alloc_bootmem_low_node(p
return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
ARCH_LOW_ADDRESS_LIMIT, 0);
 }
+
+__attribute__((weak)) __meminit
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
+unsigned long align)
+{
+return NULL;
+}
+
diff -Nraup a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
--- a/mm/sparse-vmemmap.c   2007-11-06 15:16:12.0 +0800
+++ b/mm/sparse-vmemmap.c   2007-11-06 16:08:52.0 +0800
@@ -43,9 +43,13 @@ void * __meminit vmemmap_alloc_block(uns
if (page)
return page_address(page);
return NULL;
-   } else
+   } else {
+   void *p = alloc_bootmem_high_node(NODE_DATA(node), size, size);
+   if (p)
+   return p;
return __alloc_bootmem_node(NODE_DATA(node), size, size,
__pa(MAX_DMA_ADDRESS));
+   }
 }
 
 void __meminit vmemmap_verify(pte_t *pte, int node,




-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch]Add strict_goal parameter to __alloc_bootmem_core

2007-11-07 Thread Zou Nan hai
Resend the patch for more people to review.

If __alloc_bootmem_core was given a goal, it will first try to allocate
memory above that goal. If failed, it will try from the low pages.

Sometimes we don't want this behavior, we want the goal to be strict.

This patch introduce a strict_goal parameter to __alloc_bootmem_core, 

If strict_goal is set, __alloc_bootmem_core will return NULL to indicate
it can't allocate memory above that goal.

Note we do not scan from last_success if strict_goal is set, it will
scan from the beginning of the goal instead
We skip this optimization to keep the code simple because strict_goal is
not supposed to be used in hot path.

Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]>
Signed-off-by: Suresh Siddha <[EMAIL PROTECTED]>

diff -Nraup a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
--- a/arch/x86/mm/numa_64.c 2007-10-24 11:50:57.0 +0800
+++ b/arch/x86/mm/numa_64.c 2007-11-07 13:06:50.0 +0800
@@ -247,7 +247,7 @@ void __init setup_node_zones(int nodeid)
__alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
memmapsize, SMP_CACHE_BYTES, 
round_down(limit - memmapsize, PAGE_SIZE), 
-   limit);
+   limit, 1);
 #endif
 } 
 
diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
--- a/include/linux/bootmem.h   2007-11-07 13:06:35.0 +0800
+++ b/include/linux/bootmem.h   2007-11-07 13:06:04.0 +0800
@@ -58,7 +58,8 @@ extern void *__alloc_bootmem_core(struct
  unsigned long size,
  unsigned long align,
  unsigned long goal,
- unsigned long limit);
+ unsigned long limit,
+ int strict_goal);
 
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem(unsigned long addr, unsigned long size);
diff -Nraup a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c  2007-11-07 13:06:35.0 +0800
+++ b/mm/bootmem.c  2007-11-07 13:06:18.0 +0800
@@ -179,7 +179,7 @@ static void __init free_bootmem_core(boo
  */
 void * __init
 __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
- unsigned long align, unsigned long goal, unsigned long limit)
+ unsigned long align, unsigned long goal, unsigned long limit, int 
strict_goal)
 {
unsigned long offset, remaining_size, areasize, preferred;
unsigned long i, start = 0, incr, eidx, end_pfn;
@@ -212,15 +212,20 @@ __alloc_bootmem_core(struct bootmem_data
/*
 * We try to allocate bootmem pages above 'goal'
 * first, then we try to allocate lower pages.
-*/
-   if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) 
{
-   preferred = goal - bdata->node_boot_start;
+* if the goal is not strict.
+ */
+
+   preferred = 0;
+   if (goal) {
+   if (goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) 
{
+   preferred = goal - bdata->node_boot_start;
 
if (bdata->last_success >= preferred)
-   if (!limit || (limit && limit > bdata->last_success))
+   if (!strict_goal && (!limit || (limit && limit > 
bdata->last_success)))
preferred = bdata->last_success;
-   } else
-   preferred = 0;
+   } else if (strict_goal)
+return NULL;
+   }
 
preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
@@ -247,7 +252,7 @@ restart_scan:
i = ALIGN(j, incr);
}
 
-   if (preferred > offset) {
+   if (preferred > offset && !strict_goal) {
preferred = offset;
goto restart_scan;
}
@@ -421,7 +426,7 @@ void * __init __alloc_bootmem_nopanic(un
void *ptr;
 
list_for_each_entry(bdata, _list, list) {
-   ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
+   ptr = __alloc_bootmem_core(bdata, size, align, goal, 0, 0);
if (ptr)
return ptr;
}
@@ -449,7 +454,7 @@ void * __init __alloc_bootmem_node(pg_da
 {
void *ptr;
 
-   ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+   ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0, 0);
if (ptr)
return ptr;
 
@@ -468,7 +473,7 @@ void * __init __alloc_bootmem_low(unsign
 
list_for_each_entry(bdata, _list, list) {
ptr = __alloc_bootmem_core(bdata, size, align, goal,
-   

[Patch] Allocate sparse vmemmap block above 4G

2007-11-07 Thread Zou Nan hai
Resend the patch for more people to review

On some single node x64 system with a huge amount of physical memory, e.g. > 64G,
the memmap size may be very big. 

If the memmap is allocated from low pages, it may occupy too much
memory below 4G. 
then swiotlb could fail to reserve bounce buffer under 4G which will
lead to boot failure.

This patch will first try to allocate memmap memory above 4G in sparse
vmemmap code. 
If it failed, it will allocate memmap above MAX_DMA_ADDRESS. 
This patch is against 2.6.24-rc1-git14

Signed-off-by: Zou Nan hai [EMAIL PROTECTED]
Signed-off-by: Suresh Siddha [EMAIL PROTECTED]

diff -Nraup a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c 2007-11-06 15:16:12.0 +0800
+++ b/arch/x86/mm/init_64.c 2007-11-06 15:55:50.0 +0800
@@ -448,6 +448,13 @@ void online_page(struct page *page)
num_physpages++;
 }
 
+void * __meminit alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
+unsigned long align)
+{
+return __alloc_bootmem_core(pgdat->bdata, size,
+align, (4UL*1024*1024*1024), 0, 1);
+}
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Memory is added always to NORMAL zone. This means you will never get
diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
--- a/include/linux/bootmem.h   2007-11-06 16:06:31.0 +0800
+++ b/include/linux/bootmem.h   2007-11-06 15:50:36.0 +0800
@@ -61,6 +61,10 @@ extern void *__alloc_bootmem_core(struct
  unsigned long limit,
  int strict_goal);
 
+extern void *alloc_bootmem_high_node(pg_data_t *pgdat,
+unsigned long size,
+unsigned long align);
+
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem(unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
diff -Nraup a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c  2007-11-06 16:06:31.0 +0800
+++ b/mm/bootmem.c  2007-11-06 15:49:20.0 +0800
@@ -492,3 +492,11 @@ void * __init __alloc_bootmem_low_node(p
return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
ARCH_LOW_ADDRESS_LIMIT, 0);
 }
+
+__attribute__((weak)) __meminit
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
+unsigned long align)
+{
+return NULL;
+}
+
diff -Nraup a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
--- a/mm/sparse-vmemmap.c   2007-11-06 15:16:12.0 +0800
+++ b/mm/sparse-vmemmap.c   2007-11-06 16:08:52.0 +0800
@@ -43,9 +43,13 @@ void * __meminit vmemmap_alloc_block(uns
if (page)
return page_address(page);
return NULL;
-   } else
+   } else {
+   void *p = alloc_bootmem_high_node(NODE_DATA(node), size, size);
+   if (p)
+   return p;
return __alloc_bootmem_node(NODE_DATA(node), size, size,
__pa(MAX_DMA_ADDRESS));
+   }
 }
 
 void __meminit vmemmap_verify(pte_t *pte, int node,




-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch]Add strict_goal parameter to __alloc_bootmem_core

2007-11-07 Thread Zou Nan hai
Resend the patch for more people to review.

If __alloc_bootmem_core was given a goal, it will first try to allocate
memory above that goal. If failed, it will try from the low pages.

Sometimes we don't want this behavior, we want the goal to be strict.

This patch introduce a strict_goal parameter to __alloc_bootmem_core, 

If strict_goal is set, __alloc_bootmem_core will return NULL to indicate
it can't allocate memory above that goal.

Note we do not scan from last_success if strict_goal is set, it will
scan from the beginning of the goal instead
We skip this optimization to keep the code simple because strict_goal is
not supposed to be used in hot path.

Signed-off-by: Zou Nan hai [EMAIL PROTECTED]
Signed-off-by: Suresh Siddha [EMAIL PROTECTED]

diff -Nraup a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
--- a/arch/x86/mm/numa_64.c 2007-10-24 11:50:57.0 +0800
+++ b/arch/x86/mm/numa_64.c 2007-11-07 13:06:50.0 +0800
@@ -247,7 +247,7 @@ void __init setup_node_zones(int nodeid)
__alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
memmapsize, SMP_CACHE_BYTES, 
round_down(limit - memmapsize, PAGE_SIZE), 
-   limit);
+   limit, 1);
 #endif
 } 
 
diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
--- a/include/linux/bootmem.h   2007-11-07 13:06:35.0 +0800
+++ b/include/linux/bootmem.h   2007-11-07 13:06:04.0 +0800
@@ -58,7 +58,8 @@ extern void *__alloc_bootmem_core(struct
  unsigned long size,
  unsigned long align,
  unsigned long goal,
- unsigned long limit);
+ unsigned long limit,
+ int strict_goal);
 
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem(unsigned long addr, unsigned long size);
diff -Nraup a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c  2007-11-07 13:06:35.0 +0800
+++ b/mm/bootmem.c  2007-11-07 13:06:18.0 +0800
@@ -179,7 +179,7 @@ static void __init free_bootmem_core(boo
  */
 void * __init
 __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
- unsigned long align, unsigned long goal, unsigned long limit)
+ unsigned long align, unsigned long goal, unsigned long limit, int 
strict_goal)
 {
unsigned long offset, remaining_size, areasize, preferred;
unsigned long i, start = 0, incr, eidx, end_pfn;
@@ -212,15 +212,20 @@ __alloc_bootmem_core(struct bootmem_data
/*
 * We try to allocate bootmem pages above 'goal'
 * first, then we try to allocate lower pages.
-*/
-   if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) 
{
-   preferred = goal - bdata->node_boot_start;
+* if the goal is not strict.
+ */
+
+   preferred = 0;
+   if (goal) {
+   if (goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) 
{
+   preferred = goal - bdata->node_boot_start;
 
if (bdata->last_success >= preferred)
-   if (!limit || (limit && limit > bdata->last_success))
+   if (!strict_goal && (!limit || (limit && limit > 
bdata->last_success)))
preferred = bdata->last_success;
-   } else
-   preferred = 0;
+   } else if (strict_goal)
+return NULL;
+   }
+   }
 
preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
@@ -247,7 +252,7 @@ restart_scan:
i = ALIGN(j, incr);
}
 
-   if (preferred > offset) {
+   if (preferred > offset && !strict_goal) {
preferred = offset;
goto restart_scan;
}
@@ -421,7 +426,7 @@ void * __init __alloc_bootmem_nopanic(un
void *ptr;
 
list_for_each_entry(bdata, &bdata_list, list) {
-   ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
+   ptr = __alloc_bootmem_core(bdata, size, align, goal, 0, 0);
if (ptr)
return ptr;
}
@@ -449,7 +454,7 @@ void * __init __alloc_bootmem_node(pg_da
 {
void *ptr;
 
-   ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+   ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0, 0);
if (ptr)
return ptr;
 
@@ -468,7 +473,7 @@ void * __init __alloc_bootmem_low(unsign
 
list_for_each_entry(bdata, &bdata_list, list) {
ptr = __alloc_bootmem_core(bdata, size, align, goal,
-   ARCH_LOW_ADDRESS_LIMIT);
+   ARCH_LOW_ADDRESS_LIMIT, 0);
if (ptr)
return

[Patch]Add strict_goal parameter to __alloc_bootmem_core

2007-11-06 Thread Zou Nan hai
If __alloc_bootmem_core was given a goal, it will first try to allocate
memory above that goal. If failed, it will try from the low pages.

Sometimes we don't want this behavior, we want the goal to be strict.

This patch introduce a strict_goal parameter to __alloc_bootmem_core, 

If strict_goal is set, __alloc_bootmem_core will return NULL to indicate
it can't allocate memory above that goal.

Note we do not scan from last_success if strict_goal is set, it will
scan from the beginning of the goal instead
We skip this optimization to keep the code simple because strict_goal is
not supposed to be used in hot path.

Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]>
Signed-off-by: Suresh Siddha <[EMAIL PROTECTED]>

diff -Nraup a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
--- a/arch/x86/mm/numa_64.c 2007-10-24 11:50:57.0 +0800
+++ b/arch/x86/mm/numa_64.c 2007-11-07 13:06:50.0 +0800
@@ -247,7 +247,7 @@ void __init setup_node_zones(int nodeid)
__alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
memmapsize, SMP_CACHE_BYTES, 
round_down(limit - memmapsize, PAGE_SIZE), 
-   limit);
+   limit, 1);
 #endif
 } 
 
diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
--- a/include/linux/bootmem.h   2007-11-07 13:06:35.0 +0800
+++ b/include/linux/bootmem.h   2007-11-07 13:06:04.0 +0800
@@ -58,7 +58,8 @@ extern void *__alloc_bootmem_core(struct
  unsigned long size,
  unsigned long align,
  unsigned long goal,
- unsigned long limit);
+ unsigned long limit,
+ int strict_goal);
 
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem(unsigned long addr, unsigned long size);
diff -Nraup a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c  2007-11-07 13:06:35.0 +0800
+++ b/mm/bootmem.c  2007-11-07 13:06:18.0 +0800
@@ -179,7 +179,7 @@ static void __init free_bootmem_core(boo
  */
 void * __init
 __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
- unsigned long align, unsigned long goal, unsigned long limit)
+ unsigned long align, unsigned long goal, unsigned long limit, int 
strict_goal)
 {
unsigned long offset, remaining_size, areasize, preferred;
unsigned long i, start = 0, incr, eidx, end_pfn;
@@ -212,15 +212,20 @@ __alloc_bootmem_core(struct bootmem_data
/*
 * We try to allocate bootmem pages above 'goal'
 * first, then we try to allocate lower pages.
-*/
-   if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) 
{
-   preferred = goal - bdata->node_boot_start;
+* if the goal is not strict.
+ */
+
+   preferred = 0;
+   if (goal) {
+   if (goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) 
{
+   preferred = goal - bdata->node_boot_start;
 
if (bdata->last_success >= preferred)
-   if (!limit || (limit && limit > bdata->last_success))
+   if (!strict_goal && (!limit || (limit && limit > 
bdata->last_success)))
preferred = bdata->last_success;
-   } else
-   preferred = 0;
+   } else if (strict_goal)
+return NULL;
+   }
 
preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
@@ -247,7 +252,7 @@ restart_scan:
i = ALIGN(j, incr);
}
 
-   if (preferred > offset) {
+   if (preferred > offset && !strict_goal) {
preferred = offset;
goto restart_scan;
}
@@ -421,7 +426,7 @@ void * __init __alloc_bootmem_nopanic(un
void *ptr;
 
list_for_each_entry(bdata, _list, list) {
-   ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
+   ptr = __alloc_bootmem_core(bdata, size, align, goal, 0, 0);
if (ptr)
return ptr;
}
@@ -449,7 +454,7 @@ void * __init __alloc_bootmem_node(pg_da
 {
void *ptr;
 
-   ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+   ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0, 0);
if (ptr)
return ptr;
 
@@ -468,7 +473,7 @@ void * __init __alloc_bootmem_low(unsign
 
list_for_each_entry(bdata, _list, list) {
ptr = __alloc_bootmem_core(bdata, size, align, goal,
-   ARCH_LOW_ADDRESS_LIMIT);
+  

[Patch] Allocate sparse vmemmap block above 4G

2007-11-06 Thread Zou Nan hai
Try to allocate sparse vmemmap block above 4G on x64 system.

On some single node x64 system with a huge amount of physical memory, e.g. > 64G,
the memmap size may be very big. 

If the memmap is allocated from low pages, it may occupy too much
memory below 4G. 
then swiotlb could fail to reserve bounce buffer under 4G which will
lead to boot failure.

This patch will first try to allocate memmap memory above 4G in sparse
vmemmap code. 
If it failed, it will allocate memmap above MAX_DMA_ADDRESS. 
This patch is against 2.6.24-rc1-git14

Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]>
Signed-off-by: Suresh Siddha <[EMAIL PROTECTED]>


diff -Nraup a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c 2007-11-06 15:16:12.0 +0800
+++ b/arch/x86/mm/init_64.c 2007-11-06 15:55:50.0 +0800
@@ -448,6 +448,13 @@ void online_page(struct page *page)
num_physpages++;
 }
 
+void * __meminit alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
+unsigned long align)
+{
+return __alloc_bootmem_core(pgdat->bdata, size,
+align, (4UL*1024*1024*1024), 0, 1);
+}
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Memory is added always to NORMAL zone. This means you will never get
diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
--- a/include/linux/bootmem.h   2007-11-06 16:06:31.0 +0800
+++ b/include/linux/bootmem.h   2007-11-06 15:50:36.0 +0800
@@ -61,6 +61,10 @@ extern void *__alloc_bootmem_core(struct
  unsigned long limit,
  int strict_goal);
 
+extern void *alloc_bootmem_high_node(pg_data_t *pgdat,
+unsigned long size,
+unsigned long align);
+
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem(unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
diff -Nraup a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c  2007-11-06 16:06:31.0 +0800
+++ b/mm/bootmem.c  2007-11-06 15:49:20.0 +0800
@@ -492,3 +492,11 @@ void * __init __alloc_bootmem_low_node(p
return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
ARCH_LOW_ADDRESS_LIMIT, 0);
 }
+
+__attribute__((weak)) __meminit
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
+unsigned long align)
+{
+return NULL;
+}
+
diff -Nraup a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
--- a/mm/sparse-vmemmap.c   2007-11-06 15:16:12.0 +0800
+++ b/mm/sparse-vmemmap.c   2007-11-06 16:08:52.0 +0800
@@ -43,9 +43,13 @@ void * __meminit vmemmap_alloc_block(uns
if (page)
return page_address(page);
return NULL;
-   } else
+   } else {
+   void *p = alloc_bootmem_high_node(NODE_DATA(node), size, size);
+   if (p)
+   return p;
return __alloc_bootmem_node(NODE_DATA(node), size, size,
__pa(MAX_DMA_ADDRESS));
+   }
 }
 
 void __meminit vmemmap_verify(pte_t *pte, int node,


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch2/2] fix wrong proc cpuinfo on x64

2007-11-06 Thread Zou Nan hai
in 2.6.24-rc1 kernel, 
The /proc/cpuinfo display is wrong.

Another issue is that it will display bogus cpus with wrong information
if the kernel is compiled with a big CONFIG_NR_CPU.

That is because before a cpu in cpu_present_map is up, c->cpu_index of
that cpu is 0.
thus the cpu_online(c->cpu_index) check in show_cpuinfo is invalid.

This patch will let cpuinfo_op use cpu_online_map instead of
cpu_present_map to iterate cpus.


Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]>

--- linux-2.6.24-rc1/arch/x86/kernel/setup_64.c 2007-10-29 22:03:05.0 
-0400
+++ b/arch/x86/kernel/setup_64.c2007-11-05 23:09:06.0 -0500
@@ -1078,8 +1078,6 @@ static int show_cpuinfo(struct seq_file 
 
 
 #ifdef CONFIG_SMP
-   if (!cpu_online(c->cpu_index))
-   return 0;
cpu = c->cpu_index;
 #endif
 
@@ -1171,15 +1169,15 @@ static int show_cpuinfo(struct seq_file 
 static void *c_start(struct seq_file *m, loff_t *pos)
 {
if (*pos == 0)  /* just in case, cpu 0 is not the first */
-   *pos = first_cpu(cpu_possible_map);
-   if ((*pos) < NR_CPUS && cpu_possible(*pos))
+   *pos = first_cpu(cpu_online_map);
+   if ((*pos) < NR_CPUS && cpu_online(*pos))
return _data(*pos);
return NULL;
 }
 
 static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 {
-   *pos = next_cpu(*pos, cpu_possible_map);
+   *pos = next_cpu(*pos, cpu_online_map);
return c_start(m, pos);
 }
 

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch2/2] fix wrong proc cpuinfo on x64

2007-11-06 Thread Zou Nan hai
in 2.6.24-rc1 kernel, 
The /proc/cpuinfo display is wrong.

Another issue is that it will display bogus cpus with wrong information
if the kernel is compiled with a big CONFIG_NR_CPU.

That is because before a cpu in cpu_present_map is up, c->cpu_index of
that cpu is 0.
thus the cpu_online(c->cpu_index) check in show_cpuinfo is invalid.

This patch will let cpuinfo_op use cpu_online_map instead of
cpu_present_map to iterate cpus.


Signed-off-by: Zou Nan hai [EMAIL PROTECTED]

--- linux-2.6.24-rc1/arch/x86/kernel/setup_64.c 2007-10-29 22:03:05.0 
-0400
+++ b/arch/x86/kernel/setup_64.c2007-11-05 23:09:06.0 -0500
@@ -1078,8 +1078,6 @@ static int show_cpuinfo(struct seq_file 
 
 
 #ifdef CONFIG_SMP
-   if (!cpu_online(c->cpu_index))
-   return 0;
cpu = c->cpu_index;
 #endif
 
@@ -1171,15 +1169,15 @@ static int show_cpuinfo(struct seq_file 
 static void *c_start(struct seq_file *m, loff_t *pos)
 {
if (*pos == 0)  /* just in case, cpu 0 is not the first */
-   *pos = first_cpu(cpu_possible_map);
-   if ((*pos) < NR_CPUS && cpu_possible(*pos))
+   *pos = first_cpu(cpu_online_map);
+   if ((*pos) < NR_CPUS && cpu_online(*pos))
return cpu_data(*pos);
return NULL;
 }
 
 static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 {
-   *pos = next_cpu(*pos, cpu_possible_map);
+   *pos = next_cpu(*pos, cpu_online_map);
return c_start(m, pos);
 }
 

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch]Add strict_goal parameter to __alloc_bootmem_core

2007-11-06 Thread Zou Nan hai
If __alloc_bootmem_core was given a goal, it will first try to allocate
memory above that goal. If failed, it will try from the low pages.

Sometimes we don't want this behavior, we want the goal to be strict.

This patch introduce a strict_goal parameter to __alloc_bootmem_core, 

If strict_goal is set, __alloc_bootmem_core will return NULL to indicate
it can't allocate memory above that goal.

Note we do not scan from last_success if strict_goal is set, it will
scan from the beginning of the goal instead
We skip this optimization to keep the code simple because strict_goal is
not supposed to be used in hot path.

Signed-off-by: Zou Nan hai [EMAIL PROTECTED]
Signed-off-by: Suresh Siddha [EMAIL PROTECTED]

diff -Nraup a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
--- a/arch/x86/mm/numa_64.c 2007-10-24 11:50:57.0 +0800
+++ b/arch/x86/mm/numa_64.c 2007-11-07 13:06:50.0 +0800
@@ -247,7 +247,7 @@ void __init setup_node_zones(int nodeid)
__alloc_bootmem_core(NODE_DATA(nodeid)-bdata, 
memmapsize, SMP_CACHE_BYTES, 
round_down(limit - memmapsize, PAGE_SIZE), 
-   limit);
+   limit, 1);
 #endif
 } 
 
diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
--- a/include/linux/bootmem.h   2007-11-07 13:06:35.0 +0800
+++ b/include/linux/bootmem.h   2007-11-07 13:06:04.0 +0800
@@ -58,7 +58,8 @@ extern void *__alloc_bootmem_core(struct
  unsigned long size,
  unsigned long align,
  unsigned long goal,
- unsigned long limit);
+ unsigned long limit,
+ int strict_goal);
 
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem(unsigned long addr, unsigned long size);
diff -Nraup a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c  2007-11-07 13:06:35.0 +0800
+++ b/mm/bootmem.c  2007-11-07 13:06:18.0 +0800
@@ -179,7 +179,7 @@ static void __init free_bootmem_core(boo
  */
 void * __init
 __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
- unsigned long align, unsigned long goal, unsigned long limit)
+ unsigned long align, unsigned long goal, unsigned long limit, int 
strict_goal)
 {
unsigned long offset, remaining_size, areasize, preferred;
unsigned long i, start = 0, incr, eidx, end_pfn;
@@ -212,15 +212,20 @@ __alloc_bootmem_core(struct bootmem_data
/*
 * We try to allocate bootmem pages above 'goal'
 * first, then we try to allocate lower pages.
-*/
-   if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) 
{
-   preferred = goal - bdata->node_boot_start;
+* if the goal is not strict.
+ */
+
+   preferred = 0;
+   if (goal) {
+   if (goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) 
{
+   preferred = goal - bdata->node_boot_start;
 
if (bdata->last_success >= preferred)
-   if (!limit || (limit && limit > bdata->last_success))
+   if (!strict_goal && (!limit || (limit && limit > 
bdata->last_success)))
preferred = bdata->last_success;
-   } else
-   preferred = 0;
+   } else if (strict_goal)
+return NULL;
+   }
 
preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
@@ -247,7 +252,7 @@ restart_scan:
i = ALIGN(j, incr);
}
 
-   if (preferred > offset) {
+   if (preferred > offset && !strict_goal) {
preferred = offset;
goto restart_scan;
}
@@ -421,7 +426,7 @@ void * __init __alloc_bootmem_nopanic(un
void *ptr;
 
list_for_each_entry(bdata, bdata_list, list) {
-   ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
+   ptr = __alloc_bootmem_core(bdata, size, align, goal, 0, 0);
if (ptr)
return ptr;
}
@@ -449,7 +454,7 @@ void * __init __alloc_bootmem_node(pg_da
 {
void *ptr;
 
-   ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+   ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0, 0);
if (ptr)
return ptr;
 
@@ -468,7 +473,7 @@ void * __init __alloc_bootmem_low(unsign
 
list_for_each_entry(bdata, bdata_list, list) {
ptr = __alloc_bootmem_core(bdata, size, align, goal,
-   ARCH_LOW_ADDRESS_LIMIT);
+   ARCH_LOW_ADDRESS_LIMIT, 0);
if (ptr)
return ptr;
}
@@ -485,5 +490,5 @@ void

[Patch] Allocate sparse vmemmap block above 4G

2007-11-06 Thread Zou Nan hai
Try to allocate sparse vmemmap block above 4G on x64 system.

On some single node x64 system with a huge amount of physical memory, e.g. > 64G,
the memmap size may be very big. 

If the memmap is allocated from low pages, it may occupy too much
memory below 4G. 
then swiotlb could fail to reserve bounce buffer under 4G which will
lead to boot failure.

This patch will first try to allocate memmap memory above 4G in sparse
vmemmap code. 
If it failed, it will allocate memmap above MAX_DMA_ADDRESS. 
This patch is against 2.6.24-rc1-git14

Signed-off-by: Zou Nan hai [EMAIL PROTECTED]
Signed-off-by: Suresh Siddha [EMAIL PROTECTED]


diff -Nraup a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c 2007-11-06 15:16:12.0 +0800
+++ b/arch/x86/mm/init_64.c 2007-11-06 15:55:50.0 +0800
@@ -448,6 +448,13 @@ void online_page(struct page *page)
num_physpages++;
 }
 
+void * __meminit alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
+unsigned long align)
+{
+return __alloc_bootmem_core(pgdat->bdata, size,
+align, (4UL*1024*1024*1024), 0, 1);
+}
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Memory is added always to NORMAL zone. This means you will never get
diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
--- a/include/linux/bootmem.h   2007-11-06 16:06:31.0 +0800
+++ b/include/linux/bootmem.h   2007-11-06 15:50:36.0 +0800
@@ -61,6 +61,10 @@ extern void *__alloc_bootmem_core(struct
  unsigned long limit,
  int strict_goal);
 
+extern void *alloc_bootmem_high_node(pg_data_t *pgdat,
+unsigned long size,
+unsigned long align);
+
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem(unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
diff -Nraup a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c  2007-11-06 16:06:31.0 +0800
+++ b/mm/bootmem.c  2007-11-06 15:49:20.0 +0800
@@ -492,3 +492,11 @@ void * __init __alloc_bootmem_low_node(p
return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
ARCH_LOW_ADDRESS_LIMIT, 0);
 }
+
+__attribute__((weak)) __meminit
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size,
+unsigned long align)
+{
+return NULL;
+}
+
diff -Nraup a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
--- a/mm/sparse-vmemmap.c   2007-11-06 15:16:12.0 +0800
+++ b/mm/sparse-vmemmap.c   2007-11-06 16:08:52.0 +0800
@@ -43,9 +43,13 @@ void * __meminit vmemmap_alloc_block(uns
if (page)
return page_address(page);
return NULL;
-   } else
+   } else {
+   void *p = alloc_bootmem_high_node(NODE_DATA(node), size, size);
+   if (p)
+   return p;
return __alloc_bootmem_node(NODE_DATA(node), size, size,
__pa(MAX_DMA_ADDRESS));
+   }
 }
 
 void __meminit vmemmap_verify(pte_t *pte, int node,


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch1/2] fix wrong proc cpuinfo on x64

2007-11-05 Thread Zou Nan hai

in 2.6.24-rc1 kernel, 
The /proc/cpuinfo display is wrong.

One issue is every processor id appears to be 0.

That is because smp_store_cpu_info will set cpuinfo_x86->cpu_index
to cpu id then call identify_cpu
identify_cpu will call early_identify_cpu which set c->cpu_index back to
0.

This patch set cpu_index after identify_cpu to fix the issue.

Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]>

--- linux-2.6.24-rc1/arch/x86/kernel/smpboot_64.c   2007-10-29 
22:03:05.0 -0400
+++ b/arch/x86/kernel/smpboot_64.c  2007-11-05 22:12:57.0 -0500
@@ -141,8 +141,8 @@ static void __cpuinit smp_store_cpu_info
struct cpuinfo_x86 *c = &cpu_data(id);
 
*c = boot_cpu_data;
-   c->cpu_index = id;
identify_cpu(c);
+   c->cpu_index = id;
print_cpu_info(c);
 }
 

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch1/2] fix wrong proc cpuinfo on x64

2007-11-05 Thread Zou Nan hai

in 2.6.24-rc1 kernel, 
The /proc/cpuinfo display is wrong.

One issue is every processor id appears to be 0.

That is because smp_store_cpu_info will set cpuinfo_x86->cpu_index
to the cpu id, then call identify_cpu.
identify_cpu will call early_identify_cpu, which sets c->cpu_index back to
0.

This patch set cpu_index after identify_cpu to fix the issue.

Signed-off-by: Zou Nan hai [EMAIL PROTECTED]

--- linux-2.6.24-rc1/arch/x86/kernel/smpboot_64.c   2007-10-29 
22:03:05.0 -0400
+++ b/arch/x86/kernel/smpboot_64.c  2007-11-05 22:12:57.0 -0500
@@ -141,8 +141,8 @@ static void __cpuinit smp_store_cpu_info
struct cpuinfo_x86 *c = &cpu_data(id);
 
*c = boot_cpu_data;
-   c->cpu_index = id;
identify_cpu(c);
+   c->cpu_index = id;
print_cpu_info(c);
 }
 

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.23 boot failures on x86-64.

2007-10-30 Thread Zou Nan hai
On Wed, 2007-10-31 at 14:04, Zou Nan hai wrote:
> On Tue, 2007-10-30 at 05:21, Martin Ebourne wrote:
> > On Mon, 2007-10-29 at 15:43 -0400, Dave Jones wrote:
> > > On Mon, Oct 29, 2007 at 08:03:09PM +0100, Andi Kleen wrote:
> > >  > >  > But if allocating bootmem >4G doesn't work on these systems
> > >  > >  > most likely they have more problems anyways. It might be better
> > >  > >  > to find out what goes wrong exactly.
> > >  > > Any ideas on what to instrument ?
> > >  > 
> > >  > See what address the bootmem_alloc_high returns; check if it overlaps
> > >  > with something etc.
> > >  > 
> > >  > Fill the memory on the system and see if it can access all of its 
> > > memory.
> > > 
> > > Martin, as you have one of the affected systems, do you feel up to this?
> > 
> > Faking a node at -1fff
> > Bootmem setup node 0 -1fff
> > sparse_early_mem_map_alloc: returned address 8170b000
> > 
> > My box has 512MB of RAM.
> > 
> > Cheers,
> > 
> > Martin.
> 
> Oops, sorry,
> seem to be a mistake of me.
> I forget to exclude the DMA range.
> 
> Does the following patch fix the issue?
> 
> Thanks
> Zou Nan hai
> 
> --- a/arch/x86/mm/init_64.c   2007-10-31 11:24:11.0 +0800
> +++ b/arch/x86/mm/init_64.c   2007-10-31 12:31:02.0 +0800
> @@ -731,7 +731,7 @@ int in_gate_area_no_task(unsigned long a
>  void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
>  {
>   return __alloc_bootmem_core(pgdat->bdata, size,
> - SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
> + SMP_CACHE_BYTES, (4UL*1024*1024*1024), 
> __pa(MAX_DMA_ADDRESS));
>  }
>  
>  const char *arch_vma_name(struct vm_area_struct *vma)
> 
> 
> 
>  

Please ignore the patch, the patch is wrong.

However I think the root cause is when __alloc_bootmem_core fail to
allocate a memory above 4G it will fall back to allocate from the lowest
page. 
Then happens to be allocated in DMA region sometimes...

Since this code path is dead, I am OK to revert the patch.

Suresh and I will check the CONFIG_SPARSE_VMEMMAP path.
Thanks
Zou Nan hai




 
 
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.23 boot failures on x86-64.

2007-10-30 Thread Zou Nan hai
On Tue, 2007-10-30 at 05:21, Martin Ebourne wrote:
> On Mon, 2007-10-29 at 15:43 -0400, Dave Jones wrote:
> > On Mon, Oct 29, 2007 at 08:03:09PM +0100, Andi Kleen wrote:
> >  > >  > But if allocating bootmem >4G doesn't work on these systems
> >  > >  > most likely they have more problems anyways. It might be better
> >  > >  > to find out what goes wrong exactly.
> >  > > Any ideas on what to instrument ?
> >  > 
> >  > See what address the bootmem_alloc_high returns; check if it overlaps
> >  > with something etc.
> >  > 
> >  > Fill the memory on the system and see if it can access all of its memory.
> > 
> > Martin, as you have one of the affected systems, do you feel up to this?
> 
> Faking a node at -1fff
> Bootmem setup node 0 -1fff
> sparse_early_mem_map_alloc: returned address 8170b000
> 
> My box has 512MB of RAM.
> 
> Cheers,
> 
> Martin.

Oops, sorry,
seem to be a mistake of me.
I forget to exclude the DMA range.

Does the following patch fix the issue?

Thanks
Zou Nan hai

--- a/arch/x86/mm/init_64.c 2007-10-31 11:24:11.0 +0800
+++ b/arch/x86/mm/init_64.c 2007-10-31 12:31:02.0 +0800
@@ -731,7 +731,7 @@ int in_gate_area_no_task(unsigned long a
 void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
 {
return __alloc_bootmem_core(pgdat->bdata, size,
-   SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
+   SMP_CACHE_BYTES, (4UL*1024*1024*1024), 
__pa(MAX_DMA_ADDRESS));
 }
 
 const char *arch_vma_name(struct vm_area_struct *vma)



 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.23 boot failures on x86-64.

2007-10-30 Thread Zou Nan hai
On Tue, 2007-10-30 at 05:21, Martin Ebourne wrote:
 On Mon, 2007-10-29 at 15:43 -0400, Dave Jones wrote:
  On Mon, Oct 29, 2007 at 08:03:09PM +0100, Andi Kleen wrote:
   But if allocating bootmem 4G doesn't work on these systems
   most likely they have more problems anyways. It might be better
   to find out what goes wrong exactly.
 Any ideas on what to instrument ?

See what address the bootmem_alloc_high returns; check if it overlaps
with something etc.

Fill the memory on the system and see if it can access all of its memory.
  
  Martin, as you have one of the affected systems, do you feel up to this?
 
 Faking a node at -1fff
 Bootmem setup node 0 -1fff
 sparse_early_mem_map_alloc: returned address 8170b000
 
 My box has 512MB of RAM.
 
 Cheers,
 
 Martin.

Oops, sorry,
seem to be a mistake of me.
I forget to exclude the DMA range.

Does the following patch fix the issue?

Thanks
Zou Nan hai

--- a/arch/x86/mm/init_64.c 2007-10-31 11:24:11.0 +0800
+++ b/arch/x86/mm/init_64.c 2007-10-31 12:31:02.0 +0800
@@ -731,7 +731,7 @@ int in_gate_area_no_task(unsigned long a
 void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
 {
return __alloc_bootmem_core(pgdat-bdata, size,
-   SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
+   SMP_CACHE_BYTES, (4UL*1024*1024*1024), 
__pa(MAX_DMA_ADDRESS));
 }
 
 const char *arch_vma_name(struct vm_area_struct *vma)



 
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.23 boot failures on x86-64.

2007-10-30 Thread Zou Nan hai
On Wed, 2007-10-31 at 14:04, Zou Nan hai wrote:
 On Tue, 2007-10-30 at 05:21, Martin Ebourne wrote:
  On Mon, 2007-10-29 at 15:43 -0400, Dave Jones wrote:
   On Mon, Oct 29, 2007 at 08:03:09PM +0100, Andi Kleen wrote:
But if allocating bootmem 4G doesn't work on these systems
most likely they have more problems anyways. It might be better
to find out what goes wrong exactly.
  Any ideas on what to instrument ?
 
 See what address the bootmem_alloc_high returns; check if it overlaps
 with something etc.
 
 Fill the memory on the system and see if it can access all of its 
   memory.
   
   Martin, as you have one of the affected systems, do you feel up to this?
  
  Faking a node at -1fff
  Bootmem setup node 0 -1fff
  sparse_early_mem_map_alloc: returned address 8170b000
  
  My box has 512MB of RAM.
  
  Cheers,
  
  Martin.
 
 Oops, sorry,
 seem to be a mistake of me.
 I forget to exclude the DMA range.
 
 Does the following patch fix the issue?
 
 Thanks
 Zou Nan hai
 
 --- a/arch/x86/mm/init_64.c   2007-10-31 11:24:11.0 +0800
 +++ b/arch/x86/mm/init_64.c   2007-10-31 12:31:02.0 +0800
 @@ -731,7 +731,7 @@ int in_gate_area_no_task(unsigned long a
  void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
  {
   return __alloc_bootmem_core(pgdat-bdata, size,
 - SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
 + SMP_CACHE_BYTES, (4UL*1024*1024*1024), 
 __pa(MAX_DMA_ADDRESS));
  }
  
  const char *arch_vma_name(struct vm_area_struct *vma)
 
 
 
  

Please ignore the patch, the patch is wrong.

However I think the root cause is when __alloc_bootmem_core fail to
allocate a memory above 4G it will fall back to allocate from the lowest
page. 
Then happens to be allocated in DMA region sometimes...

Since this code path is dead, I am OK to revert the patch.

Suresh and I will check the CONFIG_SPARSE_VMEMMAP path.
Thanks
Zou Nan hai




 
 
 
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch]some proc entries are missed in sched_domain sys_ctl debug code.

2007-09-18 Thread Zou Nan hai
The cache_nice_tries and flags entries do not appear in the procfs
sched_domain directory,
because their ctl_table slots are skipped.

This patch fixes the issue.

Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]>

--- linux-2.6.23-rc6/kernel/sched.c 2007-09-18 23:47:07.0 -0400
+++ b/kernel/sched.c2007-09-18 23:47:20.0 -0400
@@ -5304,7 +5304,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-   struct ctl_table *table = sd_alloc_ctl_entry(14);
+   struct ctl_table *table = sd_alloc_ctl_entry(12);
 
set_table_entry(&table[0], "min_interval", &sd->min_interval,
sizeof(long), 0644, proc_doulongvec_minmax);
@@ -5324,10 +5324,10 @@ sd_alloc_ctl_domain_table(struct sched_d
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
sizeof(int), 0644, proc_dointvec_minmax);
-   set_table_entry(&table[10], "cache_nice_tries",
+   set_table_entry(&table[9], "cache_nice_tries",
&sd->cache_nice_tries,
sizeof(int), 0644, proc_dointvec_minmax);
-   set_table_entry(&table[12], "flags", &sd->flags,
+   set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax);
 
return table;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch]some proc entries are missed in sched_domain sys_ctl debug code.

2007-09-18 Thread Zou Nan hai
The cache_nice_tries and flags entries do not appear in the procfs
sched_domain directory,
because their ctl_table slots are skipped.

This patch fixes the issue.

Signed-off-by: Zou Nan hai [EMAIL PROTECTED]

--- linux-2.6.23-rc6/kernel/sched.c 2007-09-18 23:47:07.0 -0400
+++ b/kernel/sched.c2007-09-18 23:47:20.0 -0400
@@ -5304,7 +5304,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-   struct ctl_table *table = sd_alloc_ctl_entry(14);
+   struct ctl_table *table = sd_alloc_ctl_entry(12);
 
set_table_entry(&table[0], "min_interval", &sd->min_interval,
sizeof(long), 0644, proc_doulongvec_minmax);
@@ -5324,10 +5324,10 @@ sd_alloc_ctl_domain_table(struct sched_d
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
sizeof(int), 0644, proc_dointvec_minmax);
-   set_table_entry(&table[10], "cache_nice_tries",
+   set_table_entry(&table[9], "cache_nice_tries",
&sd->cache_nice_tries,
sizeof(int), 0644, proc_dointvec_minmax);
-   set_table_entry(&table[12], "flags", &sd->flags,
+   set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax);
 
return table;
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Patch] Allocate sparsemem memmap above 4G on X86_64

2007-05-17 Thread Zou Nan hai
On Fri, 2007-05-18 at 03:32, Andrew Morton wrote:
> On 17 May 2007 10:40:07 +0800
> Zou Nan hai <[EMAIL PROTECTED]> wrote:
> 
> > 
> Please always prefer to use static inline functions rather than macros. 
> They are more readable, they are more likely to have comments attached to
> them and they provide typechecking.
> 
> Please prefer to uninline functions by default.  One reason for this is
> that adding inlines to headers increases include complexity.  This code is
> all __init anyway, so the possible few bytes of text will get removed.
> 
> 
> Try to avoid using the ARCH_HAS_FOO thing.  We have two alternatives:
> 
> a) use __attribute__((weak))
> 
> b) do:
> 
>   extern void foo(void);
>   #define foo foo
> 
>then, elsewhere,
> 
>   #ifndef foo
>   #define foo() bar()
>   #endif
> 
> Both tricks avoid the introduction of two new symbols into the global
> namespace to solve a single problem.
  On systems with huge amount of physical memory, VFS cache and memory
memmap may eat all available system memory under 4G, then the system may
fail to allocate swiotlb bounce buffer.
  There was a fix for this issue in arch/x86_64/mm/numa.c, but that fix
does not cover the sparsemem model.
  This patch add fix to sparsemem model by first try to allocate memmap
above 4G.

Signed-off-by:  Zou Nan hai <[EMAIL PROTECTED]>
Acked-by:   Suresh Siddha <[EMAIL PROTECTED]>
---
 arch/x86_64/mm/init.c |6 ++
 mm/sparse.c   |   11 +++
 2 files changed, 17 insertions(+)

diff -Nraup a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
--- a/arch/x86_64/mm/init.c 2007-05-19 16:54:46.0 +0800
+++ b/arch/x86_64/mm/init.c 2007-05-19 17:43:47.0 +0800
@@ -761,3 +761,9 @@ int in_gate_area_no_task(unsigned long a
 {
return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
 }
+
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+   return __alloc_bootmem_core(pgdat->bdata, size,
+   SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
+}
diff -Nraup a/mm/sparse.c b/mm/sparse.c
--- a/mm/sparse.c   2007-05-19 16:54:48.0 +0800
+++ b/mm/sparse.c   2007-05-19 17:44:01.0 +0800
@@ -209,6 +209,12 @@ static int __meminit sparse_init_one_sec
return 1;
 }
 
+__attribute__((weak))
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+   return NULL;
+}
+
 static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
 {
struct page *map;
@@ -219,6 +225,11 @@ static struct page __init *sparse_early_
if (map)
return map;
 
+   map = alloc_bootmem_high_node(NODE_DATA(nid),
+   sizeof(struct page) * PAGES_PER_SECTION);
+   if (map)
+   return map;
+
map = alloc_bootmem_node(NODE_DATA(nid),
sizeof(struct page) * PAGES_PER_SECTION);
if (map)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Patch] Allocate sparsemem memmap above 4G on X86_64

2007-05-17 Thread Zou Nan hai
On Fri, 2007-05-18 at 03:32, Andrew Morton wrote:
 On 17 May 2007 10:40:07 +0800
 Zou Nan hai [EMAIL PROTECTED] wrote:
 
  
 Please always prefer to use static inline functions rather than macros. 
 They are more readable, they are more likely to have comments attached to
 them and they provide typechecking.
 
 Please prefer to uninline functions by default.  One reason for this is
 that adding inlines to headers increases include complexity.  This code is
 all __init anyway, so the possible few bytes of text will get removed.
 
 
 Try to avoid using the ARCH_HAS_FOO thing.  We have two alternatives:
 
 a) use __attribute__((weak))
 
 b) do:
 
   extern void foo(void);
   #define foo foo
 
then, elsewhere,
 
   #ifndef foo
   #define foo() bar()
   #endif
 
 Both tricks avoid the introduction of two new symbols into the global
 namespace to solve a single problem.
  On systems with huge amount of physical memory, VFS cache and memory
memmap may eat all available system memory under 4G, then the system may
fail to allocate swiotlb bounce buffer.
  There was a fix for this issue in arch/x86_64/mm/numa.c, but that fix
does not cover the sparsemem model.
  This patch add fix to sparsemem model by first try to allocate memmap
above 4G.

Signed-off-by:  Zou Nan hai [EMAIL PROTECTED]
Acked-by:   Suresh Siddha [EMAIL PROTECTED]
---
 arch/x86_64/mm/init.c |6 ++
 mm/sparse.c   |   11 +++
 2 files changed, 17 insertions(+)

diff -Nraup a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
--- a/arch/x86_64/mm/init.c 2007-05-19 16:54:46.0 +0800
+++ b/arch/x86_64/mm/init.c 2007-05-19 17:43:47.0 +0800
@@ -761,3 +761,9 @@ int in_gate_area_no_task(unsigned long a
 {
return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
 }
+
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+   return __alloc_bootmem_core(pgdat->bdata, size,
+   SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
+}
diff -Nraup a/mm/sparse.c b/mm/sparse.c
--- a/mm/sparse.c   2007-05-19 16:54:48.0 +0800
+++ b/mm/sparse.c   2007-05-19 17:44:01.0 +0800
@@ -209,6 +209,12 @@ static int __meminit sparse_init_one_sec
return 1;
 }
 
+__attribute__((weak))
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+   return NULL;
+}
+
 static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
 {
struct page *map;
@@ -219,6 +225,11 @@ static struct page __init *sparse_early_
if (map)
return map;
 
+   map = alloc_bootmem_high_node(NODE_DATA(nid),
+   sizeof(struct page) * PAGES_PER_SECTION);
+   if (map)
+   return map;
+
map = alloc_bootmem_node(NODE_DATA(nid),
sizeof(struct page) * PAGES_PER_SECTION);
if (map)
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch] Allocate sparsemem memmap above 4G on X86_64

2007-05-16 Thread Zou Nan hai
On system with huge amount of physical memory. 
VFS cache and memory memmap may eat all available system memory under
4G, then the system may fail to allocate the swiotlb bounce buffer. 

There was a fix in arch/x86_64/mm/numa.c, but that fix does not cover
sparsemem model.
This patch adds the fix to the sparsemem model.

Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]>
Acked-by: Siddha, Suresh <[EMAIL PROTECTED]>
---
 include/asm-x86_64/mmzone.h |5 +
 include/linux/bootmem.h |3 +++
 mm/sparse.c |5 +
 3 files changed, 13 insertions(+)

diff -Nraup a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
--- a/include/asm-x86_64/mmzone.h   2007-05-17 09:38:02.0 +0800
+++ b/include/asm-x86_64/mmzone.h   2007-05-17 09:54:10.0 +0800
@@ -52,5 +52,10 @@ extern int pfn_valid(unsigned long pfn);
 #define FAKE_NODE_MIN_HASH_MASK(~(FAKE_NODE_MIN_SIZE - 1uL))
 #endif
 
+#define ARCH_HAS_ALLOC_BOOTMEM_HIGH_NODE 1
+#define alloc_bootmem_high_node(pgdat,size) \
+({__alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, 
(4UL*1024*1024*1024), 0);})
+
+
 #endif
 #endif
diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
--- a/include/linux/bootmem.h   2007-05-17 09:38:02.0 +0800
+++ b/include/linux/bootmem.h   2007-05-17 09:37:00.0 +0800
@@ -131,5 +131,8 @@ extern void *alloc_large_system_hash(con
 #endif
 extern int hashdist;   /* Distribute hashes across NUMA nodes? */
 
+#ifndef ARCH_HAS_ALLOC_BOOTMEM_HIGH_NODE
+#define alloc_bootmem_high_node(pgdat, size) ({NULL;})
+#endif
 
 #endif /* _LINUX_BOOTMEM_H */
diff -Nraup a/mm/sparse.c b/mm/sparse.c
--- a/mm/sparse.c   2007-05-17 09:38:03.0 +0800
+++ b/mm/sparse.c   2007-05-17 09:54:27.0 +0800
@@ -219,6 +219,11 @@ static struct page __init *sparse_early_
if (map)
return map;
 
+   map = alloc_bootmem_high_node(NODE_DATA(nid),
+   sizeof(struct page) * PAGES_PER_SECTION);
+if (map)
+return map;
+
map = alloc_bootmem_node(NODE_DATA(nid),
sizeof(struct page) * PAGES_PER_SECTION);
if (map)




-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch] Allocate sparsemem memmap above 4G on X86_64

2007-05-16 Thread Zou Nan hai
On system with huge amount of physical memory. 
VFS cache and memory memmap may eat all available system memory under
4G, then the system may fail to allocate the swiotlb bounce buffer. 

There was a fix in arch/x86_64/mm/numa.c, but that fix does not cover
sparsemem model.
This patch adds the fix to the sparsemem model.

Signed-off-by: Zou Nan hai [EMAIL PROTECTED]
Acked-by: Siddha, Suresh [EMAIL PROTECTED]
---
 include/asm-x86_64/mmzone.h |5 +
 include/linux/bootmem.h |3 +++
 mm/sparse.c |5 +
 3 files changed, 13 insertions(+)

diff -Nraup a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
--- a/include/asm-x86_64/mmzone.h   2007-05-17 09:38:02.0 +0800
+++ b/include/asm-x86_64/mmzone.h   2007-05-17 09:54:10.0 +0800
@@ -52,5 +52,10 @@ extern int pfn_valid(unsigned long pfn);
 #define FAKE_NODE_MIN_HASH_MASK(~(FAKE_NODE_MIN_SIZE - 1uL))
 #endif
 
+#define ARCH_HAS_ALLOC_BOOTMEM_HIGH_NODE 1
+#define alloc_bootmem_high_node(pgdat,size) \
+({__alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, 
(4UL*1024*1024*1024), 0);})
+
+
 #endif
 #endif
diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
--- a/include/linux/bootmem.h   2007-05-17 09:38:02.0 +0800
+++ b/include/linux/bootmem.h   2007-05-17 09:37:00.0 +0800
@@ -131,5 +131,8 @@ extern void *alloc_large_system_hash(con
 #endif
 extern int hashdist;   /* Distribute hashes across NUMA nodes? */
 
+#ifndef ARCH_HAS_ALLOC_BOOTMEM_HIGH_NODE
+#define alloc_bootmem_high_node(pgdat, size) ({NULL;})
+#endif
 
 #endif /* _LINUX_BOOTMEM_H */
diff -Nraup a/mm/sparse.c b/mm/sparse.c
--- a/mm/sparse.c   2007-05-17 09:38:03.0 +0800
+++ b/mm/sparse.c   2007-05-17 09:54:27.0 +0800
@@ -219,6 +219,11 @@ static struct page __init *sparse_early_
if (map)
return map;
 
+   map = alloc_bootmem_high_node(NODE_DATA(nid),
+   sizeof(struct page) * PAGES_PER_SECTION);
+if (map)
+return map;
+
map = alloc_bootmem_node(NODE_DATA(nid),
sizeof(struct page) * PAGES_PER_SECTION);
if (map)




-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch] fix an error in /proc/slabinfo print

2005-02-03 Thread Zou Nan hai
There is an obvious error in the header of /proc/slabinfo

Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]>

--- linux-2.6.11-rc3/mm/slab.c  2005-02-03 13:29:33.0 +0800
+++ linux-2.6.11-rc3-fix/mm/slab.c  2005-02-03 13:32:42.318821400 +0800
@@ -2860,7 +2860,7 @@ static void *s_start(struct seq_file *m,
seq_puts(m, "slabinfo - version: 2.1\n");
 #endif
seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
"<objperslab> <pagesperslab>");
-   seq_puts(m, " : tunables <batchcount> <limit> <sharedfactor>");
+   seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
seq_puts(m, " : slabdata <active_slabs> <num_slabs> "
"<sharedavail>");
 #if STATS
seq_puts(m, " : globalstat
"



-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch] fix an error in /proc/slabinfo print

2005-02-03 Thread Zou Nan hai
There is an obvious error in the header of /proc/slabinfo

Signed-off-by: Zou Nan hai [EMAIL PROTECTED]

--- linux-2.6.11-rc3/mm/slab.c  2005-02-03 13:29:33.0 +0800
+++ linux-2.6.11-rc3-fix/mm/slab.c  2005-02-03 13:32:42.318821400 +0800
@@ -2860,7 +2860,7 @@ static void *s_start(struct seq_file *m,
seq_puts(m, "slabinfo - version: 2.1\n");
 #endif
seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
"<objperslab> <pagesperslab>");
-   seq_puts(m, " : tunables <batchcount> <limit> <sharedfactor>");
+   seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
seq_puts(m, " : slabdata <active_slabs> <num_slabs> "
"<sharedavail>");
 #if STATS
seq_puts(m,  : globalstat listallocs maxobjs grown 
reaped



-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


possible performance issue in 4-level page tables

2005-01-31 Thread Zou Nan hai

There is a performance regression of lmbench
lat_proc fork result on ia64.

in 
2.6.10 

I got 
Process fork+exit:164.8438 microseconds.

in 2.6.11-rc2
Process fork+exit:183.8621 microseconds.

I believe this regression was caused by 
the 4-level page tables change.

Since most of the kernel time spend in lat_proc fork is copy_page_range
in fork path and clear_page_range in the exit path. Now they are 1 level
deeper.

Though pud and pgd is same on IA64, there is still some overhead
introduced I think.
 
Are any other architectures seeing the same sort of results?

Zou Nan hai

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


possible performance issue in 4-level page tables

2005-01-31 Thread Zou Nan hai

There is a performance regression of lmbench
lat_proc fork result on ia64.

in 
2.6.10 

I got 
Process fork+exit:164.8438 microseconds.

in 2.6.11-rc2
Process fork+exit:183.8621 microseconds.

I believe this regression was caused by 
the 4-level page tables change.

Since most of the kernel time spend in lat_proc fork is copy_page_range
in fork path and clear_page_range in the exit path. Now they are 1 level
deeper.

Though pud and pgd is same on IA64, there is still some overhead
introduced I think.
 
Are any other architectures seeing the same sort of results?

Zou Nan hai

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch]Fix an error in copy_page_range

2005-01-19 Thread Zou Nan hai
Hi, 

There is a bug in copy_page_range in current 2.6.11-rc1 with 4 level
page table change. copy_page_range do a continue without adding pgds and
addr when pgd_none(*src_pgd) or pgd_bad(*src_pgd).

I think it's wrong in logic, copy_page_range will run into infinite loop
when pgd_none(*src_pgd) or pgd_bad(*src_pgd).

Although maybe this bug does not break anything currently..., 


Signed-off-by: Zou Nan hai <[EMAIL PROTECTED]>

--- a/mm/memory.c   2005-01-21 01:21:18.0 +0800
+++ b/mm/memory.c   2005-01-21 04:49:13.0 +0800
@@ -442,17 +442,18 @@ int copy_page_range(struct mm_struct *ds
if (next > end || next <= addr)
next = end;
if (pgd_none(*src_pgd))
-   continue;
+   goto next_pgd;
if (pgd_bad(*src_pgd)) {
pgd_ERROR(*src_pgd);
pgd_clear(src_pgd);
-   continue;
+   goto next_pgd;
}
err = copy_pud_range(dst, src, dst_pgd, src_pgd,
vma, addr, next);
if (err)
break;
 
+next_pgd:
src_pgd++;
dst_pgd++;
addr = next;



-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch]Fix an error in copy_page_range

2005-01-19 Thread Zou Nan hai
Hi, 

There is a bug in copy_page_range in current 2.6.11-rc1 with 4 level
page table change. copy_page_range do a continue without adding pgds and
addr when pgd_none(*src_pgd) or pgd_bad(*src_pgd).

I think it's wrong in logic, copy_page_range will run into infinite loop
when pgd_none(*src_pgd) or pgd_bad(*src_pgd).

Although maybe this bug does not break anything currently..., 


Signed-off-by: Zou Nan hai [EMAIL PROTECTED]

--- a/mm/memory.c   2005-01-21 01:21:18.0 +0800
+++ b/mm/memory.c   2005-01-21 04:49:13.0 +0800
@@ -442,17 +442,18 @@ int copy_page_range(struct mm_struct *ds
if (next > end || next <= addr)
next = end;
if (pgd_none(*src_pgd))
-   continue;
+   goto next_pgd;
if (pgd_bad(*src_pgd)) {
pgd_ERROR(*src_pgd);
pgd_clear(src_pgd);
-   continue;
+   goto next_pgd;
}
err = copy_pud_range(dst, src, dst_pgd, src_pgd,
vma, addr, next);
if (err)
break;
 
+next_pgd:
src_pgd++;
dst_pgd++;
addr = next;



-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/