[patch -mm 1/7] x86_64: configurable fake numa node sizes

2007-03-01 Thread David Rientjes
Extends the numa=fake x86_64 command-line option to allow for configurable
node sizes.  These nodes can be used in conjunction with cpusets for
coarse memory resource management.

The old command-line option is still supported:
  numa=fake=32  gives 32 fake NUMA nodes, ignoring the NUMA setup of the
actual machine.

But now you may configure your system for the node sizes of your choice:
  numa=fake=2*512,1024,2*256
gives two 512M nodes, one 1024M node, two 256M nodes, and
the rest of system memory to a sixth node.

The existing hash function is maintained to support the various node sizes
that are possible with this implementation.

Each node of the same size receives roughly the same amount of available
pages, regardless of any reserved memory within its address range.  The
total available pages on the system is calculated and divided by the
number of equal nodes to allocate.  These nodes are then dynamically
allocated and their borders extended until such time as their number of
available pages reaches the required size.

Configurable node sizes are recommended when used in conjunction with
cpusets for memory control because it eliminates the overhead associated
with scanning the zonelists of many smaller full nodes on page_alloc().

Cc: Andi Kleen <[EMAIL PROTECTED]>
Signed-off-by: David Rientjes <[EMAIL PROTECTED]>
---
 Documentation/x86_64/boot-options.txt |8 +-
 arch/x86_64/mm/numa.c |  255 +++--
 include/asm-x86_64/mmzone.h   |2 +-
 3 files changed, 155 insertions(+), 110 deletions(-)

diff --git a/Documentation/x86_64/boot-options.txt 
b/Documentation/x86_64/boot-options.txt
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -149,7 +149,13 @@ NUMA
 
   numa=noacpi   Don't parse the SRAT table for NUMA setup
 
-  numa=fake=X   Fake X nodes and ignore NUMA setup of the actual machine.
+  numa=fake=CMDLINE
+   If a number, fakes CMDLINE nodes and ignores NUMA setup of the
+   actual machine.  Otherwise, system memory is configured
+   depending on the sizes and coefficients listed.  For example:
+   numa=fake=2*512,1024,4*256
+   gives two 512M nodes, a 1024M node, and four 256M nodes.  The
+   remaining system RAM is allocated to an additional node.
 
   numa=hotadd=percent
Only allow hotadd memory to preallocate page structures upto
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -276,125 +276,166 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
-int numa_fake __initdata = 0;
+#define E820_ADDR_HOLE_SIZE(start, end)
\
+   (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) <<  \
+   PAGE_SHIFT)
+char *cmdline __initdata;
 
 /*
- * This function is used to find out if the start and end correspond to
- * different zones.
+ * Setups up nid to range from addr to addr + size.  If the end boundary is
+ * greater than max_addr, then max_addr is used instead.  The return value is 0
+ * if there is additional memory left for allocation past addr and -1 
otherwise.
+ * addr is adjusted to be at the end of the node.
  */
-int zone_cross_over(unsigned long start, unsigned long end)
+static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
+  u64 size, u64 max_addr)
 {
-   if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
-   (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
-   return 1;
-   return 0;
+   int ret = 0;
+   nodes[nid].start = *addr;
+   *addr += size;
+   if (*addr >= max_addr) {
+   *addr = max_addr;
+   ret = -1;
+   }
+   nodes[nid].end = *addr;
+   node_set_online(nid);
+   printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
+  nodes[nid].start, nodes[nid].end,
+  (nodes[nid].end - nodes[nid].start) >> 20);
+   return ret;
 }
 
-static int __init numa_emulation(unsigned long start_pfn, unsigned long 
end_pfn)
+/*
+ * Splits num_nodes nodes up equally starting at node_start.  The return value
+ * is the number of nodes split up and addr is adjusted to be at the end of the
+ * last node allocated.
+ */
+static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
+ u64 max_addr, int node_start,
+ int num_nodes)
 {
-   int i, big;
-   struct bootnode nodes[MAX_NUMNODES];
-   unsigned long sz, old_sz;
-   unsigned long hole_size;
-   unsigned long start, end;
-   unsigned long max_addr = (end_pfn << PAGE_SHIFT);
-
-   start = (start_pfn << PAGE_SHIFT);
-   hole_size = e820_hole_size(start, max_addr);
-   

[patch -mm 1/7] x86_64: configurable fake numa node sizes

2007-03-01 Thread David Rientjes
Extends the numa=fake x86_64 command-line option to allow for configurable
node sizes.  These nodes can be used in conjunction with cpusets for
coarse memory resource management.

The old command-line option is still supported:
  numa=fake=32  gives 32 fake NUMA nodes, ignoring the NUMA setup of the
actual machine.

But now you may configure your system for the node sizes of your choice:
  numa=fake=2*512,1024,2*256
gives two 512M nodes, one 1024M node, two 256M nodes, and
the rest of system memory to a sixth node.

The existing hash function is maintained to support the various node sizes
that are possible with this implementation.

Each node of the same size receives roughly the same amount of available
pages, regardless of any reserved memory within its address range.  The
total available pages on the system is calculated and divided by the
number of equal nodes to allocate.  These nodes are then dynamically
allocated and their borders extended until such time as their number of
available pages reaches the required size.

Configurable node sizes are recommended when used in conjunction with
cpusets for memory control because it eliminates the overhead associated
with scanning the zonelists of many smaller full nodes on page_alloc().

Cc: Andi Kleen <[EMAIL PROTECTED]>
Signed-off-by: David Rientjes <[EMAIL PROTECTED]>
---
 Documentation/x86_64/boot-options.txt |8 +-
 arch/x86_64/mm/numa.c |  255 +++--
 include/asm-x86_64/mmzone.h   |2 +-
 3 files changed, 155 insertions(+), 110 deletions(-)

diff --git a/Documentation/x86_64/boot-options.txt 
b/Documentation/x86_64/boot-options.txt
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -149,7 +149,13 @@ NUMA
 
   numa=noacpi   Don't parse the SRAT table for NUMA setup
 
-  numa=fake=X   Fake X nodes and ignore NUMA setup of the actual machine.
+  numa=fake=CMDLINE
+   If a number, fakes CMDLINE nodes and ignores NUMA setup of the
+   actual machine.  Otherwise, system memory is configured
+   depending on the sizes and coefficients listed.  For example:
+   numa=fake=2*512,1024,4*256
+   gives two 512M nodes, a 1024M node, and four 256M nodes.  The
+   remaining system RAM is allocated to an additional node.
 
   numa=hotadd=percent
Only allow hotadd memory to preallocate page structures upto
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -276,125 +276,166 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
-int numa_fake __initdata = 0;
+#define E820_ADDR_HOLE_SIZE(start, end)
\
+   (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) <<  \
+   PAGE_SHIFT)
+char *cmdline __initdata;
 
 /*
- * This function is used to find out if the start and end correspond to
- * different zones.
+ * Setups up nid to range from addr to addr + size.  If the end boundary is
+ * greater than max_addr, then max_addr is used instead.  The return value is 0
+ * if there is additional memory left for allocation past addr and -1 
otherwise.
+ * addr is adjusted to be at the end of the node.
  */
-int zone_cross_over(unsigned long start, unsigned long end)
+static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
+  u64 size, u64 max_addr)
 {
-   if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
-   (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
-   return 1;
-   return 0;
+   int ret = 0;
+   nodes[nid].start = *addr;
+   *addr += size;
+   if (*addr >= max_addr) {
+   *addr = max_addr;
+   ret = -1;
+   }
+   nodes[nid].end = *addr;
+   node_set_online(nid);
+   printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
+  nodes[nid].start, nodes[nid].end,
+  (nodes[nid].end - nodes[nid].start) >> 20);
+   return ret;
 }
 
-static int __init numa_emulation(unsigned long start_pfn, unsigned long 
end_pfn)
+/*
+ * Splits num_nodes nodes up equally starting at node_start.  The return value
+ * is the number of nodes split up and addr is adjusted to be at the end of the
+ * last node allocated.
+ */
+static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
+ u64 max_addr, int node_start,
+ int num_nodes)
 {
-   int i, big;
-   struct bootnode nodes[MAX_NUMNODES];
-   unsigned long sz, old_sz;
-   unsigned long hole_size;
-   unsigned long start, end;
-   unsigned long max_addr = (end_pfn << PAGE_SHIFT);
-
-   start = (start_pfn << PAGE_SHIFT);
-   hole_size = e820_hole_size(start, max_addr);
-   sz = (max_addr - start - 

[patch -mm 1/7] x86_64: configurable fake numa node sizes

2007-01-31 Thread David Rientjes
Extends the numa=fake x86_64 command-line option to allow for configurable
node sizes.  These nodes can be used in conjunction with cpusets for
coarse memory resource management.

The old command-line option is still supported:
  numa=fake=32  gives 32 fake NUMA nodes, ignoring the NUMA setup of the
actual machine.

But now you may configure your system for the node sizes of your choice:
  numa=fake=2*512,1024,2*256
gives two 512M nodes, one 1024M node, two 256M nodes, and
the rest of system memory to a sixth node.

The existing hash function is maintained to support the various node sizes
that are possible with this implementation.

Each node of the same size receives roughly the same amount of available
pages, regardless of any reserved memory within its address range.  The
total available pages on the system is calculated and divided by the
number of equal nodes to allocate.  These nodes are then dynamically
allocated and their borders extended until such time as their number of
available pages reaches the required size.

Configurable node sizes are recommended when used in conjunction with
cpusets for memory control because it eliminates the overhead associated
with scanning the zonelists of many smaller full nodes on page_alloc().

Cc: Andi Kleen <[EMAIL PROTECTED]>
Signed-off-by: David Rientjes <[EMAIL PROTECTED]>
---
 Documentation/x86_64/boot-options.txt |8 +-
 arch/x86_64/mm/numa.c |  255 +++--
 include/asm-x86_64/mmzone.h   |2 +-
 3 files changed, 155 insertions(+), 110 deletions(-)

diff --git a/Documentation/x86_64/boot-options.txt 
b/Documentation/x86_64/boot-options.txt
index 625a21d..87f4279 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -149,7 +149,13 @@ NUMA
 
   numa=noacpi   Don't parse the SRAT table for NUMA setup
 
-  numa=fake=X   Fake X nodes and ignore NUMA setup of the actual machine.
+  numa=fake=CMDLINE
+   If a number, fakes CMDLINE nodes and ignores NUMA setup of the
+   actual machine.  Otherwise, system memory is configured
+   depending on the sizes and coefficients listed.  For example:
+   numa=fake=2*512,1024,4*256
+   gives two 512M nodes, a 1024M node, and four 256M nodes.  The
+   remaining system RAM is allocated to an additional node.
 
   numa=hotadd=percent
Only allow hotadd memory to preallocate page structures upto
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 9ff3141..a5bda07 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -276,125 +276,166 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
-int numa_fake __initdata = 0;
+#define E820_ADDR_HOLE_SIZE(start, end)
\
+   (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) <<  \
+   PAGE_SHIFT)
+char *cmdline __initdata;
 
 /*
- * This function is used to find out if the start and end correspond to
- * different zones.
+ * Setups up nid to range from addr to addr + size.  If the end boundary is
+ * greater than max_addr, then max_addr is used instead.  The return value is 0
+ * if there is additional memory left for allocation past addr and -1 
otherwise.
+ * addr is adjusted to be at the end of the node.
  */
-int zone_cross_over(unsigned long start, unsigned long end)
+static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
+  u64 size, u64 max_addr)
 {
-   if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
-   (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
-   return 1;
-   return 0;
+   int ret = 0;
+   nodes[nid].start = *addr;
+   *addr += size;
+   if (*addr >= max_addr) {
+   *addr = max_addr;
+   ret = -1;
+   }
+   nodes[nid].end = *addr;
+   node_set_online(nid);
+   printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
+  nodes[nid].start, nodes[nid].end,
+  (nodes[nid].end - nodes[nid].start) >> 20);
+   return ret;
 }
 
-static int __init numa_emulation(unsigned long start_pfn, unsigned long 
end_pfn)
+/*
+ * Splits num_nodes nodes up equally starting at node_start.  The return value
+ * is the number of nodes split up and addr is adjusted to be at the end of the
+ * last node allocated.
+ */
+static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
+ u64 max_addr, int node_start,
+ int num_nodes)
 {
-   int i, big;
-   struct bootnode nodes[MAX_NUMNODES];
-   unsigned long sz, old_sz;
-   unsigned long hole_size;
-   unsigned long start, end;
-   unsigned long max_addr = (end_pfn << PAGE_SHIFT);
-
-   start = (start_pfn << PAGE_SHIFT);

[patch -mm 1/7] x86_64: configurable fake numa node sizes

2007-01-31 Thread David Rientjes
Extends the numa=fake x86_64 command-line option to allow for configurable
node sizes.  These nodes can be used in conjunction with cpusets for
coarse memory resource management.

The old command-line option is still supported:
  numa=fake=32  gives 32 fake NUMA nodes, ignoring the NUMA setup of the
actual machine.

But now you may configure your system for the node sizes of your choice:
  numa=fake=2*512,1024,2*256
gives two 512M nodes, one 1024M node, two 256M nodes, and
the rest of system memory to a sixth node.

The existing hash function is maintained to support the various node sizes
that are possible with this implementation.

Each node of the same size receives roughly the same amount of available
pages, regardless of any reserved memory within its address range.  The
total available pages on the system is calculated and divided by the
number of equal nodes to allocate.  These nodes are then dynamically
allocated and their borders extended until such time as their number of
available pages reaches the required size.

Configurable node sizes are recommended when used in conjunction with
cpusets for memory control because it eliminates the overhead associated
with scanning the zonelists of many smaller full nodes on page_alloc().

Cc: Andi Kleen <[EMAIL PROTECTED]>
Signed-off-by: David Rientjes <[EMAIL PROTECTED]>
---
 Documentation/x86_64/boot-options.txt |8 +-
 arch/x86_64/mm/numa.c |  255 +++--
 include/asm-x86_64/mmzone.h   |2 +-
 3 files changed, 155 insertions(+), 110 deletions(-)

diff --git a/Documentation/x86_64/boot-options.txt 
b/Documentation/x86_64/boot-options.txt
index 625a21d..87f4279 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -149,7 +149,13 @@ NUMA
 
   numa=noacpi   Don't parse the SRAT table for NUMA setup
 
-  numa=fake=X   Fake X nodes and ignore NUMA setup of the actual machine.
+  numa=fake=CMDLINE
+   If a number, fakes CMDLINE nodes and ignores NUMA setup of the
+   actual machine.  Otherwise, system memory is configured
+   depending on the sizes and coefficients listed.  For example:
+   numa=fake=2*512,1024,4*256
+   gives two 512M nodes, a 1024M node, and four 256M nodes.  The
+   remaining system RAM is allocated to an additional node.
 
   numa=hotadd=percent
Only allow hotadd memory to preallocate page structures upto
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 9ff3141..a5bda07 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -276,125 +276,166 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
-int numa_fake __initdata = 0;
+#define E820_ADDR_HOLE_SIZE(start, end)
\
+   (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) <<  \
+   PAGE_SHIFT)
+char *cmdline __initdata;
 
 /*
- * This function is used to find out if the start and end correspond to
- * different zones.
+ * Setups up nid to range from addr to addr + size.  If the end boundary is
+ * greater than max_addr, then max_addr is used instead.  The return value is 0
+ * if there is additional memory left for allocation past addr and -1 
otherwise.
+ * addr is adjusted to be at the end of the node.
  */
-int zone_cross_over(unsigned long start, unsigned long end)
+static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
+  u64 size, u64 max_addr)
 {
-   if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
-   (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
-   return 1;
-   return 0;
+   int ret = 0;
+   nodes[nid].start = *addr;
+   *addr += size;
+   if (*addr >= max_addr) {
+   *addr = max_addr;
+   ret = -1;
+   }
+   nodes[nid].end = *addr;
+   node_set_online(nid);
+   printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
+  nodes[nid].start, nodes[nid].end,
+  (nodes[nid].end - nodes[nid].start) >> 20);
+   return ret;
 }
 
-static int __init numa_emulation(unsigned long start_pfn, unsigned long 
end_pfn)
+/*
+ * Splits num_nodes nodes up equally starting at node_start.  The return value
+ * is the number of nodes split up and addr is adjusted to be at the end of the
+ * last node allocated.
+ */
+static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
+ u64 max_addr, int node_start,
+ int num_nodes)
 {
-   int i, big;
-   struct bootnode nodes[MAX_NUMNODES];
-   unsigned long sz, old_sz;
-   unsigned long hole_size;
-   unsigned long start, end;
-   unsigned long max_addr = (end_pfn << PAGE_SHIFT);
-
-   start = (start_pfn << PAGE_SHIFT);
-   hole_size =