According to the user-provided assignment, bind each part of the
guest's memory to the specified host NUMA node. This uses the Linux
mbind syscall (for which libnuma provides the only wrapper) to
establish the pinning right after the allocation.
Failures are not fatal, but produce a warning.

Signed-off-by: Andre Przywara <andre.przyw...@amd.com>
---
 hw/pc.c |   58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 58 insertions(+), 0 deletions(-)

diff --git a/hw/pc.c b/hw/pc.c
index 1b24409..dbfc082 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -42,6 +42,15 @@
 #include "device-assignment.h"
 #include "kvm.h"
 
+#ifdef CONFIG_NUMA
+#include <numa.h>
+#include <numaif.h>
+#ifndef MPOL_F_RELATIVE_NODES
+  #define MPOL_F_RELATIVE_NODES (1 << 14)
+  #define MPOL_F_STATIC_NODES (1 << 15)
+#endif
+#endif
+
 /* output Bochs bios info messages */
 //#define DEBUG_BIOS
 
@@ -882,6 +891,53 @@ void pc_cpus_init(const char *cpu_model)
     }
 }
 
+static void bind_numa(ram_addr_t ram_addr)
+{
+#ifdef CONFIG_NUMA
+    int i;
+    char* ram_ptr;
+    ram_addr_t len, ram_offset;
+    int bind_mode;
+
+    ram_ptr = qemu_get_ram_ptr(ram_addr);
+
+    ram_offset = 0;
+    for (i = 0; i < nb_numa_nodes; i++) {
+        len = numa_info[i].guest_mem;
+        if (numa_info[i].flags != 0) {
+            switch (numa_info[i].flags & NODE_HOST_POLICY_MASK) {
+            case NODE_HOST_BIND:
+                bind_mode = MPOL_BIND;
+                break;
+            case NODE_HOST_INTERLEAVE:
+                bind_mode = MPOL_INTERLEAVE;
+                break;
+            case NODE_HOST_PREFERRED:
+                bind_mode = MPOL_PREFERRED;
+                break;
+            default:
+                bind_mode = MPOL_DEFAULT;
+                break;
+            }
+            bind_mode |= (numa_info[i].flags & NODE_HOST_RELATIVE) ?
+                MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES;
+
+            /* Linux' mbind historically ignores the topmost bit covered
+             * by maxnode, so we pass one node more than configured. A
+             * fixed kernel would then read bit numa_num_configured_nodes()
+             * as well, so make sure that extra bit is cleared.
+             */
+            clear_bit(numa_num_configured_nodes(), numa_info[i].host_mem);
+            if (mbind(ram_ptr + ram_offset, len, bind_mode,
+                numa_info[i].host_mem, numa_num_configured_nodes() + 1, 0))
+                    perror("mbind");
+        }
+        ram_offset += len;
+    }
+#endif
+    return;
+}
+
 void pc_memory_init(ram_addr_t ram_size,
                     const char *kernel_filename,
                     const char *kernel_cmdline,
@@ -919,6 +975,8 @@ void pc_memory_init(ram_addr_t ram_size,
     cpu_register_physical_memory(0x100000,
                  below_4g_mem_size - 0x100000,
                  ram_addr + 0x100000);
+    bind_numa(ram_addr);
+
 #if TARGET_PHYS_ADDR_BITS > 32
     cpu_register_physical_memory(0x100000000ULL, above_4g_mem_size,
                                  ram_addr + below_4g_mem_size);
-- 
1.6.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to