Add support for Specific Purpose Memory (SPM) through the NUMA node configuration. When 'spm=on' is specified for a NUMA node, QEMU sets the E820 entry type for that node's memory region to E820_SOFT_RESERVED.
This allows guest operating systems to recognize the memory as soft reserved memory, which can be used for device-specific memory management. The implementation directly iterates over NUMA nodes to update E820 entries, avoiding unnecessary RAMBlock traversal and flags, as suggested in code review. Usage: -numa node,nodeid=0,memdev=m1,spm=on Signed-off-by: fanhuang <[email protected]> --- hw/core/numa.c | 3 ++ hw/i386/e820_memory_layout.c | 73 ++++++++++++++++++++++++++++++++++++ hw/i386/e820_memory_layout.h | 2 + hw/i386/pc.c | 55 +++++++++++++++++++++++++++ include/system/numa.h | 1 + qapi/machine.json | 5 +++ qemu-options.hx | 11 +++++- 7 files changed, 148 insertions(+), 2 deletions(-) diff --git a/hw/core/numa.c b/hw/core/numa.c index 218576f745..e680130460 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -163,6 +163,9 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, numa_info[nodenr].node_memdev = MEMORY_BACKEND(o); } + /* Store spm configuration for later processing */ + numa_info[nodenr].is_spm = node->has_spm && node->spm; + numa_info[nodenr].present = true; max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1); ms->numa_state->num_nodes++; diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c index 3e848fb69c..5b090ac6df 100644 --- a/hw/i386/e820_memory_layout.c +++ b/hw/i386/e820_memory_layout.c @@ -46,3 +46,76 @@ bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length) } return false; } + +bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type) +{ + uint64_t end = start + length; + bool updated = false; + assert(!e820_done); + + /* For E820_SOFT_RESERVED, validate range is within E820_RAM */ + if (new_type == E820_SOFT_RESERVED) { + bool range_in_ram = false; + for (size_t j = 0; j < e820_entries; j++) { + uint64_t ram_start = le64_to_cpu(e820_table[j].address); + uint64_t ram_end = ram_start + le64_to_cpu(e820_table[j].length); + uint32_t ram_type = 
le32_to_cpu(e820_table[j].type); + + if (ram_type == E820_RAM && ram_start <= start && ram_end >= end) { + range_in_ram = true; + break; + } + } + if (!range_in_ram) { + return false; + } + } + + /* Find entry that contains the target range and update it */ + for (size_t i = 0; i < e820_entries; i++) { + uint64_t entry_start = le64_to_cpu(e820_table[i].address); + uint64_t entry_length = le64_to_cpu(e820_table[i].length); + uint64_t entry_end = entry_start + entry_length; + + if (entry_start <= start && entry_end >= end) { + uint32_t original_type = e820_table[i].type; + + /* Remove original entry */ + memmove(&e820_table[i], &e820_table[i + 1], + (e820_entries - i - 1) * sizeof(struct e820_entry)); + e820_entries--; + + /* Add split parts inline */ + if (entry_start < start) { + e820_table = g_renew(struct e820_entry, e820_table, + e820_entries + 1); + e820_table[e820_entries].address = cpu_to_le64(entry_start); + e820_table[e820_entries].length = + cpu_to_le64(start - entry_start); + e820_table[e820_entries].type = original_type; + e820_entries++; + } + + e820_table = g_renew(struct e820_entry, e820_table, + e820_entries + 1); + e820_table[e820_entries].address = cpu_to_le64(start); + e820_table[e820_entries].length = cpu_to_le64(length); + e820_table[e820_entries].type = cpu_to_le32(new_type); + e820_entries++; + + if (end < entry_end) { + e820_table = g_renew(struct e820_entry, e820_table, + e820_entries + 1); + e820_table[e820_entries].address = cpu_to_le64(end); + e820_table[e820_entries].length = cpu_to_le64(entry_end - end); + e820_table[e820_entries].type = original_type; + e820_entries++; + } + + updated = true; + break; + } + } + + return updated; +} diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h index b50acfa201..657cc679e2 100644 --- a/hw/i386/e820_memory_layout.h +++ b/hw/i386/e820_memory_layout.h @@ -15,6 +15,7 @@ #define E820_ACPI 3 #define E820_NVS 4 #define E820_UNUSABLE 5 +#define E820_SOFT_RESERVED 0xEFFFFFFF struct 
e820_entry { uint64_t address; @@ -26,5 +27,6 @@ void e820_add_entry(uint64_t address, uint64_t length, uint32_t type); bool e820_get_entry(int index, uint32_t type, uint64_t *address, uint64_t *length); int e820_get_table(struct e820_entry **table); +bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type); #endif diff --git a/hw/i386/pc.c b/hw/i386/pc.c index f8b919cb6c..ccb2af2a56 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -791,6 +791,58 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size) return pc_above_4g_end(pcms) - 1; } +/* + * Update E820 entries for NUMA nodes marked as SPM (Specific Purpose Memory). + * This function directly iterates over NUMA nodes instead of RAMBlocks, + * as suggested by code review to simplify the implementation. + */ +static void pc_update_spm_memory(X86MachineState *x86ms) +{ + MachineState *ms = MACHINE(x86ms); + uint64_t addr = 0; + + for (int i = 0; i < ms->numa_state->num_nodes; i++) { + NodeInfo *numa_info = &ms->numa_state->nodes[i]; + uint64_t node_size = numa_info->node_mem; + + /* Process SPM nodes */ + if (numa_info->is_spm && numa_info->node_memdev) { + uint64_t guest_addr; + + /* Calculate guest physical address accounting for PCI hole */ + if (addr < x86ms->below_4g_mem_size) { + if (addr + node_size <= x86ms->below_4g_mem_size) { + /* Entirely below 4GB */ + guest_addr = addr; + } else { + /* Spans across 4GB boundary - should not happen with proper config */ + warn_report("SPM node %d spans 4GB boundary, " + "using address above 4GB", i); + guest_addr = 0x100000000ULL + + (addr + node_size - x86ms->below_4g_mem_size); + } + } else { + /* Above 4GB, account for PCI hole */ + guest_addr = 0x100000000ULL + + (addr - x86ms->below_4g_mem_size); + } + + /* Update E820 entry type to SOFT_RESERVED */ + if (!e820_update_entry_type(guest_addr, node_size, + E820_SOFT_RESERVED)) { + warn_report("Failed to update E820 entry for SPM node %d " + "at 0x%" PRIx64 " length 0x%" 
PRIx64, + i, guest_addr, node_size); + } + } + + /* Accumulate address for next node */ + if (numa_info->node_memdev) { + addr += node_size; + } + } +} + /* * AMD systems with an IOMMU have an additional hole close to the * 1Tb, which are special GPAs that cannot be DMA mapped. Depending @@ -907,6 +959,9 @@ void pc_memory_init(PCMachineState *pcms, e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED); } + /* Update E820 for NUMA nodes marked as SPM */ + pc_update_spm_memory(x86ms); + if (!pcmc->has_reserved_memory && (machine->ram_slots || (machine->maxram_size > machine->ram_size))) { diff --git a/include/system/numa.h b/include/system/numa.h index 1044b0eb6e..438511a756 100644 --- a/include/system/numa.h +++ b/include/system/numa.h @@ -41,6 +41,7 @@ typedef struct NodeInfo { bool present; bool has_cpu; bool has_gi; + bool is_spm; uint8_t lb_info_provided; uint16_t initiator; uint8_t distance[MAX_NODES]; diff --git a/qapi/machine.json b/qapi/machine.json index 907cb25f75..98c2367ee6 100644 --- a/qapi/machine.json +++ b/qapi/machine.json @@ -500,6 +500,10 @@ # @memdev: memory backend object. If specified for one node, it must # be specified for all nodes. # +# @spm: if true, mark the memory region of this node as Specific +# Purpose Memory (SPM). This will set the E820 type to +# E820_SOFT_RESERVED for guest OS. (default: false, since 9.2) +# # @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, points # to the nodeid which has the memory controller responsible for # this NUMA node. 
This field provides additional information as @@ -514,6 +518,7 @@ '*cpus': ['uint16'], '*mem': 'size', '*memdev': 'str', + '*spm': 'bool', '*initiator': 'uint16' }} ## diff --git a/qemu-options.hx b/qemu-options.hx index fca2b7bc74..7d914a9bc6 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -431,7 +431,7 @@ ERST DEF("numa", HAS_ARG, QEMU_OPTION_numa, "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" - "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" + "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node][,spm=on|off]\n" "-numa dist,src=source,dst=destination,val=distance\n" "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n" "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n" @@ -440,7 +440,7 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa, SRST ``-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]`` \ -``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]`` +``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator][,spm=on|off]`` \ ``-numa dist,src=source,dst=destination,val=distance`` \ @@ -508,6 +508,13 @@ SRST largest bandwidth) to this NUMA node. Note that this option can be set only when the machine property 'hmat' is set to 'on'. + '\ ``spm``\ ' option marks the memory region of this NUMA node as + Specific Purpose Memory (SPM). When enabled, the memory will be + reported as soft reserved (E820 type 0xEFFFFFFF) to the guest OS, + which can then manage it separately from normal system RAM. This is + useful for device-specific memory that should not be used as general + purpose memory. This option is only supported on x86 platforms. + Following example creates a machine with 2 NUMA nodes, node 0 has CPU. 
node 1 has only memory, and its initiator is node 0. Note that because node 0 has CPU, by default the initiator of node 0 is itself -- 2.34.1
