N_MEMORY nodes are intended to contain general System RAM. Today, some
device drivers hotplug their memory (marked Specific Purpose or Reserved)
to get access to mm/ services, but don't intend it for general consumption.

Create N_MEMORY_PRIVATE for memory nodes whose memory is not intended for
general consumption. This state is mutually exclusive with N_MEMORY.

Add the node_private infrastructure for N_MEMORY_PRIVATE nodes:

  - struct node_private: Per-node container stored in NODE_DATA(nid),
    holding driver callbacks (ops), owner, and refcount.

  - struct node_private_ops: Initial structure containing only the
    flags field.  Callbacks will be added by subsequent commits as
    each consumer is wired up.

  - folio_is_private_node() / page_is_private_node(): check if a
    folio/page resides on a private node.

  - folio_node_private_ops() / node_private_flags(): retrieve the ops
    vtable or flags for a folio's node.

  - Registration API: node_private_register()/unregister() for drivers
    to claim a private node, plus node_private_set_ops()/clear_ops() to
    attach or detach callbacks. Only one node_private can be registered
    per node - attempting to register a different one returns -EBUSY.

  - sysfs attribute exposing N_MEMORY_PRIVATE node state.

Zonelist construction changes for private nodes are deferred to a
subsequent commit.

Signed-off-by: Gregory Price <[email protected]>
---
 drivers/base/node.c          | 197 ++++++++++++++++++++++++++++++++
 include/linux/mmzone.h       |   4 +
 include/linux/node_private.h | 210 +++++++++++++++++++++++++++++++++++
 include/linux/nodemask.h     |   1 +
 4 files changed, 412 insertions(+)
 create mode 100644 include/linux/node_private.h

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 00cf4532f121..646dc48a23b5 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -22,6 +22,7 @@
 #include <linux/swap.h>
 #include <linux/slab.h>
 #include <linux/memblock.h>
+#include <linux/node_private.h>
 
 static const struct bus_type node_subsys = {
        .name = "node",
@@ -861,6 +862,198 @@ void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn,
                           (void *)&nid, register_mem_block_under_node_hotplug);
        return;
 }
+
+/*
+ * node_private_lock is held by node_private_register(),
+ * node_private_unregister(), node_private_set_ops() and
+ * node_private_clear_ops() while they read or update
+ * pgdat->node_private and the ops pointer inside it.
+ *
+ * node_private_initialized is set to true by node_dev_init();
+ * node_private_register() returns -ENODEV while it is still false.
+ *
+ * NOTE(review): this flag appears to be defined inside the
+ * CONFIG_MEMORY_HOTPLUG section but is assigned unconditionally in
+ * node_dev_init() - confirm CONFIG_MEMORY_HOTPLUG=n still builds.
+ */
+static DEFINE_MUTEX(node_private_lock);
+static bool node_private_initialized;
+
+/**
+ * node_private_register - Register a private node
+ * @nid: Node identifier
+ * @np: The node_private structure (driver-allocated, driver-owned)
+ *
+ * Register a driver for a private node. Only one driver can register
+ * per node. If another driver has already registered (with different np),
+ * -EBUSY is returned. Re-registration with the same np is allowed and
+ * leaves the existing registration (including its refcount) untouched.
+ *
+ * On first registration the refcount is set to 1 and the @np->released
+ * completion is initialised before @np is published in the node's pgdat.
+ *
+ * The driver owns the node_private memory and must ensure it remains valid
+ * until refcount reaches 0 after node_private_unregister().
+ *
+ * Return: 0 on success, -EINVAL if @np is NULL or @nid is not a possible
+ * node, -ENODEV if called before node_dev_init() has run, -EBUSY if the
+ * node is N_MEMORY or is already registered with a different @np.
+ */
+int node_private_register(int nid, struct node_private *np)
+{
+       struct node_private *existing;
+       pg_data_t *pgdat;
+       int ret = 0;
+
+       if (!np || !node_possible(nid))
+               return -EINVAL;
+
+       if (!node_private_initialized)
+               return -ENODEV;
+
+       mutex_lock(&node_private_lock);
+       mem_hotplug_begin();
+
+       /* N_MEMORY_PRIVATE and N_MEMORY are mutually exclusive */
+       if (node_state(nid, N_MEMORY)) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       pgdat = NODE_DATA(nid);
+       existing = rcu_dereference_protected(pgdat->node_private,
+                                            lockdep_is_held(&node_private_lock));
+
+       /* Only one source may register this node */
+       if (existing) {
+               /* Re-registration with the same np succeeds as a no-op */
+               if (existing != np)
+                       ret = -EBUSY;
+               goto out;
+       }
+
+       /* Initialise np fully before publishing it to RCU readers */
+       refcount_set(&np->refcount, 1);
+       init_completion(&np->released);
+
+       rcu_assign_pointer(pgdat->node_private, np);
+       pgdat->private = true;
+
+out:
+       mem_hotplug_done();
+       mutex_unlock(&node_private_lock);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(node_private_register);
+
+/**
+ * node_private_set_ops - Set service callbacks on a registered private node
+ * @nid: Node identifier
+ * @ops: Service callbacks and flags (driver-owned, must outlive registration)
+ *
+ * Installs @ops on the node_private registered for @nid, replacing
+ * whatever ops pointer was set before. The contents of @ops (including
+ * its flags) are not validated here. The node must already be registered
+ * via node_private_register().
+ *
+ * Return: 0 on success, -EINVAL if @ops is NULL or @nid is not a possible
+ * node, -ENODEV if no node_private is registered on @nid.
+ */
+int node_private_set_ops(int nid, const struct node_private_ops *ops)
+{
+       struct node_private *np;
+       int ret = 0;
+
+       if (!ops || !node_possible(nid))
+               return -EINVAL;
+
+       mutex_lock(&node_private_lock);
+       np = rcu_dereference_protected(NODE_DATA(nid)->node_private,
+                                      lockdep_is_held(&node_private_lock));
+       if (!np) {
+               ret = -ENODEV;
+               goto out;
+       }
+
+       /*
+        * np->ops is read without node_private_lock by RCU readers
+        * (folio_node_private_ops(), node_private_flags()), so mark
+        * the store to avoid a torn or compiler-split write.
+        */
+       WRITE_ONCE(np->ops, ops);
+out:
+       mutex_unlock(&node_private_lock);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(node_private_set_ops);
+
+/**
+ * node_private_clear_ops - Clear service callbacks from a private node
+ * @nid: Node identifier
+ * @ops: Expected ops pointer (must match current ops)
+ *
+ * Clears the ops only if @ops matches the currently registered ops,
+ * preventing one service from accidentally clearing another's callbacks.
+ *
+ * Return: 0 on success, -ENODEV if no node_private is registered,
+ * -EINVAL if @nid is not a possible node or @ops does not match.
+ */
+int node_private_clear_ops(int nid, const struct node_private_ops *ops)
+{
+       struct node_private *np;
+       int ret = 0;
+
+       if (!node_possible(nid))
+               return -EINVAL;
+
+       mutex_lock(&node_private_lock);
+       np = rcu_dereference_protected(NODE_DATA(nid)->node_private,
+                                      lockdep_is_held(&node_private_lock));
+       if (!np) {
+               ret = -ENODEV;
+               goto out;
+       }
+
+       /* Writers are serialised by node_private_lock: plain read is fine */
+       if (np->ops != ops) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * np->ops is read without node_private_lock by RCU readers
+        * (folio_node_private_ops(), node_private_flags()), so mark
+        * the store.
+        */
+       WRITE_ONCE(np->ops, NULL);
+out:
+       mutex_unlock(&node_private_lock);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(node_private_clear_ops);
+
+/**
+ * node_private_unregister - Unregister a private node
+ * @nid: Node identifier
+ *
+ * Detach the registered node_private from @nid. This is refused while the
+ * node still carries N_MEMORY_PRIVATE; that state is expected to be
+ * cleared by offline_pages() when the last memory is offlined, not by
+ * this function.
+ *
+ * On success the pointer is unpublished, an RCU grace period elapses, and
+ * the registration reference is dropped. If other references remain, this
+ * function sleeps until the node_private's @released completion fires, so
+ * the driver may free its context once this returns.
+ *
+ * Return: 0 if unregistered (also when @nid is not a possible node or
+ * nothing was registered), -EBUSY if N_MEMORY_PRIVATE is still set
+ * (other memory blocks remain on this node).
+ */
+int node_private_unregister(int nid)
+{
+       struct node_private *np;
+       pg_data_t *pgdat;
+       int ret = 0;
+
+       if (!node_possible(nid))
+               return 0;
+
+       mutex_lock(&node_private_lock);
+       mem_hotplug_begin();
+
+       pgdat = NODE_DATA(nid);
+       np = rcu_dereference_protected(pgdat->node_private,
+                                      lockdep_is_held(&node_private_lock));
+
+       /* Nothing registered: report success without touching the node */
+       if (!np)
+               goto out_unlock;
+
+       /* Memory is still online on this node; the caller must offline it */
+       if (node_state(nid, N_MEMORY_PRIVATE)) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
+
+       rcu_assign_pointer(pgdat->node_private, NULL);
+       pgdat->private = false;
+
+       mem_hotplug_done();
+       mutex_unlock(&node_private_lock);
+
+       /* Let RCU readers that may still hold the old pointer finish */
+       synchronize_rcu();
+
+       /* Drop the registration reference; wait out any remaining holders */
+       if (!refcount_dec_and_test(&np->refcount))
+               wait_for_completion(&np->released);
+       return 0;
+
+out_unlock:
+       mem_hotplug_done();
+       mutex_unlock(&node_private_lock);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(node_private_unregister);
+
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 /**
@@ -959,6 +1152,7 @@ static struct node_attr node_state_attr[] = {
        [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
 #endif
        [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
+       [N_MEMORY_PRIVATE] = _NODE_ATTR(has_private_memory, N_MEMORY_PRIVATE),
        [N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
        [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator,
                                           N_GENERIC_INITIATOR),
@@ -972,6 +1166,7 @@ static struct attribute *node_state_attrs[] = {
        &node_state_attr[N_HIGH_MEMORY].attr.attr,
 #endif
        &node_state_attr[N_MEMORY].attr.attr,
+       &node_state_attr[N_MEMORY_PRIVATE].attr.attr,
        &node_state_attr[N_CPU].attr.attr,
        &node_state_attr[N_GENERIC_INITIATOR].attr.attr,
        NULL
@@ -1007,5 +1202,7 @@ void __init node_dev_init(void)
                        panic("%s() failed to add node: %d\n", __func__, ret);
        }
 
+       node_private_initialized = true;
+
        register_memory_blocks_under_nodes();
 }
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b01cb1e49896..992eb1c5a2c6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -25,6 +25,8 @@
 #include <linux/zswap.h>
 #include <asm/page.h>
 
+struct node_private;
+
 /* Free memory management - zoned buddy allocator.  */
 #ifndef CONFIG_ARCH_FORCE_MAX_ORDER
 #define MAX_PAGE_ORDER 10
@@ -1514,6 +1516,8 @@ typedef struct pglist_data {
        atomic_long_t           vm_stat[NR_VM_NODE_STAT_ITEMS];
 #ifdef CONFIG_NUMA
        struct memory_tier __rcu *memtier;
+       struct node_private __rcu *node_private;
+       bool private;
 #endif
 #ifdef CONFIG_MEMORY_FAILURE
        struct memory_failure_stats mf_stats;
diff --git a/include/linux/node_private.h b/include/linux/node_private.h
new file mode 100644
index 000000000000..6a70ec39d569
--- /dev/null
+++ b/include/linux/node_private.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NODE_PRIVATE_H
+#define _LINUX_NODE_PRIVATE_H
+
+#include <linux/completion.h>
+#include <linux/mm.h>
+#include <linux/nodemask.h>
+#include <linux/rcupdate.h>
+#include <linux/refcount.h>
+
+struct page;
+struct vm_area_struct;
+struct vm_fault;
+
+/**
+ * struct node_private_ops - Callbacks for private node services
+ * @flags: Operation exclusion flags (NP_OPS_* constants).
+ *         NOTE(review): no NP_OPS_* constants are defined alongside this
+ *         structure yet - confirm where they are introduced.
+ *
+ * Services register these callbacks to intercept MM operations that affect
+ * their private nodes. At present the structure carries only @flags; no
+ * callback members are defined yet.
+ *
+ * Flag bits control which MM subsystems may operate on folios on this node.
+ *
+ * The pgdat->node_private pointer is RCU-protected.  Callbacks fall into
+ * three categories based on their calling context:
+ *
+ * Folio-referenced callbacks (RCU released before callback):
+ *   The caller holds a reference to a folio on the private node, which
+ *   pins the node's memory online and prevents node_private teardown.
+ *
+ * Refcounted callbacks (RCU released before callback):
+ *   The caller has no folio on the private node (e.g., folios are on a
+ *   source node being migrated TO this node).  A temporary refcount is
+ *   taken on node_private under rcu_read_lock to keep the structure (and
+ *   the service module) alive across the callback.  node_private_unregister
+ *   waits for all temporary references to drain before returning.
+ *
+ * Non-folio callbacks (rcu_read_lock held during callback):
+ *   No folio reference exists, so rcu_read_lock is held across the
+ *   callback to prevent node_private from being freed.
+ *   These callbacks MUST NOT sleep.
+ */
+struct node_private_ops {
+       unsigned long flags;
+};
+
+/**
+ * struct node_private - Per-node container for N_MEMORY_PRIVATE nodes
+ *
+ * This structure is allocated by the driver and passed to
+ * node_private_register(). The driver owns the memory and must ensure it
+ * remains valid until after node_private_unregister() returns with the
+ * reference count dropped to 0.
+ *
+ * @owner: Opaque driver identifier
+ * @refcount: Reference count (1 = registered; temporary refs for non-folio
+ *             callbacks that may sleep; 0 = fully released)
+ * @released: Signaled when refcount drops to 0; unregister waits on this
+ * @ops: Service callbacks and exclusion flags (NULL until service registers)
+ */
+struct node_private {
+       void *owner;
+       refcount_t refcount;
+       struct completion released;
+       const struct node_private_ops *ops;
+};
+
+#ifdef CONFIG_NUMA
+
+#include <linux/mmzone.h>
+
+/**
+ * folio_is_private_node - Test whether a folio lives on a private node
+ * @folio: Folio whose node is examined
+ *
+ * Return: true when the folio's node has the N_MEMORY_PRIVATE state set.
+ */
+static inline bool folio_is_private_node(struct folio *folio)
+{
+       int nid = folio_nid(folio);
+
+       return node_state(nid, N_MEMORY_PRIVATE);
+}
+
+/**
+ * page_is_private_node - Test whether a page lives on a private node
+ * @page: Page whose node is examined
+ *
+ * Return: true when the page's node has the N_MEMORY_PRIVATE state set.
+ */
+static inline bool page_is_private_node(struct page *page)
+{
+       int nid = page_to_nid(page);
+
+       return node_state(nid, N_MEMORY_PRIVATE);
+}
+
+/**
+ * folio_node_private_ops - Look up the ops installed on a folio's node
+ * @folio: Folio whose node is examined
+ *
+ * The node_private pointer is dereferenced under rcu_read_lock(), but the
+ * returned ops pointer is handed back after the RCU read section ends.
+ * NOTE(review): callers presumably rely on holding a folio reference to
+ * keep the ops valid - confirm against each call site.
+ *
+ * Return: the ops pointer, or NULL if the node has no node_private or no
+ * ops set.
+ */
+static inline const struct node_private_ops *
+folio_node_private_ops(struct folio *folio)
+{
+       const struct node_private_ops *ops = NULL;
+       struct node_private *np;
+
+       rcu_read_lock();
+       np = rcu_dereference(NODE_DATA(folio_nid(folio))->node_private);
+       /* ->ops is updated under node_private_lock, which is not held here */
+       if (np)
+               ops = READ_ONCE(np->ops);
+       rcu_read_unlock();
+
+       return ops;
+}
+
+/**
+ * node_private_flags - Read the ops flags installed on a node
+ * @nid: Node identifier
+ *
+ * Return: the flags word of the ops set on @nid, or 0 if the node has no
+ * node_private or no ops set.
+ */
+static inline unsigned long node_private_flags(int nid)
+{
+       const struct node_private_ops *ops;
+       struct node_private *np;
+       unsigned long flags = 0;
+
+       rcu_read_lock();
+       np = rcu_dereference(NODE_DATA(nid)->node_private);
+       if (np) {
+               /*
+                * Snapshot ->ops exactly once: node_private_clear_ops() can
+                * set it to NULL concurrently, so testing and then
+                * re-loading the field could dereference NULL.
+                */
+               ops = READ_ONCE(np->ops);
+               if (ops)
+                       flags = ops->flags;
+       }
+       rcu_read_unlock();
+
+       return flags;
+}
+
+/* True if any bit of @flag is set in the ops flags of the folio's node. */
+static inline bool folio_private_flags(struct folio *f, unsigned long flag)
+{
+       unsigned long node_flags = node_private_flags(folio_nid(f));
+
+       return (node_flags & flag) != 0;
+}
+
+/* True if any bit of @flag is set in the ops flags of node @nid. */
+static inline bool node_private_has_flag(int nid, unsigned long flag)
+{
+       unsigned long node_flags = node_private_flags(nid);
+
+       return (node_flags & flag) != 0;
+}
+
+/* True if any bit of @flag is set in the ops flags of the zone's node. */
+static inline bool zone_private_flags(struct zone *z, unsigned long flag)
+{
+       unsigned long node_flags = node_private_flags(zone_to_nid(z));
+
+       return (node_flags & flag) != 0;
+}
+
+#else /* !CONFIG_NUMA */
+
+/*
+ * !CONFIG_NUMA stubs: every query reports "not private", with no ops and
+ * no flags.
+ */
+static inline bool folio_is_private_node(struct folio *folio)
+{
+       return false;
+}
+
+static inline bool page_is_private_node(struct page *page)
+{
+       return false;
+}
+
+static inline const struct node_private_ops *
+folio_node_private_ops(struct folio *folio)
+{
+       return NULL;
+}
+
+static inline unsigned long node_private_flags(int nid)
+{
+       return 0;
+}
+
+static inline bool folio_private_flags(struct folio *f, unsigned long flag)
+{
+       return false;
+}
+
+static inline bool node_private_has_flag(int nid, unsigned long flag)
+{
+       return false;
+}
+
+static inline bool zone_private_flags(struct zone *z, unsigned long flag)
+{
+       return false;
+}
+
+#endif /* CONFIG_NUMA */
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+
+int node_private_register(int nid, struct node_private *np);
+int node_private_unregister(int nid);
+int node_private_set_ops(int nid, const struct node_private_ops *ops);
+int node_private_clear_ops(int nid, const struct node_private_ops *ops);
+
+#else /* !CONFIG_NUMA || !CONFIG_MEMORY_HOTPLUG */
+
+/*
+ * Stubs used when CONFIG_NUMA or CONFIG_MEMORY_HOTPLUG is disabled:
+ * registration and ops updates fail with -ENODEV, while unregistering
+ * is a successful no-op.
+ */
+static inline int node_private_register(int nid, struct node_private *np)
+{
+       return -ENODEV;
+}
+
+static inline int node_private_unregister(int nid)
+{
+       return 0;
+}
+
+static inline int node_private_set_ops(int nid,
+                                      const struct node_private_ops *ops)
+{
+       return -ENODEV;
+}
+
+static inline int node_private_clear_ops(int nid,
+                                        const struct node_private_ops *ops)
+{
+       return -ENODEV;
+}
+
+#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
+
+#endif /* _LINUX_NODE_PRIVATE_H */
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index bd38648c998d..c9bcfd5a9a06 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -391,6 +391,7 @@ enum node_states {
        N_HIGH_MEMORY = N_NORMAL_MEMORY,
 #endif
        N_MEMORY,               /* The node has memory(regular, high, movable) */
+       N_MEMORY_PRIVATE,       /* The node's memory is private */
        N_CPU,          /* The node has one or more cpus */
        N_GENERIC_INITIATOR,    /* The node has one or more Generic Initiators */
        NR_NODE_STATES
-- 
2.53.0


Reply via email to