The dax kmem driver currently onlines memory automatically during
probe using the system's default online policy but provides no way
to control or query the entire region state at runtime.

Additionally, there is no atomic mechanism to offline and remove
the entire set of memory blocks together.  Instead, this is presently
done in two steps: (offline all, remove all).  This creates a race
condition where external entities can operate directly on the blocks
and cause hot-unplug to fail.

Add a new 'hotplug' sysfs attribute that allows userspace to control
and query the entire memory region state.

The interface accepts the following values on write:
  - "unplug": memory is offline and blocks are not present
  - "online": memory is online as normal system RAM
  - "online_movable": memory is online in ZONE_MOVABLE

Reads report the current state: "unplugged", "online" or
"online_movable".

Valid transitions:
  - unplugged      -> online
  - unplugged      -> online_movable
  - online         -> unplugged
  - online_movable -> unplugged

"offline" (memory blocks exist but are offline by default) is not
supported because it's functionally equivalent to "unplugged" and
entices races between offlining and unplugging.

For backward compatibility, probe still calls dax_kmem_do_hotplug()
when online_type matches mhp_get_default_online_type().

This creates the memory blocks even though the device would otherwise
start in the unplugged state.  Existing userland tools expect the
memory blocks to be present after kmem probe; this behavior can be
deprecated over time.

As with any hot-remove mechanism, the removal can fail and if rollback
fails the system can be left in an inconsistent state.

Unbind Note:
  We used to call remove_memory() during unbind, which would fire a
  BUG() if any of the memory blocks were online at that time.  We lift
  this into a WARN in the cleanup routine and don't attempt hotremove
  unless ->state is DAX_KMEM_UNPLUGGED or MMOP_OFFLINE.

  The resources are still leaked but this prevents deadlock on unbind
  if a memory region happens to be impossible to hotremove.

Suggested-by: Hannes Reinecke <[email protected]>
Suggested-by: David Hildenbrand <[email protected]>
Signed-off-by: Gregory Price <[email protected]>
---
 Documentation/ABI/testing/sysfs-bus-dax |  17 +++
 drivers/dax/kmem.c                      | 164 +++++++++++++++++++++---
 2 files changed, 161 insertions(+), 20 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-dax b/Documentation/ABI/testing/sysfs-bus-dax
index b34266bfae49..faf6f63a368c 100644
--- a/Documentation/ABI/testing/sysfs-bus-dax
+++ b/Documentation/ABI/testing/sysfs-bus-dax
@@ -151,3 +151,20 @@ Description:
                memmap_on_memory parameter for memory_hotplug. This is
                typically set on the kernel command line -
                memory_hotplug.memmap_on_memory set to 'true' or 'force'."
+
+What:          /sys/bus/dax/devices/daxX.Y/hotplug
+Date:          January, 2026
+KernelVersion: v6.21
+Contact:       [email protected]
+Description:
+               (RW) Controls the hotplug state of the memory region.
+               Applies to all memory blocks associated with the device.
+               Only applies to dax_kmem devices.
+
+                States: [unplugged, online, online_movable]
+                Arguments:
+                 "unplug": memory is offline and blocks are not present
+                 "online": memory is online as normal system RAM
+                 "online_movable": memory is online in ZONE_MOVABLE
+
+               Devices must be unplugged before onlining to a different state.
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 8be9286f0ea3..5dbd5b7862fd 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -40,10 +40,16 @@ static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
        return 0;
 }
 
+#define DAX_KMEM_UNPLUGGED     (-1)
+
 struct dax_kmem_data {
        const char *res_name;
        int mgid;
        struct memory_dev_type *mtype;
+       int numa_node;
+       struct dev_dax *dev_dax;
+       int state;
+       struct mutex lock; /* protects hotplug state transitions */
        struct resource *res[];
 };
 
@@ -51,8 +57,10 @@ struct dax_kmem_data {
  * dax_kmem_do_hotplug - hotplug memory for dax kmem device
  * @dev_dax: the dev_dax instance
  * @data: the dax_kmem_data structure with resource tracking
+ * @online_type: MMOP_ONLINE or MMOP_ONLINE_MOVABLE
  *
- * Hotplugs all ranges in the dev_dax region as system memory.
+ * Hotplugs all ranges in the dev_dax region as system memory using
+ * the specified online type.
  *
  * Returns the number of successfully mapped ranges, or negative error.
  */
@@ -64,6 +72,12 @@ static int dax_kmem_do_hotplug(struct dev_dax *dev_dax,
        int i, rc, onlined = 0;
        mhp_t mhp_flags;
 
+       if (data->state == MMOP_ONLINE || data->state == MMOP_ONLINE_MOVABLE)
+               return -EINVAL;
+
+       if (online_type != MMOP_ONLINE && online_type != MMOP_ONLINE_MOVABLE)
+               return -EINVAL;
+
        for (i = 0; i < dev_dax->nr_range; i++) {
                struct range range;
 
@@ -156,9 +170,9 @@ static int dax_kmem_init_resources(struct dev_dax *dev_dax,
  * @dev_dax: the dev_dax instance
  * @data: the dax_kmem_data structure with resource tracking
  *
- * Removes all ranges in the dev_dax region.
+ * Offlines and removes all ranges in the dev_dax region.
  *
- * Returns the number of successfully removed ranges.
+ * Returns the number of successfully removed ranges, or negative error.
  */
 static int dax_kmem_do_hotremove(struct dev_dax *dev_dax,
                                 struct dax_kmem_data *data)
@@ -178,7 +192,7 @@ static int dax_kmem_do_hotremove(struct dev_dax *dev_dax,
                if (!data->res[i])
                        continue;
 
-               rc = remove_memory(range.start, range_len(&range));
+               rc = offline_and_remove_memory(range.start, range_len(&range));
                if (rc == 0) {
                        /* Release the resource for the successfully removed range */
                        remove_resource(data->res[i]);
@@ -214,6 +228,20 @@ static void dax_kmem_cleanup_resources(struct dev_dax *dev_dax,
 {
        int i;
 
+       /*
+        * If the device unbind occurs before memory is hotremoved, we can never
+        * remove the memory (requires reboot).  Attempting an offline operation
+        * here may cause deadlock and a failure to finish the unbind.
+        *
+        * This WARN used to be a BUG called by remove_memory().
+        *
+        * Note: This leaks the resources.
+        */
+       if (WARN(((data->state != DAX_KMEM_UNPLUGGED) &&
+                 (data->state != MMOP_OFFLINE)),
+                "Hotplug memory regions stuck online until reboot"))
+               return;
+
        for (i = 0; i < dev_dax->nr_range; i++) {
                if (!data->res[i])
                        continue;
@@ -223,6 +251,98 @@ static void dax_kmem_cleanup_resources(struct dev_dax *dev_dax,
        }
 }
 
+static int dax_kmem_parse_state(const char *buf)
+{
+       if (sysfs_streq(buf, "unplug"))
+               return DAX_KMEM_UNPLUGGED;
+       if (sysfs_streq(buf, "online"))
+               return MMOP_ONLINE;
+       if (sysfs_streq(buf, "online_movable"))
+               return MMOP_ONLINE_MOVABLE;
+       return -EINVAL;
+}
+
+static ssize_t hotplug_show(struct device *dev,
+                           struct device_attribute *attr, char *buf)
+{
+       struct dax_kmem_data *data = dev_get_drvdata(dev);
+       const char *state_str;
+
+       if (!data)
+               return -ENXIO;
+
+       switch (data->state) {
+       case DAX_KMEM_UNPLUGGED:
+               state_str = "unplugged";
+               break;
+       case MMOP_OFFLINE:
+               state_str = "offline";
+               break;
+       case MMOP_ONLINE:
+               state_str = "online";
+               break;
+       case MMOP_ONLINE_MOVABLE:
+               state_str = "online_movable";
+               break;
+       default:
+               state_str = "unknown";
+               break;
+       }
+
+       return sysfs_emit(buf, "%s\n", state_str);
+}
+
+static ssize_t hotplug_store(struct device *dev, struct device_attribute *attr,
+                            const char *buf, size_t len)
+{
+       struct dev_dax *dev_dax = to_dev_dax(dev);
+       struct dax_kmem_data *data = dev_get_drvdata(dev);
+       int online_type;
+       int rc;
+
+       if (!data)
+               return -ENXIO;
+
+       online_type = dax_kmem_parse_state(buf);
+       if (online_type < DAX_KMEM_UNPLUGGED)
+               return online_type;
+
+       guard(mutex)(&data->lock);
+
+       /* Already in requested state */
+       if (data->state == online_type)
+               return len;
+
+       if (online_type == DAX_KMEM_UNPLUGGED) {
+               rc = dax_kmem_do_hotremove(dev_dax, data);
+               if (rc < 0) {
+                       dev_warn(dev, "hotplug state is inconsistent\n");
+                       return rc;
+               }
+               if (rc < dev_dax->nr_range)
+                       dev_warn(dev, "partial hotremove: %d of %d ranges 
removed\n",
+                                rc, dev_dax->nr_range);
+               else
+                       data->state = DAX_KMEM_UNPLUGGED;
+               return len;
+       }
+
+       /*
+        * online_type is MMOP_ONLINE or MMOP_ONLINE_MOVABLE
+        * Cannot switch between online types without unplugging first
+        */
+       if (data->state == MMOP_ONLINE || data->state == MMOP_ONLINE_MOVABLE)
+               return -EBUSY;
+
+       rc = dax_kmem_do_hotplug(dev_dax, data, online_type);
+       if (rc < 0)
+               return rc;
+
+       data->state = online_type;
+       return len;
+}
+static DEVICE_ATTR_RW(hotplug);
+
 static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 {
        struct device *dev = &dev_dax->dev;
@@ -291,6 +411,10 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
                goto err_reg_mgid;
        data->mgid = rc;
        data->mtype = mtype;
+       data->numa_node = numa_node;
+       data->dev_dax = dev_dax;
+       data->state = DAX_KMEM_UNPLUGGED;
+       mutex_init(&data->lock);
 
        dev_set_drvdata(dev, data);
 
@@ -301,9 +425,17 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
        /*
         * Hotplug using the configured online type for this device.
         */
-       rc = dax_kmem_do_hotplug(dev_dax, data, dev_dax->online_type);
-       if (rc < 0)
-               goto err_hotplug;
+       if (dev_dax->online_type != MMOP_OFFLINE ||
+           dev_dax->online_type == mhp_get_default_online_type()) {
+               rc = dax_kmem_do_hotplug(dev_dax, data, dev_dax->online_type);
+               if (rc < 0)
+                       goto err_hotplug;
+               data->state = dev_dax->online_type;
+       }
+
+       rc = device_create_file(dev, &dev_attr_hotplug);
+       if (rc)
+               dev_warn(dev, "failed to create hotplug sysfs entry\n");
 
        return 0;
 
@@ -324,23 +456,11 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 #ifdef CONFIG_MEMORY_HOTREMOVE
 static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 {
-       int success;
        int node = dev_dax->target_node;
        struct device *dev = &dev_dax->dev;
        struct dax_kmem_data *data = dev_get_drvdata(dev);
 
-       /*
-        * We have one shot for removing memory, if some memory blocks were not
-        * offline prior to calling this function remove_memory() will fail, and
-        * there is no way to hotremove this memory until reboot because device
-        * unbind will succeed even if we return failure.
-        */
-       success = dax_kmem_do_hotremove(dev_dax, data);
-       if (success < dev_dax->nr_range) {
-               dev_err(dev, "Hotplug regions stuck online until reboot\n");
-               return;
-       }
-
+       device_remove_file(dev, &dev_attr_hotplug);
        dax_kmem_cleanup_resources(dev_dax, data);
        memory_group_unregister(data->mgid);
        kfree(data->res_name);
@@ -358,6 +478,10 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 #else
 static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 {
+       struct device *dev = &dev_dax->dev;
+
+       device_remove_file(dev, &dev_attr_hotplug);
+
        /*
         * Without hotremove purposely leak the request_mem_region() for the
         * device-dax range and return '0' to ->remove() attempts. The removal
-- 
2.53.0


Reply via email to