The dax kmem driver currently onlines memory automatically during
probe using the system's default online policy but provides no way
to control or query the entire region state at runtime.

There is no atomic way to offline and remove memory blocks together.

Add a new 'hotplug' sysfs attribute that allows userspace to control
and query the entire memory region state.

The interface supports the following states (the unplugged state is
requested by writing "unplug" and is reported as "unplugged" on read):
  - "unplug": memory is offline and blocks are not present
  - "online": memory is online as normal system RAM
  - "online_movable": memory is online in ZONE_MOVABLE

Valid transitions:
  - unplugged -> online
  - unplugged -> online_movable
  - online    -> unplugged
  - online_movable -> unplugged

"offline" (memory blocks exist but are offline by default) is not
supported because it is functionally equivalent to "unplugged" and
invites races between offlining and unplugging.

The initial state after probe uses mhp_get_default_online_type() to
preserve backwards compatibility - existing systems with auto-online
policies will continue to work as before.

As with any hot-remove mechanism, the removal can fail and if rollback
fails the system can be left in an inconsistent state.

Unbind Note:
  We used to call remove_memory() during unbind, which would fire a
  BUG() if any of the memory blocks were online at that time.  We lift
  this into a WARN in the cleanup routine and don't attempt hotremove
  if ->state is not DAX_KMEM_UNPLUGGED.

  The resources are still leaked but this prevents deadlock on unbind
  if a memory region happens to be impossible to hotremove.

Suggested-by: Hannes Reinecke <[email protected]>
Suggested-by: David Hildenbrand <[email protected]>
Signed-off-by: Gregory Price <[email protected]>
---
 Documentation/ABI/testing/sysfs-bus-dax |  17 +++
 drivers/dax/kmem.c                      | 159 +++++++++++++++++++++---
 2 files changed, 156 insertions(+), 20 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-dax 
b/Documentation/ABI/testing/sysfs-bus-dax
index b34266bfae49..faf6f63a368c 100644
--- a/Documentation/ABI/testing/sysfs-bus-dax
+++ b/Documentation/ABI/testing/sysfs-bus-dax
@@ -151,3 +151,20 @@ Description:
                memmap_on_memory parameter for memory_hotplug. This is
                typically set on the kernel command line -
                memory_hotplug.memmap_on_memory set to 'true' or 'force'."
+
+What:          /sys/bus/dax/devices/daxX.Y/hotplug
+Date:          January, 2026
+KernelVersion: v6.21
+Contact:       [email protected]
+Description:
+               (RW) Controls the hotplug state of the memory region.
+               Applies to all memory blocks associated with the device.
+               Only applies to dax_kmem devices.
+
+                States: [unplugged, online, online_movable]
+                Arguments:
+                 "unplug": memory is offline and blocks are not present
+                 "online": memory is online as normal system RAM
+                 "online_movable": memory is online in ZONE_MOVABLE
+
+               Unplug the device before onlining it in a different state.
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 3929cb8576de..c222ae9d675d 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -44,9 +44,15 @@ static int dax_kmem_range(struct dev_dax *dev_dax, int i, 
struct range *r)
        return 0;
 }
 
+#define DAX_KMEM_UNPLUGGED     (-1)
+
 struct dax_kmem_data {
        const char *res_name;
        int mgid;
+       int numa_node;
+       struct dev_dax *dev_dax;
+       int state;
+       struct mutex lock; /* protects hotplug state transitions */
        struct resource *res[];
 };
 
@@ -69,8 +75,10 @@ static void kmem_put_memory_types(void)
  * dax_kmem_do_hotplug - hotplug memory for dax kmem device
  * @dev_dax: the dev_dax instance
  * @data: the dax_kmem_data structure with resource tracking
+ * @online_type: MMOP_ONLINE or MMOP_ONLINE_MOVABLE
  *
- * Hotplugs all ranges in the dev_dax region as system memory.
+ * Hotplugs all ranges in the dev_dax region as system memory using
+ * the specified online type.
  *
  * Returns the number of successfully mapped ranges, or negative error.
  */
@@ -82,6 +90,12 @@ static int dax_kmem_do_hotplug(struct dev_dax *dev_dax,
        int i, rc, onlined = 0;
        mhp_t mhp_flags;
 
+       if (data->state == MMOP_ONLINE || data->state == MMOP_ONLINE_MOVABLE)
+               return -EINVAL;
+
+       if (online_type != MMOP_ONLINE && online_type != MMOP_ONLINE_MOVABLE)
+               return -EINVAL;
+
        for (i = 0; i < dev_dax->nr_range; i++) {
                struct range range;
 
@@ -174,9 +188,9 @@ static int dax_kmem_init_resources(struct dev_dax *dev_dax,
  * @dev_dax: the dev_dax instance
  * @data: the dax_kmem_data structure with resource tracking
  *
- * Removes all ranges in the dev_dax region.
+ * Offlines and removes all ranges in the dev_dax region.
  *
- * Returns the number of successfully removed ranges.
+ * Returns the number of successfully removed ranges, or negative error.
  */
 static int dax_kmem_do_hotremove(struct dev_dax *dev_dax,
                                 struct dax_kmem_data *data)
@@ -196,7 +210,7 @@ static int dax_kmem_do_hotremove(struct dev_dax *dev_dax,
                if (!data->res[i])
                        continue;
 
-               rc = remove_memory(range.start, range_len(&range));
+               rc = offline_and_remove_memory(range.start, range_len(&range));
                if (rc == 0) {
                        success++;
                        continue;
@@ -228,6 +242,21 @@ static void dax_kmem_cleanup_resources(struct dev_dax 
*dev_dax,
 {
        int i;
 
+       /*
+        * If the device unbind occurs before memory is hotremoved, we can never
+        * remove the memory (requires reboot).  Attempting an offline operation
+        * here may cause deadlock and a failure to finish the unbind.
+        *
+        * This WARN used to be a BUG called by remove_memory().
+        *
+        * Note: This leaks the resources.
+        */
+       if (data->state != DAX_KMEM_UNPLUGGED) {
+               WARN(data->state != DAX_KMEM_UNPLUGGED,
+                    "Hotplug memory regions stuck online until reboot");
+               return;
+       }
+
        for (i = 0; i < dev_dax->nr_range; i++) {
                if (!data->res[i])
                        continue;
@@ -237,6 +266,91 @@ static void dax_kmem_cleanup_resources(struct dev_dax 
*dev_dax,
        }
 }
 
+static int dax_kmem_parse_state(const char *buf)
+{
+       if (sysfs_streq(buf, "unplug"))
+               return DAX_KMEM_UNPLUGGED;
+       if (sysfs_streq(buf, "online"))
+               return MMOP_ONLINE;
+       if (sysfs_streq(buf, "online_movable"))
+               return MMOP_ONLINE_MOVABLE;
+       return -EINVAL;
+}
+
+static ssize_t hotplug_show(struct device *dev,
+                           struct device_attribute *attr, char *buf)
+{
+       struct dax_kmem_data *data = dev_get_drvdata(dev);
+       const char *state_str;
+
+       if (!data)
+               return -ENXIO;
+
+       switch (data->state) {
+       case DAX_KMEM_UNPLUGGED:
+               state_str = "unplugged";
+               break;
+       case MMOP_ONLINE:
+               state_str = "online";
+               break;
+       case MMOP_ONLINE_MOVABLE:
+               state_str = "online_movable";
+               break;
+       default:
+               state_str = "unknown";
+               break;
+       }
+
+       return sysfs_emit(buf, "%s\n", state_str);
+}
+
+static ssize_t hotplug_store(struct device *dev, struct device_attribute *attr,
+                            const char *buf, size_t len)
+{
+       struct dev_dax *dev_dax = to_dev_dax(dev);
+       struct dax_kmem_data *data = dev_get_drvdata(dev);
+       int online_type;
+       int rc;
+
+       if (!data)
+               return -ENXIO;
+
+       online_type = dax_kmem_parse_state(buf);
+       if (online_type < DAX_KMEM_UNPLUGGED)
+               return online_type;
+
+       guard(mutex)(&data->lock);
+
+       /* Already in requested state */
+       if (data->state == online_type)
+               return len;
+
+       if (online_type == DAX_KMEM_UNPLUGGED) {
+               rc = dax_kmem_do_hotremove(dev_dax, data);
+               if (rc < 0) {
+                       dev_warn(dev, "hotplug state is inconsistent\n");
+                       return rc;
+               }
+               data->state = DAX_KMEM_UNPLUGGED;
+               return len;
+       }
+
+       /*
+        * online_type is MMOP_ONLINE or MMOP_ONLINE_MOVABLE
+        * Cannot switch between online types without unplugging first
+        */
+       if (data->state == MMOP_ONLINE || data->state == MMOP_ONLINE_MOVABLE)
+               return -EBUSY;
+
+       rc = dax_kmem_do_hotplug(dev_dax, data, online_type);
+       if (rc < 0)
+               return rc;
+
+       data->state = online_type;
+       return len;
+}
+static DEVICE_ATTR_RW(hotplug);
+
 static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 {
        struct device *dev = &dev_dax->dev;
@@ -246,6 +360,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
        int i, rc;
        int numa_node;
        int adist = MEMTIER_DEFAULT_DAX_ADISTANCE;
+       int online_type;
 
        /*
         * Ensure good NUMA information for the persistent memory.
@@ -304,6 +419,10 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
        if (rc < 0)
                goto err_reg_mgid;
        data->mgid = rc;
+       data->numa_node = numa_node;
+       data->dev_dax = dev_dax;
+       data->state = DAX_KMEM_UNPLUGGED;
+       mutex_init(&data->lock);
 
        dev_set_drvdata(dev, data);
 
@@ -315,9 +434,17 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
         * Hotplug using the system default policy - this preserves backwards
         * for existing users who rely on the default auto-online behavior.
         */
-       rc = dax_kmem_do_hotplug(dev_dax, data, mhp_get_default_online_type());
-       if (rc < 0)
-               goto err_hotplug;
+       online_type = mhp_get_default_online_type();
+       if (online_type != MMOP_OFFLINE) {
+               rc = dax_kmem_do_hotplug(dev_dax, data, online_type);
+               if (rc < 0)
+                       goto err_hotplug;
+               data->state = online_type;
+       }
+
+       rc = device_create_file(dev, &dev_attr_hotplug);
+       if (rc)
+               dev_warn(dev, "failed to create hotplug sysfs entry\n");
 
        return 0;
 
@@ -338,23 +465,11 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 #ifdef CONFIG_MEMORY_HOTREMOVE
 static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 {
-       int success;
        int node = dev_dax->target_node;
        struct device *dev = &dev_dax->dev;
        struct dax_kmem_data *data = dev_get_drvdata(dev);
 
-       /*
-        * We have one shot for removing memory, if some memory blocks were not
-        * offline prior to calling this function remove_memory() will fail, and
-        * there is no way to hotremove this memory until reboot because device
-        * unbind will succeed even if we return failure.
-        */
-       success = dax_kmem_do_hotremove(dev_dax, data);
-       if (success < dev_dax->nr_range) {
-               dev_err(dev, "Hotplug regions stuck online until reboot\n");
-               return;
-       }
-
+       device_remove_file(dev, &dev_attr_hotplug);
        dax_kmem_cleanup_resources(dev_dax, data);
        memory_group_unregister(data->mgid);
        kfree(data->res_name);
@@ -372,6 +487,10 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 #else
 static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 {
+       struct device *dev = &dev_dax->dev;
+
+       device_remove_file(dev, &dev_attr_hotplug);
+
        /*
         * Without hotremove purposely leak the request_mem_region() for the
         * device-dax range and return '0' to ->remove() attempts. The removal
-- 
2.52.0


Reply via email to