On 6/24/26 16:57, Gregory Price wrote:
> offline_and_remove_memory() handles a single contiguous range.
> 
> Callers that manage a device composed of several ranges (dax/kmem)
> currently have to call it in a loop, which gives up atomicity.
> 
> In addition to pushing rollback logic into the driver, the lack
> of atomicity creates a race condition between system daemons trying
> to manage the same resource:
> 
>    - Manager 1:  Offlines memory blocks.    Removes device.
>                                         ^^^^
>    - Manager 2:  Detects offline memory blocks, re-onlines them.
> 
> Add offline_and_remove_memory_ranges(), which takes an array of ranges
> and processes them as one operation under a single lock_device_hotplug():
> 
>   - Phase 1 offlines every block of every range.
>   - Phase 2 removes the ranges only if all ranges are offline.
>   - If any offline fails, the whole operation is reverted.
> 
> This gives callers all-or-nothing semantics for the offline step, so a
> failed or interrupted unplug leaves the device in a consistent state.
> 
> This also resolves the battling managers race - the second manager's
> operation simply fails when the block is destroyed / cannot be onlined.
> 
> offline_and_remove_memory() becomes a thin wrapper that passes its single
> range to the new helper, so the offline/rollback logic lives in one place.
> 
> Suggested-by: David Hildenbrand (Arm) <[email protected]>
> Signed-off-by: Gregory Price <[email protected]>
> ---
>  include/linux/memory_hotplug.h |  7 +++
>  mm/memory_hotplug.c            | 94 ++++++++++++++++++++++++----------
>  2 files changed, 74 insertions(+), 27 deletions(-)
> 
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index d3edeb80aadb..7f1da7c428dc 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -267,6 +267,7 @@ extern int offline_pages(unsigned long start_pfn, 
> unsigned long nr_pages,
>  extern int remove_memory(u64 start, u64 size);
>  extern void __remove_memory(u64 start, u64 size);
>  extern int offline_and_remove_memory(u64 start, u64 size);
> +int offline_and_remove_memory_ranges(const struct range *ranges, int 
> nr_ranges);
>  
>  #else
>  static inline void try_offline_node(int nid) {}
> @@ -283,6 +284,12 @@ static inline int remove_memory(u64 start, u64 size)
>  }
>  
>  static inline void __remove_memory(u64 start, u64 size) {}
> +
> +static inline int offline_and_remove_memory_ranges(const struct range 
> *ranges,
> +                                                int nr_ranges)

Best to use "unsigned int" right from the start and use two tabs to indent.


> +{
> +     return -EBUSY;
> +}
>  #endif /* CONFIG_MEMORY_HOTREMOVE */
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index a66346def504..7d56e0c6ede0 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -2429,58 +2429,98 @@ static int try_reonline_memory_block(struct 
> memory_block *mem, void *arg)
>   */
>  int offline_and_remove_memory(u64 start, u64 size)
>  {
> -     const unsigned long mb_count = size / memory_block_size_bytes();
> +     struct range range = { .start = start, .end = start + size - 1 };

I prefer this more readable as:

struct range range = {
        .start = start,
        .end = start + size - 1,
};

> +
> +     return offline_and_remove_memory_ranges(&range, 1);
> +}
> +EXPORT_SYMBOL_GPL(offline_and_remove_memory);
> +
> +/**
> + * offline_and_remove_memory_ranges - offline and remove multiple memory 
> ranges
> + * @ranges: array of physical address ranges to offline and remove
> + * @nr_ranges: number of entries in @ranges
> + *
> + * Offline and remove several memory ranges as one operation, serialized
> + * against other hotplug operations by a single lock_device_hotplug().
> + *
> + * This offlines all ranges before removing any of them.  If offlining any
> + * range fails, the entire process is reverted and nothing is removed.
> + * This provides a fully atomic semantic for unplugging an entire device.
> + *
> + * Each range must be memory-block aligned in start and size.
> + *
> + * Return: 0 on success, negative errno otherwise.  On failure no range has
> + * been removed.
> + */
> +int offline_and_remove_memory_ranges(const struct range *ranges, int 
> nr_ranges)
> +{
> +     unsigned long mb_total = 0;
>       uint8_t *online_types, *tmp;
> -     int rc;
> +     int i, rc = 0;
>  
> -     if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
> -         !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
> +     if (!ranges || nr_ranges <= 0)

With "unsigned int" this will be !nr_ranges.

Wondering whether we would WARN_ON_ONCE() here.

>               return -EINVAL;
>  
> +     for (i = 0; i < nr_ranges; i++) {
> +             u64 start = ranges[i].start;
> +             u64 size = range_len(&ranges[i]);

Both can be const.

> +
> +             if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
> +                 !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
> +                     return -EINVAL;
> +             mb_total += size / memory_block_size_bytes();
> +     }
> +
>       /*
> -      * We'll remember the old online type of each memory block, so we can
> -      * try to revert whatever we did when offlining one memory block fails
> -      * after offlining some others succeeded.
> +      * Remember the old online type of every memory block across all ranges,
> +      * so we can revert if offlining a later block fails.  All entries start
> +      * as MMOP_OFFLINE so blocks we never touched are skipped on rollback.
>        */
> -     online_types = kmalloc_array(mb_count, sizeof(*online_types),
> +     online_types = kmalloc_array(mb_total, sizeof(*online_types),
>                                    GFP_KERNEL);

Is "mb_total" really more expressive than "mb_count"?

>       if (!online_types)
>               return -ENOMEM;
> -     /*
> -      * Initialize all states to MMOP_OFFLINE, so when we abort processing in
> -      * try_offline_memory_block(), we'll skip all unprocessed blocks in
> -      * try_reonline_memory_block().
> -      */
> -     memset(online_types, MMOP_OFFLINE, mb_count);
> +     memset(online_types, MMOP_OFFLINE, mb_total);
>  
>       lock_device_hotplug();
>  
> +     /* Phase 1: offline every block in every range. */
>       tmp = online_types;
> -     rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
> +     for (i = 0; i < nr_ranges; i++) {
> +             rc = walk_memory_blocks(ranges[i].start, range_len(&ranges[i]),
> +                                     &tmp, try_offline_memory_block);
> +             if (rc)
> +                     break;
> +     }
>  
>       /*
> -      * In case we succeeded to offline all memory, remove it.
> -      * This cannot fail as it cannot get onlined in the meantime.
> +      * Phase 2: Remove each range. This essentially cannot fail as we hold
> +      * the hotplug lock . WARN if that assumption is ever broken.
>        */
>       if (!rc) {
> -             rc = try_remove_memory(start, size);
> -             if (rc)
> -                     pr_err("%s: Failed to remove memory: %d", __func__, rc);
> +             for (i = 0; i < nr_ranges; i++) {
> +                     rc = try_remove_memory(ranges[i].start,
> +                                            range_len(&ranges[i]));
> +                     if (WARN_ON_ONCE(rc)) {
> +                             pr_err("%s: Failed to remove memory: %d",
> +                                    __func__, rc);
> +                             break;

Do we really want to break? I'd say, just warn and continue, and fake rc == 0.
Something is seriously messed up already, and we partially removed memory. There
is no clean rollback possible.

Similar to __remove_memory(), ignoring the error because it offlined it already.

> +                     }
> +             }
>       }

In general, looks much cleaner to me, thanks!

-- 
Cheers,

David

Reply via email to