Hello,

Den 2026-05-12 kl. 10:24, skrev Thomas Hellström:
> Add an optional reclaim callback to struct dmem_cgroup_region. When
> dmem.max is set below the current usage of a cgroup pool, the new limit
> is applied immediately (so that concurrent allocations are throttled
> while reclaim is in progress) and then the driver is asked to evict
> memory to bring usage back below the limit.
> 
> Reclaim is attempted up to a bounded number of times. No error is
> returned to userspace if usage remains above the limit after reclaim,
> and a pending signal will abort the reclaim loop early. This matches
> the behavior of memory.max in the memory cgroup controller.
> 
> Also honor O_NONBLOCK so that if that flag is set during the
> max value write, no reclaim is initiated. The idea is to avoid
> charging the reclaim cost to the writer of the max value.
> 
> v2:
> - Write max before reclaim is attempted (Maarten)
> - Let signals abort the reclaim without error (Maarten)
> - If a new max value is written with the O_NONBLOCK flag,
>   reclaim is not attempted (Maarten)
> - Extract region from the pool parameter rather than
>   passing it explicitly to set_resource_xxx().
> v3:
> - Use an rwsem to protect reclaim callback registration and
>   region unregister against concurrent reclaim invocations,
>   ensuring reclaim_priv is visible when the callback is
>   invoked. (Sashiko-bot)
> 
> Assisted-by: GitHub_Copilot:claude-sonnet-4.6
> Signed-off-by: Thomas Hellström <[email protected]>
> ---
>  include/linux/cgroup_dmem.h |  24 ++++++++
>  kernel/cgroup/dmem.c        | 106 +++++++++++++++++++++++++++++++++---
>  2 files changed, 121 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/cgroup_dmem.h b/include/linux/cgroup_dmem.h
> index dd4869f1d736..c3bce21cbe80 100644
> --- a/include/linux/cgroup_dmem.h
> +++ b/include/linux/cgroup_dmem.h
> @@ -14,6 +14,21 @@ struct dmem_cgroup_pool_state;
>  /* Opaque definition of a cgroup region, used internally */
>  struct dmem_cgroup_region;
>  
> +/**
> + * typedef dmem_cgroup_reclaim_fn_t - Reclaim callback for a dmem cgroup 
> region.
> + * @pool: The cgroup pool that needs memory reclaimed.
> + * @target_bytes: Minimum number of bytes the driver should attempt to free.
> + * @priv: Private data registered with dmem_cgroup_region_set_reclaim().
> + *
> + * Called by the dmem cgroup controller when dmem.max is set below the 
> current
> + * usage of @pool. The driver should evict at least @target_bytes of memory
> + * from @pool. May be called multiple times if usage remains above the limit.
> + *
> + * Return: 0 if progress was made, negative error code otherwise.
> + */
> +typedef int (*dmem_cgroup_reclaim_fn_t)(struct dmem_cgroup_pool_state *pool,
> +                                     u64 target_bytes, void *priv);
> +
>  #if IS_ENABLED(CONFIG_CGROUP_DMEM)
>  struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char 
> *name_fmt, ...) __printf(2,3);
>  void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region);
> @@ -26,6 +41,9 @@ bool dmem_cgroup_state_evict_valuable(struct 
> dmem_cgroup_pool_state *limit_pool,
>                                     bool ignore_low, bool *ret_hit_low);
>  
>  void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool);
> +void dmem_cgroup_region_set_reclaim(struct dmem_cgroup_region *region,
> +                                 dmem_cgroup_reclaim_fn_t reclaim,
> +                                 void *priv);
>  #else
>  static inline __printf(2,3) struct dmem_cgroup_region *
>  dmem_cgroup_register_region(u64 size, const char *name_fmt, ...)
> @@ -62,5 +80,11 @@ bool dmem_cgroup_state_evict_valuable(struct 
> dmem_cgroup_pool_state *limit_pool,
>  static inline void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state 
> *pool)
>  { }
>  
> +static inline void
> +dmem_cgroup_region_set_reclaim(struct dmem_cgroup_region *region,
> +                            dmem_cgroup_reclaim_fn_t reclaim,
> +                            void *priv)
> +{ }
> +
>  #endif
>  #endif       /* _CGROUP_DMEM_H */
> diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
> index 1ab1fb47f271..5fd5a1634d21 100644
> --- a/kernel/cgroup/dmem.c
> +++ b/kernel/cgroup/dmem.c
> @@ -51,6 +51,20 @@ struct dmem_cgroup_region {
>        * No new pools should be added to the region afterwards.
>        */
>       bool unregistered;
> +
> +     /**
> +      * @reclaim: Optional callback invoked when dmem.max is set below the
> +      * current usage of a pool. The driver should attempt to free at least
> +      * @target_bytes from @pool. May be called multiple times if usage
> +      * remains above the limit after returning.
> +      */
> +     dmem_cgroup_reclaim_fn_t reclaim;
> +
> +     /** @reclaim_priv: Private data passed to @reclaim. */
> +     void *reclaim_priv;
> +
> +     /** @unregister_sem: Protect @reclaim while it is running. */
> +     struct rw_semaphore unregister_sem;
>  };
>  
>  struct dmemcg_state {
> @@ -145,21 +159,58 @@ static void free_cg_pool(struct dmem_cgroup_pool_state 
> *pool)
>  }
>  
>  static void
> -set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val)
> +set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val, bool nonblock)
>  {
>       page_counter_set_min(&pool->cnt, val);
>  }
>  
>  static void
> -set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val)
> +set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val, bool nonblock)
>  {
>       page_counter_set_low(&pool->cnt, val);
>  }
>  
>  static void
> -set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
> +set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val, bool nonblock)
>  {
> -     page_counter_set_max(&pool->cnt, val);
> +     struct dmem_cgroup_region *region = pool->region;
> +
> +     /*
> +      * Always update the limit, even if usage currently exceeds it.
> +      * Concurrent allocations will be throttled against the new limit
> +      * while reclaim is in progress.
> +      */
> +     xchg(&pool->cnt.max, (unsigned long)val);
> +
> +     if (nonblock || !READ_ONCE(region->reclaim))
> +             return;
> +
> +     for (int retries = 5; retries > 0; retries--) {
Where does 5 come from? This code should retry until no longer above limit, 
otherwise you'll get some hard to debug issues.

> +             u64 usage = page_counter_read(&pool->cnt);
> +             int ret;
> +
> +             if (usage <= val)
> +                     break;
> +
> +             if (signal_pending(current))
> +                     break;
> +
> +             /* Block unregister until the reclaim callback completes. */
> +             if (down_read_interruptible(&region->unregister_sem))
> +                     break;
> +
> +             if (!region->reclaim) {
> +                     up_read(&region->unregister_sem);
> +                     break;
> +             }
> +
> +             ret = region->reclaim(pool, usage - val, region->reclaim_priv);
> +             up_read(&region->unregister_sem);
> +             if (ret)
> +                     break;
> +
> +             cond_resched();
> +     }
>  }
>  
>  static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
> @@ -184,9 +235,9 @@ static u64 get_resource_current(struct 
> dmem_cgroup_pool_state *pool)
>  
>  static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
>  {
> -     set_resource_min(rpool, 0);
> -     set_resource_low(rpool, 0);
> -     set_resource_max(rpool, PAGE_COUNTER_MAX);
> +     set_resource_min(rpool, 0, false);
> +     set_resource_low(rpool, 0, false);
> +     set_resource_max(rpool, PAGE_COUNTER_MAX, false);
>  }
>  
>  static void dmemcs_offline(struct cgroup_subsys_state *css)
> @@ -491,6 +542,12 @@ void dmem_cgroup_unregister_region(struct 
> dmem_cgroup_region *region)
>       region->unregistered = true;
>       spin_unlock(&dmemcg_lock);
>  
> +     /* Ensure all reclaim() callbacks have finished. */
> +     down_write(&region->unregister_sem);
> +     /* Pairs with READ_ONCE() in set_resource_max() */
> +     WRITE_ONCE(region->reclaim, NULL);
> +     up_write(&region->unregister_sem);
> +
>       kref_put(&region->ref, dmemcg_free_region);
>  }
I've thought about it some more, Can we do the same as dma-buf init?

DEFINE_DMEMCG_REGION_INFO(info);
info.size = size.
info.ops = &drm_ttm_dmem_region_ops;
info.region_priv = ttm_region;
info.device_priv = drm_dev;

dmem_region = dmem_cgroup_register_region(&info);

This way we don't need to have a typedef for function pointers,
no need for READ_ONCE() and/or additional locking, which was only
added because it wasn't set at init.

If we can push the responsibility for serialization against unload
to the driver, we should also be able to use drm_dev_enter/exit here
for the reclaim loop?

Something like below:

if (!ops->device_begin(device_priv, &cookie))
        return 0; // Device gone

while (true) {
        ops->reclaim(region_priv, ...);
}

ops->device_end(device_priv, cookie);

Although we will additionally need to ensure that the region holds a refcount on
reclaim_priv until dmemcg_free_region is called, otherwise this breaks.

So 4 ops needed:
- device_begin
- reclaim
- device_end
- free (called after region refcount drops to 0, called immediately on 
!CONFIG_DMEMCG, drops device refcount)

Relatedly, I believe perhaps we should also convert from drmm managed to devm 
managed,
as all memory is already freed after the device is physically detached.

Hopefully this solves all lifetime issues, and this design allows for
additional callbacks into the device or region later on if needed.

Kind regards,
~Maarten Lankhorst

>  EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region);
> @@ -530,6 +587,7 @@ struct dmem_cgroup_region 
> *dmem_cgroup_register_region(u64 size, const char *fmt
>       INIT_LIST_HEAD(&ret->pools);
>       ret->name = region_name;
>       ret->size = size;
> +     init_rwsem(&ret->unregister_sem);
>       kref_init(&ret->ref);
>  
>       spin_lock(&dmemcg_lock);
> @@ -568,6 +626,34 @@ void dmem_cgroup_pool_state_put(struct 
> dmem_cgroup_pool_state *pool)
>  }
>  EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);
>  
> +/**
> + * dmem_cgroup_region_set_reclaim() - Register a reclaim callback on a 
> region.
> + * @region: The region to register the callback for.
> + * @reclaim: Callback to invoke when dmem.max is set below current usage.
> + *           Called with the pool that needs reclaiming and the number of
> + *           bytes to free. Returns 0 on progress, negative on failure.
> + * @priv: Opaque pointer passed back to @reclaim.
> + *
> + * When dmem.max is lowered below the current usage of a cgroup pool, the
> + * dmem controller will call @reclaim with a target number of bytes to free.
> + * After @reclaim returns the controller retries setting the limit; if usage
> + * is still too high it calls @reclaim again, up to a bounded retry count.
> + */
> +void dmem_cgroup_region_set_reclaim(struct dmem_cgroup_region *region,
> +                                 dmem_cgroup_reclaim_fn_t reclaim,
> +                                 void *priv)
> +{
> +     if (!region)
> +             return;
> +
> +     down_write(&region->unregister_sem);
> +     region->reclaim_priv = priv;
> +     /* Pairs with READ_ONCE() in set_resource_max() */
> +     WRITE_ONCE(region->reclaim, reclaim);
> +     up_write(&region->unregister_sem);
> +}
> +EXPORT_SYMBOL_GPL(dmem_cgroup_region_set_reclaim);
> +
>  static struct dmem_cgroup_pool_state *
>  get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region 
> *region)
>  {
> @@ -725,9 +811,10 @@ static int dmemcg_parse_limit(char *options, u64 
> *new_limit)
>  
>  static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
>                                char *buf, size_t nbytes, loff_t off,
> -                              void (*apply)(struct dmem_cgroup_pool_state *, 
> u64))
> +                              void (*apply)(struct dmem_cgroup_pool_state *, 
> u64, bool))
>  {
>       struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
> +     bool nonblock = of->file->f_flags & O_NONBLOCK;
>       int err = 0;
>  
>       while (buf && !err) {
> @@ -772,7 +859,8 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file 
> *of,
>               }
>  
>               /* And commit */
> -             apply(pool, new_limit);
> +             apply(pool, new_limit, nonblock);
> +
>               dmemcg_pool_put(pool);
>  
>  out_put:

Reply via email to