xe: Add a page defragmentation worker

Matthew Brost Tue, 23 Jun 2026 15:27:59 -0700

BOs whose backing TT pages were allocated below the device's beneficial
order (tracked via ttm_tt::beneficial_order_failed) are added to a
per-device defrag list. Add a delayed worker that periodically walks
this list and attempts to re-back each BO with beneficial-order pages,
improving TLB efficiency for long-lived objects that were initially
allocated under memory pressure.


The worker is kicked when the list first transitions to non-empty and
reschedules itself while work remains. On a failed pass it backs off
exponentially (from XE_BO_DEFRAG_INTERVAL_MS up to
XE_BO_DEFRAG_INTERVAL_MAX_MS) and retries the whole list later; the
interval is reset to the default when a run completes its per-run quota
without a failure, or on the next enqueue onto an empty list. The worker
stops scheduling once the list drains.

A defrag move synchronously reallocates and re-copies a BO's backing
store, which is not free. To avoid stalling concurrent active work when
many BOs become eligible at once (e.g. after a burst of memory pressure,
which would otherwise manifest as a large FPS drop while the worker
churns through the whole list), cap the number of BOs processed per run
to XE_BO_DEFRAG_NUM_BO_LIMIT_PER_WORK and reschedule. This spreads the
defrag effort over time, keeping it in the background and yielding to
userspace progress.

Defragmenting a BO forces a TTM move (ctx.defrag) that reallocates the
backing at the beneficial order and relocates the contents on the GPU
(see xe_bo_move()). When beneficial-order pages cannot be obtained the
new allocation is discharged during populate and the BO keeps its
original backing, staying on the list for a later retry.

Cc: Carlos Santa <[email protected]>
Cc: Ryan Neph <[email protected]>
Cc: Christian Koenig <[email protected]>
Cc: Huang Rui <[email protected]>
Cc: Matthew Auld <[email protected]>
Cc: Maarten Lankhorst <[email protected]>
Cc: Maxime Ripard <[email protected]>
Cc: Thomas Zimmermann <[email protected]>
Cc: David Airlie <[email protected]>
Cc: Simona Vetter <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: Thomas Hellström <[email protected]>
Assisted-by: GitHub_Copilot:claude-opus-4.8
Signed-off-by: Matthew Brost <[email protected]>
---
 drivers/gpu/drm/xe/xe_bo.c           | 203 ++++++++++++++++++++++++++-
 drivers/gpu/drm/xe/xe_device_types.h |  11 ++
 2 files changed, 207 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 04709343518c..097cd2ad7c1a 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -31,6 +31,7 @@
 #include "xe_pat.h"
 #include "xe_pm.h"
 #include "xe_preempt_fence.h"
+#include "xe_printk.h"
 #include "xe_pxp.h"
 #include "xe_res_cursor.h"
 #include "xe_shrinker.h"
@@ -49,6 +50,32 @@
  */
 #define XE_BO_DEFRAG_RECLAIM_BACKOFF_THRESHOLD 2
 
+/*
+ * Maximum number of BOs the defrag worker will process in a single run before
+ * yielding and rescheduling itself.
+ *
+ * A defrag move synchronously reallocates and re-copies a BO's backing store,
+ * which is not free. If a large number of BOs become eligible at once (e.g.
+ * after a burst of memory pressure), processing them all in one worker run
+ * would hold things up for a long, unbounded stretch and can visibly starve
+ * concurrent active work (e.g. a large FPS drop). Instead, cap the work done
+ * per run and requeue, spreading the defrag effort out over time so it stays
+ * in the background and yields to userspace progress.
+ */
+#define XE_BO_DEFRAG_NUM_BO_LIMIT_PER_WORK     2
+
+/* Default delay before (re)running the defrag worker, in milliseconds. */
+#define XE_BO_DEFRAG_INTERVAL_MS               50
+
+/*
+ * Upper bound for the (exponentially backed off) defrag worker interval, in
+ * milliseconds, so repeated failures don't push the retry arbitrarily far out.
+ */
+#define XE_BO_DEFRAG_INTERVAL_MAX_MS           15000   /* 15 seconds */
+
+/*
+ * Forward declarations. xe_bo_defrag_worker() is installed as the delayed
+ * work handler by xe_bo_defrag_init() ahead of its definition, and
+ * xe_place_from_ttm_type() is called from xe_bo_defrag_one() (presumably
+ * defined further down in this file).
+ */
+static void xe_bo_defrag_worker(struct work_struct *w);
+static void xe_place_from_ttm_type(u32 mem_type, struct ttm_place *place);
+
 const char *const xe_mem_type_to_name[TTM_NUM_MEM_TYPES]  = {
        [XE_PL_SYSTEM] = "system",
        [XE_PL_TT] = "gtt",
@@ -562,12 +589,23 @@ static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, struct ttm_tt *tt,
        } else {
                struct xe_device *xe = ttm_to_xe_device(ttm_dev);
 
-               if (atomic_read(&xe->mem.defrag.count) >=
-                   XE_BO_DEFRAG_RECLAIM_BACKOFF_THRESHOLD)
+               if (ctx->defrag)
+                       ctx->beneficial_reclaim_backoff = false;
+               else if (atomic_read(&xe->mem.defrag.count) >=
+                        XE_BO_DEFRAG_RECLAIM_BACKOFF_THRESHOLD)
                        ctx->beneficial_reclaim_backoff = true;
 
                ttm_tt_clear_backed_up(tt);
                err = ttm_pool_alloc(&ttm_dev->pool, tt, ctx);
+               if (!err && ctx->defrag && tt->beneficial_order_failed) {
+                       /*
+                        * Defrag could not obtain beneficial-order pages.
+                        * Discharge the new allocation; the caller keeps the
+                        * BO on its original backing.
+                        */
+                       ttm_pool_free(&ttm_dev->pool, tt);
+                       return -ENOMEM;
+               }
        }
        if (err)
                return err;
@@ -961,30 +999,54 @@ static int xe_ttm_bo_purge(struct ttm_buffer_object *ttm_bo, struct ttm_operatio
        return 0;
 }
 
+/*
+ * Release action registered by xe_bo_defrag_init(): synchronously disable the
+ * defrag delayed worker of the xe device passed in @arg.
+ */
+static void xe_bo_defrag_fini(void *arg)
+{
+       struct xe_device *xe = arg;
+       struct delayed_work *dwork = &xe->mem.defrag.worker;
+
+       disable_delayed_work_sync(dwork);
+}
+
 /**
  * xe_bo_defrag_init() - Initialize the device defrag BO tracking
  * @xe: The xe device
  *
- * Initialize the list, lock and count used to track BOs whose backing TT
- * pages were allocated at a sub-optimal order.
+ * Initialize the list, lock, count and delayed worker used to track and
+ * defragment BOs whose backing TT pages were allocated at a sub-optimal order.
  */
 void xe_bo_defrag_init(struct xe_device *xe)
 {
        spin_lock_init(&xe->mem.defrag.lock);
        INIT_LIST_HEAD(&xe->mem.defrag.list);
        atomic_set(&xe->mem.defrag.count, 0);
+       xe->mem.defrag.interval_ms = XE_BO_DEFRAG_INTERVAL_MS;
+       INIT_DELAYED_WORK(&xe->mem.defrag.worker, xe_bo_defrag_worker);
+       devm_add_action_or_reset(xe->drm.dev, xe_bo_defrag_fini, xe);
+}
+
+/*
+ * Queue the defrag worker to run after the currently configured interval
+ * (xe->mem.defrag.interval_ms, converted to jiffies).
+ */
+static void xe_bo_defrag_schedule(struct xe_device *xe)
+{
+       unsigned long delay = msecs_to_jiffies(xe->mem.defrag.interval_ms);
+
+       schedule_delayed_work(&xe->mem.defrag.worker, delay);
 }
 
 static void xe_bo_defrag_add(struct xe_bo *bo)
 {
        struct xe_device *xe = xe_bo_device(bo);
+       bool kick = false;
 
        scoped_guard(spinlock, &xe->mem.defrag.lock) {
                if (list_empty(&bo->defrag_link)) {
+                       /*
+                        * The list is about to go from empty to non-empty:
+                        * restart from the default interval and remember to
+                        * kick the worker once the lock has been dropped.
+                        */
+                       if (list_empty(&xe->mem.defrag.list)) {
+                               kick = true;
+                               xe->mem.defrag.interval_ms =
+                                       XE_BO_DEFRAG_INTERVAL_MS;
+                       }
                        list_add_tail(&bo->defrag_link, &xe->mem.defrag.list);
                        atomic_inc(&xe->mem.defrag.count);
                }
        }
+
+       /* Schedule outside of the defrag spinlock. */
+       if (kick)
+               xe_bo_defrag_schedule(xe);
 }
 
 /**
@@ -1010,9 +1072,10 @@ void xe_bo_defrag_remove(struct xe_bo *bo)
  * xe_bo_defrag_update() - Update defrag list membership for a BO
  * @bo: The buffer object
  *
- * Add @bo to the device defrag list when it has a populated, non-pinned TT of
- * type ttm_bo_type_device whose pages were allocated at a sub-optimal order
- * (tt->beneficial_order_failed). Otherwise ensure it is removed from the list.
+ * Add @bo to the device defrag list when it is a ttm_bo_type_device BO resident
+ * in XE_PL_TT with a populated TT whose pages were allocated at a sub-optimal
+ * order (tt->beneficial_order_failed) and it isn't pinned. Otherwise ensure it
+ * is removed from the list.
  */
 static void xe_bo_defrag_update(struct xe_bo *bo)
 {
@@ -1021,12 +1084,138 @@ static void xe_bo_defrag_update(struct xe_bo *bo)
 
        if (ttm_bo->type == ttm_bo_type_device && tt &&
            ttm_tt_is_populated(tt) && tt->beneficial_order_failed &&
+           ttm_bo->resource && ttm_bo->resource->mem_type == XE_PL_TT &&
            !xe_bo_is_pinned(bo))
                xe_bo_defrag_add(bo);
        else
                xe_bo_defrag_remove(bo);
 }
 
+/*
+ * Attempt to defragment a single BO by forcing a move that reallocates its
+ * backing at the device's beneficial order.
+ *
+ * @xe: The xe device, used for the debug message.
+ * @bo: The BO to defragment. The worker holds a reference across the call.
+ *
+ * The BO lock is taken (uninterruptibly) for the duration of the attempt and
+ * eligibility is re-evaluated under it, since the BO may have changed since
+ * it was put on the defrag list.
+ *
+ * Returns 0 if the BO no longer needs to be tracked (either defragmented or
+ * no longer eligible), or a negative error code if the attempt should be
+ * retried later.
+ */
+static int xe_bo_defrag_one(struct xe_device *xe, struct xe_bo *bo)
+{
+       struct ttm_operation_ctx ctx = {
+               .interruptible = false,
+               .no_wait_gpu = false,
+               .gfp_retry_mayfail = true,
+               .defrag = true,
+       };
+       struct ttm_buffer_object *ttm_bo = &bo->ttm;
+       struct ttm_place place;
+       /* Designated initializer so no placement field is left undefined. */
+       struct ttm_placement placement = {
+               .num_placement = 1,
+               .placement = &place,
+       };
+       int ret = 0;
+
+       xe_bo_lock(bo, false);
+
+       /* Re-check eligibility under the BO lock. */
+       if (xe_bo_is_pinned(bo) || ttm_bo->type != ttm_bo_type_device ||
+           !ttm_bo->resource || ttm_bo->resource->mem_type != XE_PL_TT ||
+           !ttm_bo->ttm || !ttm_tt_is_populated(ttm_bo->ttm) ||
+           !ttm_bo->ttm->beneficial_order_failed) {
+               /* Not a defrag candidate any more: stop tracking it. */
+               xe_bo_defrag_remove(bo);
+               goto unlock;
+       }
+
+       /* Request the memory type the BO already lives in. */
+       xe_place_from_ttm_type(ttm_bo->resource->mem_type, &place);
+
+       /*
+        * On success the move reallocates the backing at beneficial order and
+        * drops the BO from the defrag list. On failure the BO keeps its
+        * original backing and stays on the list for a later retry.
+        */
+       ret = ttm_bo_validate(ttm_bo, &placement, &ctx);
+
+       /* xe_bo_size() is assumed to return size_t, hence %zu. */
+       xe_dbg(xe, "Defrag attempt on BO size=%zu: ret=%pe\n", xe_bo_size(bo),
+              ERR_PTR(ret));
+
+unlock:
+       xe_bo_unlock(bo);
+       return ret;
+}
+
+/*
+ * Delayed-work handler for xe->mem.defrag.worker.
+ *
+ * Takes BOs from the head of xe->mem.defrag.list and runs xe_bo_defrag_one()
+ * on each, up to XE_BO_DEFRAG_NUM_BO_LIMIT_PER_WORK per invocation, then
+ * decides whether to queue itself again (see the comment ahead of the final
+ * scheduling decision).
+ */
+static void xe_bo_defrag_worker(struct work_struct *w)
+{
+       struct delayed_work *dwork = to_delayed_work(w);
+       struct xe_device *xe =
+               container_of(dwork, struct xe_device, mem.defrag.worker);
+       bool requeue = false;
+       int i;
+
+       /*
+        * Process at most XE_BO_DEFRAG_NUM_BO_LIMIT_PER_WORK BOs per run
+        * rather than draining the whole list in one go. Each defrag is
+        * synchronous and relatively expensive, so bounding the work per run
+        * keeps the worker from monopolising resources and lets concurrent
+        * active work make progress; any remaining BOs are handled by a
+        * follow-up run scheduled below.
+        */
+       for (i = 0; i < XE_BO_DEFRAG_NUM_BO_LIMIT_PER_WORK; ++i) {
+               struct xe_bo *bo = NULL;
+               struct xe_bo *pos;
+               int ret;
+
+               /*
+                * Pick the first entry a reference can be taken on. An entry
+                * for which xe_bo_get_unless_zero() fails (presumably a BO
+                * that is being destroyed) must not be mistaken for an empty
+                * list: skip it, and if only such entries remain ask for a
+                * later run so BOs queued in the meantime are not stranded.
+                */
+               scoped_guard(spinlock, &xe->mem.defrag.lock) {
+                       list_for_each_entry(pos, &xe->mem.defrag.list,
+                                           defrag_link) {
+                               bo = xe_bo_get_unless_zero(pos);
+                               if (bo)
+                                       break;
+                       }
+
+                       if (!bo && !list_empty(&xe->mem.defrag.list))
+                               requeue = true;
+               }
+
+               if (!bo)
+                       break;
+
+               ret = xe_bo_defrag_one(xe, bo);
+               xe_bo_put(bo);
+
+               if (ret) {
+                       /*
+                        * Abort the pass and retry the whole list later,
+                        * backing off exponentially on every failure.
+                        */
+                       scoped_guard(spinlock, &xe->mem.defrag.lock)
+                               xe->mem.defrag.interval_ms =
+                                       min_t(unsigned int,
+                                             xe->mem.defrag.interval_ms * 2,
+                                             XE_BO_DEFRAG_INTERVAL_MAX_MS);
+                       requeue = true;
+                       break;
+               }
+       }
+
+       /*
+        * Decide whether to reschedule:
+        *
+        *  - The loop hit its per-run limit (i == LIMIT): we stopped early to
+        *    spread the work out, so requeue at the default interval (reset
+        *    here since this run made progress) whenever the list still has
+        *    entries.
+        *
+        *  - The loop aborted on a failed defrag, or found only entries it
+        *    could not take a reference on (requeue): retry later at the
+        *    current (possibly backed-off) interval.
+        *
+        *  - The loop drained the list (broke on the empty list): nothing
+        *    left to do, so stop scheduling. A fresh enqueue will kick the
+        *    worker again.
+        */
+       if (i == XE_BO_DEFRAG_NUM_BO_LIMIT_PER_WORK) {
+               bool more;
+
+               scoped_guard(spinlock, &xe->mem.defrag.lock) {
+                       more = !list_empty(&xe->mem.defrag.list);
+                       xe->mem.defrag.interval_ms = XE_BO_DEFRAG_INTERVAL_MS;
+               }
+
+               if (more)
+                       xe_bo_defrag_schedule(xe);
+       } else if (requeue) {
+               xe_bo_defrag_schedule(xe);
+       }
+}
+
 static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
                      struct ttm_operation_ctx *ctx,
                      struct ttm_resource *new_mem,
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index fe67b0ad938c..4bc2011d5c32 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -299,6 +299,17 @@ struct xe_device {
                         * @mem.defrag.list.
                         */
                        atomic_t count;
+                       /**
+                        * @mem.defrag.worker: Delayed worker that walks
+                        * @mem.defrag.list trying to reallocate BO backing
+                        * store at the device's beneficial order.
+                        */
+                       struct delayed_work worker;
+                       /**
+                        * @mem.defrag.interval_ms: Reschedule interval for
+                        * @mem.defrag.worker, in milliseconds.
+                        */
+                       unsigned int interval_ms;
                } defrag;
        } mem;
 
-- 
2.34.1

[RFC PATCH 09/12] drm/xe: Add a page defragmentation worker

Reply via email to