force_merge with decoupled clear tracker

Arunpravin Paneer Selvam Mon, 04 May 2026 04:11:57 -0700

The current buddy allocator maintains separate clear_tree[] and
dirty_tree[] rbtrees per order, preventing coalescing between cleared
and dirty buddies. Under mixed workloads, this creates a merge barrier:
adjacent buddies frequently end up split across trees, forcing reliance
on __force_merge() during allocation.


__force_merge() performs an O(N x max_order) scan under the VRAM manager
lock, leading to allocation stalls and failures for large contiguous
requests even when sufficient total free memory is available.

Solution

Replace the dual-tree design with:
- A single free_tree[order] rbtree for dirty and mixed free blocks
  (fully cleared free blocks float outside this tree)
- A lightweight out-of-band clear tracker (gpu_clear_tracker)

Fully cleared free blocks are tracked outside the buddy trees using an
augmented interval rbtree, enabling O(log E) lookup of the largest
cleared extents.

Buddy coalescing is now unconditional in __gpu_buddy_free(), regardless
of clear/dirty state. This removes the merge barrier and eliminates the
need for __force_merge().

Benefits

- Correct high-order allocations after mixed clear/dirty workloads
- Elimination of O(N x max_order) merge cost from the allocation path
- O(log E) cleared-extent lookup replacing O(N) scans
- Predictable allocation latency under fragmentation
- Reduced complexity with a single tree per order

Test:
dEQP-VK.memory.allocation.basic.size_8KiB.reverse.count_4000

Below data is from /sys/kernel/debug/dri/1/amdgpu_vram_mm:

Base (dual-tree), before VKCTS test:
  order- 6 free:   6 MiB,  blocks: 26
  order- 5 free:   1 MiB,  blocks: 15
  order- 4 free: 960 KiB,  blocks: 15
  order- 3 free:   5 MiB,  blocks: 171
  order- 2 free:   2 MiB,  blocks: 176
  order- 1 free:   1 MiB,  blocks: 165
  order- 0 free:  16 KiB,  blocks: 4

Base (dual-tree), after VKCTS test:
  order- 6 free: 768 KiB,  blocks: 3
  order- 5 free: 499 MiB,  blocks: 3999
  order- 4 free: 250 MiB,  blocks: 4001
  order- 3 free: 129 MiB,  blocks: 4157
  order- 2 free:  65 MiB,  blocks: 4161
  order- 1 free:  63 MiB,  blocks: 8138
  order- 0 free:  20 KiB,  blocks: 5

Clear tracker, before VKCTS test:
  order- 6 free:   4 MiB,  blocks: 19
  order- 5 free:   2 MiB,  blocks: 18
  order- 4 free: 704 KiB,  blocks: 11
  order- 3 free:   5 MiB,  blocks: 168
  order- 2 free:   2 MiB,  blocks: 174
  order- 1 free:   1 MiB,  blocks: 167
  order- 0 free:  32 KiB,  blocks: 8

Clear tracker, after VKCTS test:
  order- 6 free:   4 MiB,  blocks: 19
  order- 5 free:   2 MiB,  blocks: 18
  order- 4 free: 704 KiB,  blocks: 11
  order- 3 free:   5 MiB,  blocks: 168
  order- 2 free:   2 MiB,  blocks: 174
  order- 1 free:   1 MiB,  blocks: 167
  order- 0 free:  28 KiB,  blocks: 7

v2:
 - Code-style cleanup and minor refactoring
 - Renamed locals for clarity

Cc: Matthew Auld <[email protected]>
Cc: Christian König <[email protected]>
Signed-off-by: Arunpravin Paneer Selvam <[email protected]>
---
 drivers/gpu/buddy.c         | 997 ++++++++++++++++++++++--------------
 drivers/gpu/drm/drm_buddy.c |  44 +-
 include/linux/gpu_buddy.h   |  61 ++-
 3 files changed, 692 insertions(+), 410 deletions(-)

diff --git a/drivers/gpu/buddy.c b/drivers/gpu/buddy.c
index 52686672e99f..8c6bd8b95cdb 100644
--- a/drivers/gpu/buddy.c
+++ b/drivers/gpu/buddy.c
@@ -34,6 +34,334 @@
 #endif
 
 static struct kmem_cache *slab_blocks;
+static struct kmem_cache *slab_extents;
+
+/*
+ * Clear tracker
+ * -------------
+ *
+ * The clear tracker maintains an augmented interval rbtree of contiguous
+ * cleared (zeroed) address ranges, decoupled from the buddy free trees.
+ * Each node covers a maximal coalesced run; adjacent extents are merged
+ * on insertion so the tree always holds the smallest possible number of
+ * extents.  The augmentation field @subtree_max_size lets the allocator
+ * locate the largest cleared extent in O(log E).
+ */
+
+static u64 extent_size(struct gpu_clear_extent *clear_extent)
+{
+       return clear_extent->end - clear_extent->start;
+}
+
+RB_DECLARE_CALLBACKS_MAX(static, gpu_clear_augment_cb,
+                        struct gpu_clear_extent, rb,
+                        u64, subtree_max_size,
+                        extent_size)
+
+static struct gpu_clear_extent *extent_alloc(void)
+{
+       return kmem_cache_zalloc(slab_extents, GFP_KERNEL);
+}
+
+static void extent_free(struct gpu_clear_extent *clear_extent)
+{
+       kmem_cache_free(slab_extents, clear_extent);
+}
+
+/* Return the rightmost extent whose start is strictly below @offset. */
+static struct gpu_clear_extent *
+prev_extent(struct gpu_clear_tracker *clear_tracker, u64 offset)
+{
+       struct rb_node *rb = clear_tracker->root.rb_node;
+       struct gpu_clear_extent *clear_extent = NULL;
+
+       while (rb) {
+               struct gpu_clear_extent *tmp_extent =
+                       rb_entry(rb, struct gpu_clear_extent, rb);
+
+               if (tmp_extent->start < offset) {
+                       clear_extent = tmp_extent;
+                       rb = rb->rb_right;
+               } else {
+                       rb = rb->rb_left;
+               }
+       }
+
+       return clear_extent;
+}
+
+/* Return the leftmost extent whose start is at or above @offset. */
+static struct gpu_clear_extent *
+next_extent(struct gpu_clear_tracker *clear_tracker, u64 offset)
+{
+       struct rb_node *rb = clear_tracker->root.rb_node;
+       struct gpu_clear_extent *clear_extent = NULL;
+
+       while (rb) {
+               struct gpu_clear_extent *tmp_extent =
+                       rb_entry(rb, struct gpu_clear_extent, rb);
+
+               if (tmp_extent->start >= offset) {
+                       clear_extent = tmp_extent;
+                       rb = rb->rb_left;
+               } else {
+                       rb = rb->rb_right;
+               }
+       }
+
+       return clear_extent;
+}
+
+static void insert_extent(struct gpu_clear_tracker *clear_tracker,
+                         struct gpu_clear_extent *clear_extent)
+{
+       struct rb_node **link = &clear_tracker->root.rb_node;
+       struct rb_node *parent = NULL;
+
+       while (*link) {
+               struct gpu_clear_extent *tmp_extent;
+
+               parent = *link;
+               tmp_extent = rb_entry(parent, struct gpu_clear_extent, rb);
+
+               if (clear_extent->start < tmp_extent->start)
+                       link = &parent->rb_left;
+               else
+                       link = &parent->rb_right;
+       }
+
+       clear_extent->subtree_max_size = extent_size(clear_extent);
+       rb_link_node(&clear_extent->rb, parent, link);
+       rb_insert_augmented(&clear_extent->rb, &clear_tracker->root, 
&gpu_clear_augment_cb);
+}
+
+static void remove_extent(struct gpu_clear_tracker *clear_tracker,
+                         struct gpu_clear_extent *clear_extent)
+{
+       rb_erase_augmented(&clear_extent->rb, &clear_tracker->root, 
&gpu_clear_augment_cb);
+       RB_CLEAR_NODE(&clear_extent->rb);
+}
+
+static void gpu_clear_tracker_init(struct gpu_clear_tracker *clear_tracker)
+{
+       clear_tracker->root = RB_ROOT;
+       clear_tracker->total_clear = 0;
+}
+
+static void gpu_clear_tracker_fini(struct gpu_clear_tracker *clear_tracker)
+{
+       struct rb_node *rb;
+
+       while ((rb = rb_first(&clear_tracker->root))) {
+               struct gpu_clear_extent *clear_extent =
+                       rb_entry(rb, struct gpu_clear_extent, rb);
+
+               remove_extent(clear_tracker, clear_extent);
+               extent_free(clear_extent);
+       }
+
+       clear_tracker->total_clear = 0;
+}
+
+/*
+ * Mark the range [start, start + size] as cleared. Merge with the neighbour on
+ * each side if they are contiguous, so the tree never holds two adjacent 
ranges.
+ */
+static void gpu_clear_tracker_mark_clear(struct gpu_clear_tracker 
*clear_tracker,
+                                        u64 start, u64 size)
+{
+       struct gpu_clear_extent *left, *right, *clear_extent;
+       u64 end = start + size;
+
+       if (!size)
+               return;
+
+       /* Find contiguous neighbours, if any. */
+       left = prev_extent(clear_tracker, start);
+       if (left && left->end != start)
+               left = NULL;
+
+       right = next_extent(clear_tracker, end);
+       if (right && right->start != end)
+               right = NULL;
+
+       if (left && right) {
+               /* Merge left + new + right into a single extent. */
+               remove_extent(clear_tracker, left);
+               remove_extent(clear_tracker, right);
+               left->end = right->end;
+               extent_free(right);
+               insert_extent(clear_tracker, left);
+       } else if (left) {
+               /* Extend left neighbour rightwards. */
+               remove_extent(clear_tracker, left);
+               left->end = end;
+               insert_extent(clear_tracker, left);
+       } else if (right) {
+               /* Extend right neighbour leftwards. */
+               remove_extent(clear_tracker, right);
+               right->start = start;
+               insert_extent(clear_tracker, right);
+       } else {
+               /* Standalone extent. */
+               clear_extent = extent_alloc();
+               if (!clear_extent)
+                       return;
+
+               clear_extent->start = start;
+               clear_extent->end   = end;
+               insert_extent(clear_tracker, clear_extent);
+       }
+
+       clear_tracker->total_clear += size;
+}
+
+/*
+ * Mark the range [start, start + size] as dirty. Remove the range from every
+ * overlapping clear extent, splitting one extent in two if the dirty range
+ * falls strictly inside it.
+ */
+static void gpu_clear_tracker_mark_dirty(struct gpu_clear_tracker 
*clear_tracker,
+                                        u64 start, u64 size)
+{
+       struct gpu_clear_extent *clear_extent, *next;
+       u64 end = start + size;
+
+       if (!size)
+               return;
+
+       clear_extent = prev_extent(clear_tracker, start + 1);
+       if (!clear_extent)
+               clear_extent = next_extent(clear_tracker, start);
+
+       while (clear_extent && clear_extent->start < end) {
+               struct rb_node *next_node = rb_next(&clear_extent->rb);
+               u64 extent_start = clear_extent->start;
+               u64 extent_end = clear_extent->end;
+
+               if (next_node)
+                       next = rb_entry(next_node, struct gpu_clear_extent, rb);
+               else
+                       next = NULL;
+
+               /* Skip a non-overlapping neighbour returned by prev_extent(). 
*/
+               if (extent_end <= start) {
+                       clear_extent = next;
+                       continue;
+               }
+
+               if (extent_start < start && extent_end > end) {
+                       /* Dirty range falls strictly inside: split into left + 
right. */
+                       struct gpu_clear_extent *right = extent_alloc();
+                       if (!right)
+                               return;
+
+                       remove_extent(clear_tracker, clear_extent);
+
+                       clear_extent->end = start;
+                       right->start = end;
+                       right->end   = extent_end;
+
+                       insert_extent(clear_tracker, clear_extent);
+                       insert_extent(clear_tracker, right);
+
+                       clear_tracker->total_clear -= size;
+               } else if (extent_start >= start && extent_end <= end) {
+                       /* Extent fully covered: drop it. */
+                       remove_extent(clear_tracker, clear_extent);
+                       extent_free(clear_extent);
+
+                       clear_tracker->total_clear -= (extent_end - 
extent_start);
+               } else if (extent_start < start) {
+                       /* Extent overlaps from the left: trim its right end. */
+                       remove_extent(clear_tracker, clear_extent);
+                       clear_extent->end = start;
+                       insert_extent(clear_tracker, clear_extent);
+
+                       clear_tracker->total_clear -= (extent_end - start);
+               } else {
+                       /* Extent overlaps from the right: trim its left end. */
+                       remove_extent(clear_tracker, clear_extent);
+                       clear_extent->start = end;
+                       insert_extent(clear_tracker, clear_extent);
+
+                       clear_tracker->total_clear -= (end - extent_start);
+               }
+
+               clear_extent = next;
+       }
+}
+
+/*
+ * Returns true if the range [start, start + size] lies entirely within
+ * a single clear extent in the tracker, i.e. the whole range is known
+ * to be cleared.
+ */
+static bool gpu_clear_tracker_is_clear(struct gpu_clear_tracker *clear_tracker,
+                                      u64 start, u64 size)
+{
+       struct gpu_clear_extent *clear_extent;
+       u64 end = start + size;
+
+       clear_extent = prev_extent(clear_tracker, start + 1);
+       if (!clear_extent)
+               return false;
+
+       return clear_extent->start <= start && clear_extent->end >= end;
+}
+
+/*
+ * Search the tracker for a clear extent of at least @min_size bytes
+ * and return it. The walk prefers higher-address candidates: at each
+ * node it descends into the right subtree first, then checks the node
+ * itself, and only falls back to the left subtree if neither matches.
+ *
+ * The augmented @subtree_max_size lets us skip whole subtrees that
+ * cannot contain a large-enough extent, so the search runs in
+ * O(log E) where E is the number of clear extents.
+ *
+ * Returns NULL if no extent satisfies @min_size.
+ */
+static struct gpu_clear_extent *
+gpu_clear_tracker_find(struct gpu_clear_tracker *clear_tracker, u64 min_size)
+{
+       struct rb_node *rb = clear_tracker->root.rb_node;
+
+       while (rb) {
+               struct gpu_clear_extent *clear_extent =
+                       rb_entry(rb, struct gpu_clear_extent, rb);
+               struct rb_node *right = rb->rb_right;
+               struct rb_node *left  = rb->rb_left;
+
+               /* Prefer the right (higher-address) subtree. */
+               if (right) {
+                       struct gpu_clear_extent *r =
+                               rb_entry(right, struct gpu_clear_extent, rb);
+
+                       if (r->subtree_max_size >= min_size) {
+                               rb = right;
+                               continue;
+                       }
+               }
+
+               if (extent_size(clear_extent) >= min_size)
+                       return clear_extent;
+
+               if (left) {
+                       struct gpu_clear_extent *l =
+                               rb_entry(left, struct gpu_clear_extent, rb);
+
+                       if (l->subtree_max_size >= min_size) {
+                               rb = left;
+                               continue;
+                       }
+               }
+
+               break;
+       }
+
+       return NULL;
+}
 
 static unsigned int
 gpu_buddy_block_state(struct gpu_buddy_block *block)
@@ -101,13 +429,6 @@ static void gpu_block_free(struct gpu_buddy *mm,
        kmem_cache_free(slab_blocks, block);
 }
 
-static enum gpu_buddy_free_tree
-get_block_tree(struct gpu_buddy_block *block)
-{
-       return gpu_buddy_block_is_clear(block) ?
-              GPU_BUDDY_CLEAR_TREE : GPU_BUDDY_DIRTY_TREE;
-}
-
 static struct gpu_buddy_block *
 rbtree_get_free_block(const struct rb_node *node)
 {
@@ -120,14 +441,8 @@ rbtree_last_free_block(struct rb_root *root)
        return rbtree_get_free_block(rb_last(root));
 }
 
-static bool rbtree_is_empty(struct rb_root *root)
-{
-       return RB_EMPTY_ROOT(root);
-}
-
 static void rbtree_insert(struct gpu_buddy *mm,
-                         struct gpu_buddy_block *block,
-                         enum gpu_buddy_free_tree tree)
+                         struct gpu_buddy_block *block)
 {
        struct rb_node **link, *parent = NULL;
        unsigned int block_alignment, order;
@@ -137,7 +452,7 @@ static void rbtree_insert(struct gpu_buddy *mm,
        order = gpu_buddy_block_order(block);
        block_alignment = gpu_buddy_block_offset_alignment(block);
 
-       root = &mm->free_trees[tree][order];
+       root = &mm->free_tree[order];
        link = &root->rb_node;
 
        while (*link) {
@@ -167,26 +482,14 @@ static void rbtree_remove(struct gpu_buddy *mm,
                          struct gpu_buddy_block *block)
 {
        unsigned int order = gpu_buddy_block_order(block);
-       enum gpu_buddy_free_tree tree;
-       struct rb_root *root;
 
-       tree = get_block_tree(block);
-       root = &mm->free_trees[tree][order];
+       if (gpu_buddy_block_is_clear(block))
+               return;
 
-       rb_erase_augmented(&block->rb, root, &gpu_buddy_augment_cb);
+       rb_erase_augmented(&block->rb, &mm->free_tree[order], 
&gpu_buddy_augment_cb);
        RB_CLEAR_NODE(&block->rb);
 }
 
-static void clear_reset(struct gpu_buddy_block *block)
-{
-       block->header &= ~GPU_BUDDY_HEADER_CLEAR;
-}
-
-static void mark_cleared(struct gpu_buddy_block *block)
-{
-       block->header |= GPU_BUDDY_HEADER_CLEAR;
-}
-
 static void mark_allocated(struct gpu_buddy *mm,
                           struct gpu_buddy_block *block)
 {
@@ -199,13 +502,18 @@ static void mark_allocated(struct gpu_buddy *mm,
 static void mark_free(struct gpu_buddy *mm,
                      struct gpu_buddy_block *block)
 {
-       enum gpu_buddy_free_tree tree;
-
        block->header &= ~GPU_BUDDY_HEADER_STATE;
        block->header |= GPU_BUDDY_FREE;
 
-       tree = get_block_tree(block);
-       rbtree_insert(mm, block, tree);
+       if (gpu_clear_tracker_is_clear(&mm->clear,
+                                      gpu_buddy_block_offset(block),
+                                      gpu_buddy_block_size(mm, block))) {
+               block->header |= GPU_BUDDY_HEADER_CLEAR;
+               RB_CLEAR_NODE(&block->rb);
+       } else {
+               block->header &= ~GPU_BUDDY_HEADER_CLEAR;
+               rbtree_insert(mm, block);
+       }
 }
 
 static void mark_split(struct gpu_buddy *mm,
@@ -243,36 +551,18 @@ __get_buddy(struct gpu_buddy_block *block)
 }
 
 static unsigned int __gpu_buddy_free(struct gpu_buddy *mm,
-                                    struct gpu_buddy_block *block,
-                                    bool force_merge)
+                                    struct gpu_buddy_block *block)
 {
        struct gpu_buddy_block *parent;
        unsigned int order;
 
        while ((parent = block->parent)) {
-               struct gpu_buddy_block *buddy;
-
-               buddy = __get_buddy(block);
+               struct gpu_buddy_block *buddy = __get_buddy(block);
 
                if (!gpu_buddy_block_is_free(buddy))
                        break;
 
-               if (!force_merge) {
-                       /*
-                        * Check the block and its buddy clear state and exit
-                        * the loop if they both have the dissimilar state.
-                        */
-                       if (gpu_buddy_block_is_clear(block) !=
-                           gpu_buddy_block_is_clear(buddy))
-                               break;
-
-                       if (gpu_buddy_block_is_clear(block))
-                               mark_cleared(parent);
-               }
-
                rbtree_remove(mm, buddy);
-               if (force_merge && gpu_buddy_block_is_clear(buddy))
-                       mm->clear_avail -= gpu_buddy_block_size(mm, buddy);
 
                gpu_block_free(mm, block);
                gpu_block_free(mm, buddy);
@@ -286,66 +576,15 @@ static unsigned int __gpu_buddy_free(struct gpu_buddy *mm,
        return order;
 }
 
-static int __force_merge(struct gpu_buddy *mm,
-                        u64 start,
-                        u64 end,
-                        unsigned int min_order)
+static void undo_partial_split(struct gpu_buddy *mm,
+                              struct gpu_buddy_block *block)
 {
-       unsigned int tree, order;
-       int i;
-
-       if (!min_order)
-               return -ENOMEM;
-
-       if (min_order > mm->max_order)
-               return -EINVAL;
-
-       for_each_free_tree(tree) {
-               for (i = min_order - 1; i >= 0; i--) {
-                       struct rb_node *iter = 
rb_last(&mm->free_trees[tree][i]);
-
-                       while (iter) {
-                               struct gpu_buddy_block *block, *buddy;
-                               u64 block_start, block_end;
-
-                               block = rbtree_get_free_block(iter);
-                               iter = rb_prev(iter);
-
-                               if (!block || !block->parent)
-                                       continue;
-
-                               block_start = gpu_buddy_block_offset(block);
-                               block_end = block_start + 
gpu_buddy_block_size(mm, block) - 1;
-
-                               if (!contains(start, end, block_start, 
block_end))
-                                       continue;
+       struct gpu_buddy_block *buddy = __get_buddy(block);
 
-                               buddy = __get_buddy(block);
-                               if (!gpu_buddy_block_is_free(buddy))
-                                       continue;
-
-                               
gpu_buddy_assert(gpu_buddy_block_is_clear(block) !=
-                                                
gpu_buddy_block_is_clear(buddy));
-
-                               /*
-                                * Advance to the next node when the current 
node is the buddy,
-                                * as freeing the block will also remove its 
buddy from the tree.
-                                */
-                               if (iter == &buddy->rb)
-                                       iter = rb_prev(iter);
-
-                               rbtree_remove(mm, block);
-                               if (gpu_buddy_block_is_clear(block))
-                                       mm->clear_avail -= 
gpu_buddy_block_size(mm, block);
-
-                               order = __gpu_buddy_free(mm, block, true);
-                               if (order >= min_order)
-                                       return 0;
-                       }
-               }
-       }
-
-       return -ENOMEM;
+       if (buddy &&
+           gpu_buddy_block_is_free(block) &&
+           gpu_buddy_block_is_free(buddy))
+               __gpu_buddy_free(mm, block);
 }
 
 /**
@@ -362,7 +601,7 @@ static int __force_merge(struct gpu_buddy *mm,
  */
 int gpu_buddy_init(struct gpu_buddy *mm, u64 size, u64 chunk_size)
 {
-       unsigned int i, j, root_count = 0;
+       unsigned int root_count = 0;
        u64 offset = 0;
 
        if (size < chunk_size)
@@ -384,22 +623,13 @@ int gpu_buddy_init(struct gpu_buddy *mm, u64 size, u64 
chunk_size)
 
        BUG_ON(mm->max_order > GPU_BUDDY_MAX_ORDER);
 
-       mm->free_trees = kmalloc_array(GPU_BUDDY_MAX_FREE_TREES,
-                                      sizeof(*mm->free_trees),
-                                      GFP_KERNEL);
-       if (!mm->free_trees)
+       mm->free_tree = kcalloc(mm->max_order + 1,
+                                sizeof(struct rb_root),
+                                GFP_KERNEL);
+       if (!mm->free_tree)
                return -ENOMEM;
 
-       for_each_free_tree(i) {
-               mm->free_trees[i] = kmalloc_array(mm->max_order + 1,
-                                                 sizeof(struct rb_root),
-                                                 GFP_KERNEL);
-               if (!mm->free_trees[i])
-                       goto out_free_tree;
-
-               for (j = 0; j <= mm->max_order; ++j)
-                       mm->free_trees[i][j] = RB_ROOT;
-       }
+       gpu_clear_tracker_init(&mm->clear);
 
        mm->n_roots = hweight64(size);
 
@@ -444,9 +674,8 @@ int gpu_buddy_init(struct gpu_buddy *mm, u64 size, u64 
chunk_size)
                gpu_block_free(mm, mm->roots[root_count]);
        kfree(mm->roots);
 out_free_tree:
-       while (i--)
-               kfree(mm->free_trees[i]);
-       kfree(mm->free_trees);
+       gpu_clear_tracker_fini(&mm->clear);
+       kfree(mm->free_tree);
        return -ENOMEM;
 }
 EXPORT_SYMBOL(gpu_buddy_init);
@@ -460,7 +689,7 @@ EXPORT_SYMBOL(gpu_buddy_init);
  */
 void gpu_buddy_fini(struct gpu_buddy *mm)
 {
-       u64 root_size, size, start;
+       u64 root_size, size;
        unsigned int order;
        int i;
 
@@ -468,22 +697,17 @@ void gpu_buddy_fini(struct gpu_buddy *mm)
 
        for (i = 0; i < mm->n_roots; ++i) {
                order = ilog2(size) - ilog2(mm->chunk_size);
-               start = gpu_buddy_block_offset(mm->roots[i]);
-               __force_merge(mm, start, start + size, order);
+               root_size = mm->chunk_size << order;
 
                gpu_buddy_assert(gpu_buddy_block_is_free(mm->roots[i]));
-
                gpu_block_free(mm, mm->roots[i]);
-
-               root_size = mm->chunk_size << order;
                size -= root_size;
        }
 
        gpu_buddy_assert(mm->avail == mm->size);
 
-       for_each_free_tree(i)
-               kfree(mm->free_trees[i]);
-       kfree(mm->free_trees);
+       gpu_clear_tracker_fini(&mm->clear);
+       kfree(mm->free_tree);
        kfree(mm->roots);
 }
 EXPORT_SYMBOL(gpu_buddy_fini);
@@ -509,13 +733,6 @@ static int split_block(struct gpu_buddy *mm,
        }
 
        mark_split(mm, block);
-
-       if (gpu_buddy_block_is_clear(block)) {
-               mark_cleared(block->left);
-               mark_cleared(block->right);
-               clear_reset(block);
-       }
-
        mark_free(mm, block->left);
        mark_free(mm, block->right);
 
@@ -533,41 +750,55 @@ static int split_block(struct gpu_buddy *mm,
  */
 void gpu_buddy_reset_clear(struct gpu_buddy *mm, bool is_clear)
 {
-       enum gpu_buddy_free_tree src_tree, dst_tree;
-       u64 root_size, size, start;
-       unsigned int order;
-       int i;
+       unsigned int i;
 
-       size = mm->size;
-       for (i = 0; i < mm->n_roots; ++i) {
-               order = ilog2(size) - ilog2(mm->chunk_size);
-               start = gpu_buddy_block_offset(mm->roots[i]);
-               __force_merge(mm, start, start + size, order);
+       gpu_clear_tracker_fini(&mm->clear);
+       gpu_clear_tracker_init(&mm->clear);
+
+       if (is_clear) {
+               for (i = 0; i <= mm->max_order; ++i) {
+                       struct rb_node *node;
+
+                       node = rb_first(&mm->free_tree[i]);
+                       while (node) {
+                               struct gpu_buddy_block *block =
+                                       rb_entry(node, struct gpu_buddy_block, 
rb);
+
+                               node = rb_next(node);
+                               rb_erase_augmented(&block->rb, 
&mm->free_tree[i], &gpu_buddy_augment_cb);
+                               RB_CLEAR_NODE(&block->rb);
+                               block->header |= GPU_BUDDY_HEADER_CLEAR;
+                               gpu_clear_tracker_mark_clear(&mm->clear,
+                                                            
gpu_buddy_block_offset(block),
+                                                            
gpu_buddy_block_size(mm, block));
+                       }
+               }
+       } else {
+               LIST_HEAD(dfs);
 
-               root_size = mm->chunk_size << order;
-               size -= root_size;
-       }
+               for (i = 0; i < mm->n_roots; ++i)
+                       list_add(&mm->roots[i]->tmp_link, &dfs);
 
-       src_tree = is_clear ? GPU_BUDDY_DIRTY_TREE : GPU_BUDDY_CLEAR_TREE;
-       dst_tree = is_clear ? GPU_BUDDY_CLEAR_TREE : GPU_BUDDY_DIRTY_TREE;
+               while (!list_empty(&dfs)) {
+                       struct gpu_buddy_block *block =
+                               list_first_entry(&dfs, struct gpu_buddy_block, 
tmp_link);
 
-       for (i = 0; i <= mm->max_order; ++i) {
-               struct rb_root *root = &mm->free_trees[src_tree][i];
-               struct gpu_buddy_block *block, *tmp;
+                       list_del(&block->tmp_link);
 
-               rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) {
-                       rbtree_remove(mm, block);
-                       if (is_clear) {
-                               mark_cleared(block);
-                               mm->clear_avail += gpu_buddy_block_size(mm, 
block);
-                       } else {
-                               clear_reset(block);
-                               mm->clear_avail -= gpu_buddy_block_size(mm, 
block);
+                       if (gpu_buddy_block_is_split(block)) {
+                               list_add(&block->right->tmp_link, &dfs);
+                               list_add(&block->left->tmp_link, &dfs);
+                               continue;
                        }
 
-                       rbtree_insert(mm, block, dst_tree);
+                       if (gpu_buddy_block_is_free(block) && 
gpu_buddy_block_is_clear(block)) {
+                               block->header &= ~GPU_BUDDY_HEADER_CLEAR;
+                               rbtree_insert(mm, block);
+                       }
                }
        }
+
+       mm->clear_avail = mm->clear.total_clear;
 }
 EXPORT_SYMBOL(gpu_buddy_reset_clear);
 
@@ -580,12 +811,21 @@ EXPORT_SYMBOL(gpu_buddy_reset_clear);
 void gpu_buddy_free_block(struct gpu_buddy *mm,
                          struct gpu_buddy_block *block)
 {
+       bool was_clear = gpu_buddy_block_is_clear(block);
+       u64 size   = gpu_buddy_block_size(mm, block);
+       u64 offset = gpu_buddy_block_offset(block);
+
        BUG_ON(!gpu_buddy_block_is_allocated(block));
-       mm->avail += gpu_buddy_block_size(mm, block);
-       if (gpu_buddy_block_is_clear(block))
-               mm->clear_avail += gpu_buddy_block_size(mm, block);
 
-       __gpu_buddy_free(mm, block, false);
+       block->header &= ~GPU_BUDDY_HEADER_CLEAR;
+       mm->avail += size;
+
+       if (was_clear) {
+               gpu_clear_tracker_mark_clear(&mm->clear, offset, size);
+               mm->clear_avail = mm->clear.total_clear;
+       }
+
+       __gpu_buddy_free(mm, block);
 }
 EXPORT_SYMBOL(gpu_buddy_free_block);
 
@@ -599,10 +839,15 @@ static void __gpu_buddy_free_list(struct gpu_buddy *mm,
        gpu_buddy_assert(!(mark_dirty && mark_clear));
 
        list_for_each_entry_safe(block, on, objects, link) {
+               /*
+                * Propagate the caller's clear/dirty intent onto the block 
header
+                * before handing it to gpu_buddy_free_block(), which will then
+                * update the clear tracker accordingly.
+                */
                if (mark_clear)
-                       mark_cleared(block);
+                       block->header |= GPU_BUDDY_HEADER_CLEAR;
                else if (mark_dirty)
-                       clear_reset(block);
+                       block->header &= ~GPU_BUDDY_HEADER_CLEAR;
                gpu_buddy_free_block(mm, block);
                cond_resched();
        }
@@ -637,23 +882,14 @@ void gpu_buddy_free_list(struct gpu_buddy *mm,
 }
 EXPORT_SYMBOL(gpu_buddy_free_list);
 
-static bool block_incompatible(struct gpu_buddy_block *block, unsigned int 
flags)
-{
-       bool needs_clear = flags & GPU_BUDDY_CLEAR_ALLOCATION;
-
-       return needs_clear != gpu_buddy_block_is_clear(block);
-}
-
 static struct gpu_buddy_block *
 __alloc_range_bias(struct gpu_buddy *mm,
                   u64 start, u64 end,
                   unsigned int order,
-                  unsigned long flags,
-                  bool fallback)
+                  unsigned long flags)
 {
        u64 req_size = mm->chunk_size << order;
        struct gpu_buddy_block *block;
-       struct gpu_buddy_block *buddy;
        LIST_HEAD(dfs);
        int err;
        int i;
@@ -696,9 +932,6 @@ __alloc_range_bias(struct gpu_buddy *mm,
                                continue;
                }
 
-               if (!fallback && block_incompatible(block, flags))
-                       continue;
-
                if (contains(start, end, block_start, block_end) &&
                    order == gpu_buddy_block_order(block)) {
                        /*
@@ -716,68 +949,32 @@ __alloc_range_bias(struct gpu_buddy *mm,
                                goto err_undo;
                }
 
-               list_add(&block->right->tmp_link, &dfs);
                list_add(&block->left->tmp_link, &dfs);
+               list_add(&block->right->tmp_link, &dfs);
        } while (1);
 
        return ERR_PTR(-ENOSPC);
 
 err_undo:
-       /*
-        * We really don't want to leave around a bunch of split blocks, since
-        * bigger is better, so make sure we merge everything back before we
-        * free the allocated blocks.
-        */
-       buddy = __get_buddy(block);
-       if (buddy &&
-           (gpu_buddy_block_is_free(block) &&
-            gpu_buddy_block_is_free(buddy)))
-               __gpu_buddy_free(mm, block, false);
+       undo_partial_split(mm, block);
        return ERR_PTR(err);
 }
 
-static struct gpu_buddy_block *
-__gpu_buddy_alloc_range_bias(struct gpu_buddy *mm,
-                            u64 start, u64 end,
-                            unsigned int order,
-                            unsigned long flags)
-{
-       struct gpu_buddy_block *block;
-       bool fallback = false;
-
-       block = __alloc_range_bias(mm, start, end, order,
-                                  flags, fallback);
-       if (IS_ERR(block))
-               return __alloc_range_bias(mm, start, end, order,
-                                         flags, !fallback);
-
-       return block;
-}
-
 static struct gpu_buddy_block *
 get_maxblock(struct gpu_buddy *mm,
             unsigned int order,
-            enum gpu_buddy_free_tree tree)
+            unsigned long flags)
 {
-       struct gpu_buddy_block *max_block = NULL, *block = NULL;
-       struct rb_root *root;
+       struct gpu_buddy_block *max_block = NULL, *block;
        unsigned int i;
 
        for (i = order; i <= mm->max_order; ++i) {
-               root = &mm->free_trees[tree][i];
-               block = rbtree_last_free_block(root);
+               block = rbtree_last_free_block(&mm->free_tree[i]);
                if (!block)
                        continue;
-
-               if (!max_block) {
+               if (!max_block ||
+                   gpu_buddy_block_offset(block) > 
gpu_buddy_block_offset(max_block))
                        max_block = block;
-                       continue;
-               }
-
-               if (gpu_buddy_block_offset(block) >
-                   gpu_buddy_block_offset(max_block)) {
-                       max_block = block;
-               }
        }
 
        return max_block;
@@ -789,44 +986,23 @@ alloc_from_freetree(struct gpu_buddy *mm,
                    unsigned long flags)
 {
        struct gpu_buddy_block *block = NULL;
-       struct rb_root *root;
-       enum gpu_buddy_free_tree tree;
        unsigned int tmp;
        int err;
 
-       tree = (flags & GPU_BUDDY_CLEAR_ALLOCATION) ?
-               GPU_BUDDY_CLEAR_TREE : GPU_BUDDY_DIRTY_TREE;
-
        if (flags & GPU_BUDDY_TOPDOWN_ALLOCATION) {
-               block = get_maxblock(mm, order, tree);
+               block = get_maxblock(mm, order, flags);
                if (block)
-                       /* Store the obtained block order */
                        tmp = gpu_buddy_block_order(block);
        } else {
                for (tmp = order; tmp <= mm->max_order; ++tmp) {
-                       /* Get RB tree root for this order and tree */
-                       root = &mm->free_trees[tree][tmp];
-                       block = rbtree_last_free_block(root);
+                       block = rbtree_last_free_block(&mm->free_tree[tmp]);
                        if (block)
                                break;
                }
        }
 
-       if (!block) {
-               /* Try allocating from the other tree */
-               tree = (tree == GPU_BUDDY_CLEAR_TREE) ?
-                       GPU_BUDDY_DIRTY_TREE : GPU_BUDDY_CLEAR_TREE;
-
-               for (tmp = order; tmp <= mm->max_order; ++tmp) {
-                       root = &mm->free_trees[tree][tmp];
-                       block = rbtree_last_free_block(root);
-                       if (block)
-                               break;
-               }
-
-               if (!block)
-                       return ERR_PTR(-ENOSPC);
-       }
+       if (!block)
+               return ERR_PTR(-ENOSPC);
 
        BUG_ON(!gpu_buddy_block_is_free(block));
 
@@ -835,14 +1011,18 @@ alloc_from_freetree(struct gpu_buddy *mm,
                if (unlikely(err))
                        goto err_undo;
 
-               block = block->right;
+               if (!(flags & GPU_BUDDY_CLEAR_ALLOCATION) &&
+                   gpu_buddy_block_is_clear(block->right))
+                       block = block->left;
+               else
+                       block = block->right;
                tmp--;
        }
        return block;
 
 err_undo:
        if (tmp != order)
-               __gpu_buddy_free(mm, block, false);
+               __gpu_buddy_free(mm, block);
        return ERR_PTR(err);
 }
 
@@ -863,12 +1043,11 @@ static bool gpu_buddy_subtree_can_satisfy(struct rb_node 
*node,
 
 static struct gpu_buddy_block *
 gpu_buddy_find_block_aligned(struct gpu_buddy *mm,
-                            enum gpu_buddy_free_tree tree,
                             unsigned int order,
                             unsigned int alignment,
                             unsigned long flags)
 {
-       struct rb_root *root = &mm->free_trees[tree][order];
+       struct rb_root *root = &mm->free_tree[order];
        struct rb_node *rb = root->rb_node;
 
        while (rb) {
@@ -906,8 +1085,6 @@ gpu_buddy_offset_aligned_allocation(struct gpu_buddy *mm,
 {
        struct gpu_buddy_block *block = NULL;
        unsigned int order, tmp, alignment;
-       struct gpu_buddy_block *buddy;
-       enum gpu_buddy_free_tree tree;
        unsigned long pages;
        int err;
 
@@ -915,19 +1092,8 @@ gpu_buddy_offset_aligned_allocation(struct gpu_buddy *mm,
        pages = size >> ilog2(mm->chunk_size);
        order = fls(pages) - 1;
 
-       tree = (flags & GPU_BUDDY_CLEAR_ALLOCATION) ?
-               GPU_BUDDY_CLEAR_TREE : GPU_BUDDY_DIRTY_TREE;
-
        for (tmp = order; tmp <= mm->max_order; ++tmp) {
-               block = gpu_buddy_find_block_aligned(mm, tree, tmp,
-                                                    alignment, flags);
-               if (!block) {
-                       tree = (tree == GPU_BUDDY_CLEAR_TREE) ?
-                               GPU_BUDDY_DIRTY_TREE : GPU_BUDDY_CLEAR_TREE;
-                       block = gpu_buddy_find_block_aligned(mm, tree, tmp,
-                                                            alignment, flags);
-               }
-
+               block = gpu_buddy_find_block_aligned(mm, tmp, alignment, flags);
                if (block)
                        break;
        }
@@ -954,27 +1120,18 @@ gpu_buddy_offset_aligned_allocation(struct gpu_buddy *mm,
        return block;
 
 err_undo:
-       /*
-        * We really don't want to leave around a bunch of split blocks, since
-        * bigger is better, so make sure we merge everything back before we
-        * free the allocated blocks.
-        */
-       buddy = __get_buddy(block);
-       if (buddy &&
-           (gpu_buddy_block_is_free(block) &&
-            gpu_buddy_block_is_free(buddy)))
-               __gpu_buddy_free(mm, block, false);
+       undo_partial_split(mm, block);
        return ERR_PTR(err);
 }
 
 static int __alloc_range(struct gpu_buddy *mm,
                         struct list_head *dfs,
                         u64 start, u64 size,
+                        unsigned long flags,
                         struct list_head *blocks,
                         u64 *total_allocated_on_err)
 {
        struct gpu_buddy_block *block;
-       struct gpu_buddy_block *buddy;
        u64 total_allocated = 0;
        LIST_HEAD(allocated);
        u64 end;
@@ -1007,16 +1164,24 @@ static int __alloc_range(struct gpu_buddy *mm,
 
                if (contains(start, end, block_start, block_end)) {
                        if (gpu_buddy_block_is_free(block)) {
+                               u64 bsize = gpu_buddy_block_size(mm, block);
+                               u64 boff  = gpu_buddy_block_offset(block);
+
                                mark_allocated(mm, block);
-                               total_allocated += gpu_buddy_block_size(mm, 
block);
-                               mm->avail -= gpu_buddy_block_size(mm, block);
-                               if (gpu_buddy_block_is_clear(block))
-                                       mm->clear_avail -= 
gpu_buddy_block_size(mm, block);
+                               total_allocated += bsize;
+                               mm->avail -= bsize;
+
+                               if (gpu_clear_tracker_is_clear(&mm->clear,
+                                                              boff, bsize)) {
+                                       if (flags & GPU_BUDDY_CLEAR_ALLOCATION)
+                                               block->header |= 
GPU_BUDDY_HEADER_CLEAR;
+                               }
+                               gpu_clear_tracker_mark_dirty(&mm->clear,
+                                                            boff, bsize);
+                               mm->clear_avail = mm->clear.total_clear;
+
                                list_add_tail(&block->link, &allocated);
                                continue;
-                       } else if (!mm->clear_avail) {
-                               err = -ENOSPC;
-                               goto err_free;
                        }
                }
 
@@ -1040,16 +1205,7 @@ static int __alloc_range(struct gpu_buddy *mm,
        return 0;
 
 err_undo:
-       /*
-        * We really don't want to leave around a bunch of split blocks, since
-        * bigger is better, so make sure we merge everything back before we
-        * free the allocated blocks.
-        */
-       buddy = __get_buddy(block);
-       if (buddy &&
-           (gpu_buddy_block_is_free(block) &&
-            gpu_buddy_block_is_free(buddy)))
-               __gpu_buddy_free(mm, block, false);
+       undo_partial_split(mm, block);
 
 err_free:
        if (err == -ENOSPC && total_allocated_on_err) {
@@ -1065,6 +1221,7 @@ static int __alloc_range(struct gpu_buddy *mm,
 static int __gpu_buddy_alloc_range(struct gpu_buddy *mm,
                                   u64 start,
                                   u64 size,
+                                  unsigned long flags,
                                   u64 *total_allocated_on_err,
                                   struct list_head *blocks)
 {
@@ -1074,20 +1231,23 @@ static int __gpu_buddy_alloc_range(struct gpu_buddy *mm,
        for (i = 0; i < mm->n_roots; ++i)
                list_add_tail(&mm->roots[i]->tmp_link, &dfs);
 
-       return __alloc_range(mm, &dfs, start, size,
+       return __alloc_range(mm, &dfs, start, size, flags,
                             blocks, total_allocated_on_err);
 }
 
 static int __alloc_contig_try_harder(struct gpu_buddy *mm,
                                     u64 size,
                                     u64 min_block_size,
+                                    unsigned long flags,
                                     struct list_head *blocks)
 {
        u64 rhs_offset, lhs_offset, lhs_size, filled;
        struct gpu_buddy_block *block;
-       unsigned int tree, order;
        LIST_HEAD(blocks_lhs);
+       struct rb_root *root;
+       struct rb_node *iter;
        unsigned long pages;
+       unsigned int order;
        u64 modify_size;
        int err;
 
@@ -1097,45 +1257,40 @@ static int __alloc_contig_try_harder(struct gpu_buddy 
*mm,
        if (order == 0)
                return -ENOSPC;
 
-       for_each_free_tree(tree) {
-               struct rb_root *root;
-               struct rb_node *iter;
-
-               root = &mm->free_trees[tree][order];
-               if (rbtree_is_empty(root))
-                       continue;
+       root = &mm->free_tree[order];
+       if (RB_EMPTY_ROOT(root))
+               return -ENOSPC;
 
-               iter = rb_last(root);
-               while (iter) {
-                       block = rbtree_get_free_block(iter);
-
-                       /* Allocate blocks traversing RHS */
-                       rhs_offset = gpu_buddy_block_offset(block);
-                       err =  __gpu_buddy_alloc_range(mm, rhs_offset, size,
-                                                      &filled, blocks);
-                       if (!err || err != -ENOSPC)
-                               return err;
-
-                       lhs_size = max((size - filled), min_block_size);
-                       if (!IS_ALIGNED(lhs_size, min_block_size))
-                               lhs_size = round_up(lhs_size, min_block_size);
-
-                       /* Allocate blocks traversing LHS */
-                       lhs_offset = gpu_buddy_block_offset(block) - lhs_size;
-                       err =  __gpu_buddy_alloc_range(mm, lhs_offset, lhs_size,
-                                                      NULL, &blocks_lhs);
-                       if (!err) {
-                               list_splice(&blocks_lhs, blocks);
-                               return 0;
-                       } else if (err != -ENOSPC) {
-                               gpu_buddy_free_list_internal(mm, blocks);
-                               return err;
-                       }
-                       /* Free blocks for the next iteration */
+       iter = rb_last(root);
+       while (iter) {
+               block = rbtree_get_free_block(iter);
+
+               /* Allocate blocks traversing RHS */
+               rhs_offset = gpu_buddy_block_offset(block);
+               err =  __gpu_buddy_alloc_range(mm, rhs_offset, size,
+                                              flags, &filled, blocks);
+               if (!err || err != -ENOSPC)
+                       return err;
+
+               lhs_size = max((size - filled), min_block_size);
+               if (!IS_ALIGNED(lhs_size, min_block_size))
+                       lhs_size = round_up(lhs_size, min_block_size);
+
+               /* Allocate blocks traversing LHS */
+               lhs_offset = gpu_buddy_block_offset(block) - lhs_size;
+               err =  __gpu_buddy_alloc_range(mm, lhs_offset, lhs_size,
+                                              flags, NULL, &blocks_lhs);
+               if (!err) {
+                       list_splice(&blocks_lhs, blocks);
+                       return 0;
+               } else if (err != -ENOSPC) {
                        gpu_buddy_free_list_internal(mm, blocks);
-
-                       iter = rb_prev(iter);
+                       return err;
                }
+               /* Free blocks for the next iteration */
+               gpu_buddy_free_list_internal(mm, blocks);
+
+               iter = rb_prev(iter);
        }
 
        return -ENOSPC;
@@ -1169,6 +1324,7 @@ int gpu_buddy_block_trim(struct gpu_buddy *mm,
        struct gpu_buddy_block *block;
        u64 block_start, block_end;
        LIST_HEAD(dfs);
+       bool was_clear;
        u64 new_start;
        int err;
 
@@ -1209,22 +1365,38 @@ int gpu_buddy_block_trim(struct gpu_buddy *mm,
        }
 
        list_del(&block->link);
+
+       was_clear = gpu_buddy_block_is_clear(block);
+       block->header &= ~GPU_BUDDY_HEADER_CLEAR;
+
+       if (was_clear) {
+               gpu_clear_tracker_mark_clear(&mm->clear,
+                                            gpu_buddy_block_offset(block),
+                                            gpu_buddy_block_size(mm, block));
+               mm->clear_avail = mm->clear.total_clear;
+       }
+
        mark_free(mm, block);
        mm->avail += gpu_buddy_block_size(mm, block);
-       if (gpu_buddy_block_is_clear(block))
-               mm->clear_avail += gpu_buddy_block_size(mm, block);
 
        /* Prevent recursively freeing this node */
        parent = block->parent;
        block->parent = NULL;
 
        list_add(&block->tmp_link, &dfs);
-       err =  __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);
+       err =  __alloc_range(mm, &dfs, new_start, new_size,
+                            was_clear ? GPU_BUDDY_CLEAR_ALLOCATION : 0,
+                            blocks, NULL);
        if (err) {
                mark_allocated(mm, block);
                mm->avail -= gpu_buddy_block_size(mm, block);
-               if (gpu_buddy_block_is_clear(block))
-                       mm->clear_avail -= gpu_buddy_block_size(mm, block);
+               if (was_clear) {
+                       gpu_clear_tracker_mark_dirty(&mm->clear,
+                                                    
gpu_buddy_block_offset(block),
+                                                    gpu_buddy_block_size(mm, 
block));
+                       mm->clear_avail = mm->clear.total_clear;
+                       block->header |= GPU_BUDDY_HEADER_CLEAR;
+               }
                list_add(&block->link, blocks);
        }
 
@@ -1233,6 +1405,21 @@ int gpu_buddy_block_trim(struct gpu_buddy *mm,
 }
 EXPORT_SYMBOL(gpu_buddy_block_trim);
 
+static bool clear_steer_window(struct gpu_buddy *mm, u64 min_sz,
+                              u64 *start, u64 *end, unsigned long *flags)
+{
+       struct gpu_clear_extent *ext =
+               gpu_clear_tracker_find(&mm->clear, min_sz);
+
+       if (!ext)
+               return false;
+
+       *start  = ext->start;
+       *end    = ext->end;
+       *flags |= GPU_BUDDY_RANGE_ALLOCATION;
+       return true;
+}
+
 static struct gpu_buddy_block *
 __gpu_buddy_alloc_blocks(struct gpu_buddy *mm,
                         u64 start, u64 end,
@@ -1240,18 +1427,24 @@ __gpu_buddy_alloc_blocks(struct gpu_buddy *mm,
                         unsigned int order,
                         unsigned long flags)
 {
+       /* Steer cleared allocations to a cleared extent that fits the order */
+       if (!(flags & GPU_BUDDY_RANGE_ALLOCATION) &&
+           (flags & GPU_BUDDY_CLEAR_ALLOCATION) && mm->clear_avail)
+               clear_steer_window(mm, mm->chunk_size << order,
+                                  &start, &end, &flags);
+
        if (flags & GPU_BUDDY_RANGE_ALLOCATION)
                /* Allocate traversing within the range */
-               return  __gpu_buddy_alloc_range_bias(mm, start, end,
-                                                    order, flags);
-       else if (size < min_block_size)
+               return __alloc_range_bias(mm, start, end, order, flags);
+
+       if (size < min_block_size)
                /* Allocate from an offset-aligned region without size rounding 
*/
                return gpu_buddy_offset_aligned_allocation(mm, size,
                                                           min_block_size,
                                                           flags);
-       else
-               /* Allocate from freetree */
-               return alloc_from_freetree(mm, order, flags);
+
+       /* Allocate from freetree */
+       return alloc_from_freetree(mm, order, flags);
 }
 
 /**
@@ -1310,7 +1503,7 @@ int gpu_buddy_alloc_blocks(struct gpu_buddy *mm,
                if (!IS_ALIGNED(start | end, min_block_size))
                        return -EINVAL;
 
-               return __gpu_buddy_alloc_range(mm, start, size, NULL, blocks);
+               return __gpu_buddy_alloc_range(mm, start, size, flags, NULL, 
blocks);
        }
 
        original_size = size;
@@ -1336,7 +1529,7 @@ int gpu_buddy_alloc_blocks(struct gpu_buddy *mm,
                if ((flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION) &&
                    !(flags & GPU_BUDDY_RANGE_ALLOCATION))
                        return __alloc_contig_try_harder(mm, original_size,
-                                                        original_min_size, 
blocks);
+                                                        original_min_size, 
flags, blocks);
 
                return -EINVAL;
        }
@@ -1351,8 +1544,6 @@ int gpu_buddy_alloc_blocks(struct gpu_buddy *mm,
                BUG_ON(size >= min_block_size && order < min_order);
 
                do {
-                       unsigned int fallback_order;
-
                        block = __gpu_buddy_alloc_blocks(mm, start,
                                                         end,
                                                         size,
@@ -1362,30 +1553,11 @@ int gpu_buddy_alloc_blocks(struct gpu_buddy *mm,
                        if (!IS_ERR(block))
                                break;
 
-                       if (size < min_block_size) {
-                               fallback_order = order;
-                       } else if (order == min_order) {
-                               fallback_order = min_order;
-                       } else {
+                       if (size >= min_block_size && order > min_order) {
                                order--;
                                continue;
                        }
 
-                       /* Try allocation through force merge method */
-                       if (mm->clear_avail &&
-                           !__force_merge(mm, start, end, fallback_order)) {
-                               block = __gpu_buddy_alloc_blocks(mm, start,
-                                                                end,
-                                                                size,
-                                                                min_block_size,
-                                                                fallback_order,
-                                                                flags);
-                               if (!IS_ERR(block)) {
-                                       order = fallback_order;
-                                       break;
-                               }
-                       }
-
                        /*
                         * Try contiguous block allocation through
                         * try harder method.
@@ -1395,6 +1567,7 @@ int gpu_buddy_alloc_blocks(struct gpu_buddy *mm,
                                return __alloc_contig_try_harder(mm,
                                                                 original_size,
                                                                 
original_min_size,
+                                                                flags,
                                                                 blocks);
                        err = -ENOSPC;
                        goto err_free;
@@ -1402,8 +1575,23 @@ int gpu_buddy_alloc_blocks(struct gpu_buddy *mm,
 
                mark_allocated(mm, block);
                mm->avail -= gpu_buddy_block_size(mm, block);
-               if (gpu_buddy_block_is_clear(block))
-                       mm->clear_avail -= gpu_buddy_block_size(mm, block);
+
+               /*
+                * Mark the block CLEAR only if the caller requested cleared
+                * memory and the entire block lies within a cleared extent.
+                * Either way, drop the block's range from the clear tracker
+                * since it is now allocated.
+                */
+               if (flags & GPU_BUDDY_CLEAR_ALLOCATION &&
+                   gpu_clear_tracker_is_clear(&mm->clear,
+                                              gpu_buddy_block_offset(block),
+                                              gpu_buddy_block_size(mm, block)))
+                       block->header |= GPU_BUDDY_HEADER_CLEAR;
+
+               gpu_clear_tracker_mark_dirty(&mm->clear,
+                                            gpu_buddy_block_offset(block),
+                                            gpu_buddy_block_size(mm, block));
+               mm->clear_avail = mm->clear.total_clear;
                kmemleak_update_trace(block);
                list_add_tail(&block->link, &allocated);
 
@@ -1473,26 +1661,52 @@ EXPORT_SYMBOL(gpu_buddy_block_print);
  */
 void gpu_buddy_print(struct gpu_buddy *mm)
 {
+       u64 *clear_count;
+       unsigned int i;
        int order;
 
        pr_info("chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB, 
clear_free: %lluMiB\n",
                mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20, 
mm->clear_avail >> 20);
 
+       clear_count = kcalloc(mm->max_order + 1, sizeof(*clear_count), 
GFP_KERNEL);
+
+       if (clear_count) {
+               LIST_HEAD(dfs);
+
+               for (i = 0; i < mm->n_roots; i++)
+                       list_add_tail(&mm->roots[i]->tmp_link, &dfs);
+
+               while (!list_empty(&dfs)) {
+                       struct gpu_buddy_block *block =
+                               list_first_entry(&dfs, struct gpu_buddy_block, 
tmp_link);
+
+                       list_del(&block->tmp_link);
+
+                       if (gpu_buddy_block_is_split(block)) {
+                               list_add(&block->right->tmp_link, &dfs);
+                               list_add(&block->left->tmp_link, &dfs);
+                       } else if (gpu_buddy_block_is_free(block) &&
+                                  gpu_buddy_block_is_clear(block)) {
+                               clear_count[gpu_buddy_block_order(block)]++;
+                       }
+               }
+       }
+
        for (order = mm->max_order; order >= 0; order--) {
                struct gpu_buddy_block *block, *tmp;
                struct rb_root *root;
                u64 count = 0, free;
-               unsigned int tree;
 
-               for_each_free_tree(tree) {
-                       root = &mm->free_trees[tree][order];
+               root = &mm->free_tree[order];
 
-                       rbtree_postorder_for_each_entry_safe(block, tmp, root, 
rb) {
-                               BUG_ON(!gpu_buddy_block_is_free(block));
-                               count++;
-                       }
+               rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) {
+                       BUG_ON(!gpu_buddy_block_is_free(block));
+                       count++;
                }
 
+               if (clear_count)
+                       count += clear_count[order];
+
                free = count * (mm->chunk_size << order);
                if (free < SZ_1M)
                        pr_info("order-%2d free: %8llu KiB, blocks: %llu\n",
@@ -1501,11 +1715,14 @@ void gpu_buddy_print(struct gpu_buddy *mm)
                        pr_info("order-%2d free: %8llu MiB, blocks: %llu\n",
                                order, free >> 20, count);
        }
+
+       kfree(clear_count);
 }
 EXPORT_SYMBOL(gpu_buddy_print);
 
 static void gpu_buddy_module_exit(void)
 {
+       kmem_cache_destroy(slab_extents);
        kmem_cache_destroy(slab_blocks);
 }
 
@@ -1515,6 +1732,12 @@ static int __init gpu_buddy_module_init(void)
        if (!slab_blocks)
                return -ENOMEM;
 
+       slab_extents = KMEM_CACHE(gpu_clear_extent, 0);
+       if (!slab_extents) {
+               kmem_cache_destroy(slab_blocks);
+               return -ENOMEM;
+       }
+
        return 0;
 }
 
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 841f3de5f307..a0ba2e61ec23 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -9,6 +9,7 @@
 #include <linux/kmemleak.h>
 #include <linux/module.h>
 #include <linux/sizes.h>
+#include <linux/slab.h>
 
 #include <linux/gpu_buddy.h>
 #include <drm/drm_buddy.h>
@@ -40,26 +41,51 @@ EXPORT_SYMBOL(drm_buddy_block_print);
  */
 void drm_buddy_print(struct gpu_buddy *mm, struct drm_printer *p)
 {
+       u64 *clear_count;
        int order;
+       unsigned int i;
 
        drm_printf(p, "chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB, 
clear_free: %lluMiB\n",
                   mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20, 
mm->clear_avail >> 20);
 
+       clear_count = kcalloc(mm->max_order + 1, sizeof(*clear_count), 
GFP_KERNEL);
+
+       if (clear_count) {
+               LIST_HEAD(dfs);
+
+               for (i = 0; i < mm->n_roots; i++)
+                       list_add_tail(&mm->roots[i]->tmp_link, &dfs);
+
+               while (!list_empty(&dfs)) {
+                       struct gpu_buddy_block *block =
+                               list_first_entry(&dfs, struct gpu_buddy_block, 
tmp_link);
+
+                       list_del(&block->tmp_link);
+
+                       if ((block->header & GPU_BUDDY_HEADER_STATE) == 
GPU_BUDDY_SPLIT) {
+                               list_add(&block->right->tmp_link, &dfs);
+                               list_add(&block->left->tmp_link, &dfs);
+                       } else if (gpu_buddy_block_is_free(block) &&
+                                  gpu_buddy_block_is_clear(block)) {
+                               clear_count[gpu_buddy_block_order(block)]++;
+                       }
+               }
+       }
+
        for (order = mm->max_order; order >= 0; order--) {
                struct gpu_buddy_block *block, *tmp;
                struct rb_root *root;
                u64 count = 0, free;
-               unsigned int tree;
-
-               for_each_free_tree(tree) {
-                       root = &mm->free_trees[tree][order];
 
-                       rbtree_postorder_for_each_entry_safe(block, tmp, root, 
rb) {
-                               BUG_ON(!gpu_buddy_block_is_free(block));
-                               count++;
-                       }
+               root = &mm->free_tree[order];
+               rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) {
+                       BUG_ON(!gpu_buddy_block_is_free(block));
+                       count++;
                }
 
+               if (clear_count)
+                       count += clear_count[order];
+
                drm_printf(p, "order-%2d ", order);
 
                free = count * (mm->chunk_size << order);
@@ -70,6 +96,8 @@ void drm_buddy_print(struct gpu_buddy *mm, struct drm_printer 
*p)
 
                drm_printf(p, ", blocks: %llu\n", count);
        }
+
+       kfree(clear_count);
 }
 EXPORT_SYMBOL(drm_buddy_print);
 
diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h
index 5fa917ba5450..435c873f3469 100644
--- a/include/linux/gpu_buddy.h
+++ b/include/linux/gpu_buddy.h
@@ -67,15 +67,6 @@
  */
 #define GPU_BUDDY_TRIM_DISABLE                 BIT(5)
 
-enum gpu_buddy_free_tree {
-       GPU_BUDDY_CLEAR_TREE = 0,
-       GPU_BUDDY_DIRTY_TREE,
-       GPU_BUDDY_MAX_FREE_TREES,
-};
-
-#define for_each_free_tree(tree) \
-       for ((tree) = 0; (tree) < GPU_BUDDY_MAX_FREE_TREES; (tree)++)
-
 /**
  * struct gpu_buddy_block - Block within a buddy allocator
  *
@@ -103,6 +94,14 @@ struct gpu_buddy_block {
 #define   GPU_BUDDY_ALLOCATED     (1 << 10)
 #define   GPU_BUDDY_FREE          (2 << 10)
 #define   GPU_BUDDY_SPLIT         (3 << 10)
+/*
+ * GPU_BUDDY_HEADER_CLEAR has two roles:
+ *  - FREE state:      set when the block's full range is cleared (tracker
+ *                     confirmed).  Cleared free blocks float in the buddy
+ *                     tree and are NOT inserted into free_tree[].
+ *  - ALLOCATED state: set when the block was served from cleared memory,
+ *                     informing the caller that no GPU clear pass is needed.
+ */
 #define GPU_BUDDY_HEADER_CLEAR  GENMASK_ULL(9, 9)
 /* Free to be used, if needed in the future */
 #define GPU_BUDDY_HEADER_UNUSED GENMASK_ULL(8, 6)
@@ -135,6 +134,38 @@ struct gpu_buddy_block {
 /* Order-zero must be at least SZ_4K */
 #define GPU_BUDDY_MAX_ORDER (63 - 12)
 
+/**
+ * struct gpu_clear_extent - a contiguous cleared (zeroed) address range
+ *
+ * Tracks a single contiguous address range whose memory content is known
+ * to be zeroed.  Extents are non-overlapping and stored in an augmented
+ * red-black tree sorted by @start.  The augmented value @subtree_max_size
+ * allows O(log N) search for an extent of at least a given size.
+ */
+struct gpu_clear_extent {
+/* private: */
+       struct rb_node  rb;
+       u64             start;
+       u64             end;
+       u64             subtree_max_size;
+};
+
+/**
+ * struct gpu_clear_tracker - tracks cleared (zeroed) address intervals
+ *
+ * Maintains a set of non-overlapping cleared extents as an augmented
+ * red-black tree.  The tracker is embedded inside struct gpu_buddy and
+ * replaces the former dual (clear/dirty) free-tree scheme.
+ *
+ * @total_clear: Total bytes of cleared memory currently tracked.
+ */
+struct gpu_clear_tracker {
+/* private: */
+       struct rb_root  root;
+/* public: */
+       u64             total_clear;
+};
+
 /**
  * struct gpu_buddy - GPU binary buddy allocator
  *
@@ -158,13 +189,12 @@ struct gpu_buddy_block {
 struct gpu_buddy {
 /* private: */
        /*
-        * Array of red-black trees for free block management.
-        * Indexed as free_trees[clear/dirty][order] where:
-        * - Index 0 (GPU_BUDDY_CLEAR_TREE): blocks with zeroed content
-        * - Index 1 (GPU_BUDDY_DIRTY_TREE): blocks with unknown content
-        * Each tree holds free blocks of the corresponding order.
+        * One RB-tree per order containing only dirty/mixed free blocks.
+        * Cleared free blocks are NOT inserted here; they float in the buddy
+        * tree and are located exclusively via the @clear tracker.
         */
-       struct rb_root **free_trees;
+       struct rb_root *free_tree;
+
        /*
         * Array of root blocks representing the top-level blocks of the
         * binary tree(s). Multiple roots exist when the total size is not
@@ -172,6 +202,7 @@ struct gpu_buddy {
         * that fits in the remaining space.
         */
        struct gpu_buddy_block **roots;
+       struct gpu_clear_tracker clear;
 /* public: */
        unsigned int n_roots;
        unsigned int max_order;

base-commit: d37690b5e02418a2365548300628ef3895a24ed2
-- 
2.34.1

[PATCH v2 1/2] gpu/buddy: replace dual-tree/force_merge with decoupled clear tracker

Reply via email to