Add gpu_test_buddy_clear_tracker_performance test case that measures
allocation latency before and after replacing the dual-tree /
force_merge design with a decoupled clear tracker.

Two scenarios are covered.

1. Single contiguous allocation after fragmentation. A 4 GiB pool is
   filled with 4 KiB blocks and freed in alternating clear/dirty order,
   so every buddy pair ends up split across the two trees and cannot
   coalesce at free() time. A single contiguous 4 GiB allocation then
   takes ~61 ms on the dual-tree design (the alloc path has to invoke
   __force_merge() to climb back up to max_order) and ~25 ms with the
   clear tracker (the pool is already coalesced at free() time).

2. Repeated allocations from a fragmented pool. Same 4 GiB pool, freed
   with even-indexed blocks cleared and odd-indexed dirty so every
   adjacent buddy pair sits on opposite sides of the merge barrier.
   16384 x 256 KiB allocations then take ~80 ms on the dual-tree
   design (each alloc pays the __force_merge() cost) and ~39 ms with
   the clear tracker (free-time merging makes each alloc an O(log N)
   split).

v2:
 - Removed unwanted sub-tests

Cc: Matthew Auld <[email protected]>
Cc: Christian König <[email protected]>
Signed-off-by: Arunpravin Paneer Selvam <[email protected]>
---
 drivers/gpu/tests/gpu_buddy_test.c | 127 ++++++++++++++++++++++++++---
 1 file changed, 117 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/tests/gpu_buddy_test.c 
b/drivers/gpu/tests/gpu_buddy_test.c
index e0d24a4542b2..5495d0b8ec0c 100644
--- a/drivers/gpu/tests/gpu_buddy_test.c
+++ b/drivers/gpu/tests/gpu_buddy_test.c
@@ -38,7 +38,7 @@ static void 
gpu_test_buddy_subtree_offset_alignment_stress(struct kunit *test)
        };
        struct list_head allocated[ARRAY_SIZE(alignments)];
        unsigned int i, max_subtree_align = 0;
-       int ret, tree, order;
+       int ret, order;
        struct gpu_buddy mm;
 
        KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, SZ_4K),
@@ -93,15 +93,13 @@ static void 
gpu_test_buddy_subtree_offset_alignment_stress(struct kunit *test)
                gpu_buddy_free_list(&mm, &allocated[i], 0);
 
                for (order = 0; order <= mm.max_order; order++) {
-                       {
-                               node = mm.free_tree[order].rb_node;
-                               if (!node)
-                                       continue;
-
-                               block = container_of(node, struct 
gpu_buddy_block, rb);
-                               max_subtree_align = max(max_subtree_align,
-                                                       
block->subtree_max_alignment);
-                       }
+                       node = mm.free_tree[order].rb_node;
+                       if (!node)
+                               continue;
+
+                       block = container_of(node, struct gpu_buddy_block, rb);
+                       max_subtree_align = max(max_subtree_align,
+                                               block->subtree_max_alignment);
                }
 
                KUNIT_EXPECT_GE(test, max_subtree_align, ilog2(alignments[i]));
@@ -285,6 +283,114 @@ static void 
gpu_test_buddy_fragmentation_performance(struct kunit *test)
        gpu_buddy_fini(&mm);
 }
 
+static void gpu_test_buddy_clear_tracker_performance(struct kunit *test)
+{
+       struct gpu_buddy_block *block, *tmp;
+       unsigned long elapsed_ms;
+       LIST_HEAD(clear_blocks);
+       LIST_HEAD(dirty_blocks);
+       LIST_HEAD(allocated);
+       struct gpu_buddy mm;
+       LIST_HEAD(results);
+       ktime_t start, end;
+       int i, count;
+
+       /*
+        * Contiguous alloc latency after alternating clear/dirty fragmentation
+        *
+        * Fill a 4 GiB pool with 4 KiB allocations, partition them into
+        * alternating cleared and dirty sets, then free both.  In the old
+        * dual-tree design every adjacent buddy pair has one cleared half and
+        * one dirty half, so the pair sits on opposite sides of the clear/dirty
+        * merge barrier and cannot be coalesced at free() time.  The pool
+        * stays fully fragmented and the subsequent contiguous 4 GiB allocation
+        * has to invoke __force_merge() to climb back up to max_order before
+        * it can succeed.  With the clear-tracker design buddy pairs coalesce
+        * unconditionally during free(), so the pool is already at max_order
+        * before the timed alloc begins and __force_merge() is not needed.
+        */
+       KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, SZ_4G, SZ_4K),
+                              "buddy_init failed\n");
+
+       for (i = 0; i < SZ_4G / SZ_4K; i++)
+               KUNIT_ASSERT_FALSE_MSG(test,
+                                      gpu_buddy_alloc_blocks(&mm, 0, SZ_4G, 
SZ_4K, SZ_4K,
+                                                             &allocated, 0),
+                                      "buddy_alloc hit an error size=%u\n", 
SZ_4K);
+
+       count = 0;
+       list_for_each_entry_safe(block, tmp, &allocated, link) {
+               if (count++ % 2 == 0)
+                       list_move_tail(&block->link, &clear_blocks);
+               else
+                       list_move_tail(&block->link, &dirty_blocks);
+       }
+
+       gpu_buddy_free_list(&mm, &clear_blocks, GPU_BUDDY_CLEARED);
+       gpu_buddy_free_list(&mm, &dirty_blocks, 0);
+
+       start = ktime_get();
+       KUNIT_ASSERT_FALSE_MSG(test,
+                              gpu_buddy_alloc_blocks(&mm, 0, SZ_4G, SZ_4G, 
SZ_4K,
+                                                     &results, 0),
+                              "contiguous alloc failed\n");
+       end = ktime_get();
+       elapsed_ms = ktime_to_ms(ktime_sub(end, start));
+
+       kunit_info(test, "Contiguous alloc after fragmentation: %lu ms\n",
+                  elapsed_ms);
+
+       gpu_buddy_free_list(&mm, &results, 0);
+       gpu_buddy_fini(&mm);
+
+       /*
+        * Repeated alloc throughput from a maximally fragmented pool
+        *
+        * Fill a 4 GiB pool with 4 KiB allocations, free even-indexed blocks
+        * as cleared and odd-indexed blocks as dirty.  The alternating pattern
+        * ensures every adjacent buddy pair has one cleared half and one dirty
+        * half, so each pair lands on opposite sides of the old merge barrier.
+        * Each of the 16 384 x 256 KiB allocations in the timed loop has to
+        * pay the __force_merge() cost on the alloc path under the old design.
+        * With the clear-tracker design the pool collapses to one max_order
+        * block during free(), so each alloc is a simple O(log N) split.
+        */
+       KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, SZ_4G, SZ_4K),
+                              "buddy_init failed\n");
+
+       for (i = 0; i < SZ_4G / SZ_4K; i++)
+               KUNIT_ASSERT_FALSE_MSG(test,
+                                      gpu_buddy_alloc_blocks(&mm, 0, SZ_4G, 
SZ_4K, SZ_4K,
+                                                             &allocated, 0),
+                                      "buddy_alloc hit an error size=%u\n", 
SZ_4K);
+
+       count = 0;
+       list_for_each_entry_safe(block, tmp, &allocated, link) {
+               if (count++ % 2 == 0)
+                       list_move_tail(&block->link, &clear_blocks);
+               else
+                       list_move_tail(&block->link, &dirty_blocks);
+       }
+
+       gpu_buddy_free_list(&mm, &clear_blocks, GPU_BUDDY_CLEARED);
+       gpu_buddy_free_list(&mm, &dirty_blocks, 0);
+
+       start = ktime_get();
+       for (i = 0; i < SZ_4G / SZ_256K; i++)
+               KUNIT_ASSERT_FALSE_MSG(test,
+                                      gpu_buddy_alloc_blocks(&mm, 0, SZ_4G, 
SZ_256K, SZ_4K,
+                                                             &results, 0),
+                                      "buddy_alloc hit an error size=%u\n", 
SZ_256K);
+       end = ktime_get();
+       elapsed_ms = ktime_to_ms(ktime_sub(end, start));
+
+       kunit_info(test, "Repeated 256 KiB allocs from fragmented pool: %lu 
ms\n",
+                  elapsed_ms);
+
+       gpu_buddy_free_list(&mm, &results, 0);
+       gpu_buddy_fini(&mm);
+}
+
 static void gpu_test_buddy_alloc_range_bias(struct kunit *test)
 {
        u32 mm_size, size, ps, bias_size, bias_start, bias_end, bias_rem;
@@ -1398,6 +1504,7 @@ static struct kunit_case gpu_buddy_tests[] = {
        KUNIT_CASE(gpu_test_buddy_alloc_range),
        KUNIT_CASE(gpu_test_buddy_alloc_range_bias),
        KUNIT_CASE_SLOW(gpu_test_buddy_fragmentation_performance),
+       KUNIT_CASE_SLOW(gpu_test_buddy_clear_tracker_performance),
        KUNIT_CASE(gpu_test_buddy_alloc_exceeds_max_order),
        KUNIT_CASE(gpu_test_buddy_offset_aligned_allocation),
        KUNIT_CASE(gpu_test_buddy_subtree_offset_alignment_stress),
-- 
2.34.1

Reply via email to