Add gpu_test_buddy_clear_tracker_performance test case that measures allocation latency before and after replacing the dual-tree / force_merge design with a decoupled clear tracker.
Two scenarios are covered:

1. Single contiguous allocation after fragmentation

   A 4 GiB pool is filled with 4 KiB blocks and freed in alternating
   clear/dirty order, so every buddy pair ends up split across the two
   trees and cannot coalesce at free() time. A single contiguous 4 GiB
   allocation then takes ~61 ms on the dual-tree design (the alloc path
   has to invoke __force_merge() to climb back up to max_order) and
   ~25 ms with the clear tracker (the pool is already coalesced at
   free() time).

2. Repeated allocations from a fragmented pool

   Same 4 GiB pool, freed with even-indexed blocks cleared and
   odd-indexed blocks dirty, so every adjacent buddy pair sits on
   opposite sides of the merge barrier. 16384 x 256 KiB allocations
   then take ~80 ms on the dual-tree design (each alloc pays the
   __force_merge() cost) and ~39 ms with the clear tracker (free-time
   merging makes each alloc an O(log N) split).

v2:
  - Removed unwanted sub tests

Cc: Matthew Auld <[email protected]>
Cc: Christian König <[email protected]>
Signed-off-by: Arunpravin Paneer Selvam <[email protected]>
---
 drivers/gpu/tests/gpu_buddy_test.c | 127 ++++++++++++++++++++++++++---
 1 file changed, 117 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/tests/gpu_buddy_test.c b/drivers/gpu/tests/gpu_buddy_test.c
index e0d24a4542b2..5495d0b8ec0c 100644
--- a/drivers/gpu/tests/gpu_buddy_test.c
+++ b/drivers/gpu/tests/gpu_buddy_test.c
@@ -38,7 +38,7 @@ static void gpu_test_buddy_subtree_offset_alignment_stress(struct kunit *test)
 	};
 	struct list_head allocated[ARRAY_SIZE(alignments)];
 	unsigned int i, max_subtree_align = 0;
-	int ret, tree, order;
+	int ret, order;
 	struct gpu_buddy mm;
 
 	KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, SZ_4K),
@@ -93,15 +93,13 @@ static void gpu_test_buddy_subtree_offset_alignment_stress(struct kunit *test)
 		gpu_buddy_free_list(&mm, &allocated[i], 0);
 
 		for (order = 0; order <= mm.max_order; order++) {
-			{
-				node = mm.free_tree[order].rb_node;
-				if (!node)
-					continue;
-
-				block = container_of(node, struct gpu_buddy_block, rb);
-				max_subtree_align = max(max_subtree_align,
-							block->subtree_max_alignment);
-			}
+			node = mm.free_tree[order].rb_node;
+			if (!node)
+				continue;
+
+			block = container_of(node, struct gpu_buddy_block, rb);
+			max_subtree_align = max(max_subtree_align,
+						block->subtree_max_alignment);
 		}
 
 		KUNIT_EXPECT_GE(test, max_subtree_align, ilog2(alignments[i]));
@@ -285,6 +283,114 @@ static void gpu_test_buddy_fragmentation_performance(struct kunit *test)
 	gpu_buddy_fini(&mm);
 }
 
+static void gpu_test_buddy_clear_tracker_performance(struct kunit *test)
+{
+	struct gpu_buddy_block *block, *tmp;
+	unsigned long elapsed_ms;
+	LIST_HEAD(clear_blocks);
+	LIST_HEAD(dirty_blocks);
+	LIST_HEAD(allocated);
+	struct gpu_buddy mm;
+	LIST_HEAD(results);
+	ktime_t start, end;
+	int i, count;
+
+	/*
+	 * Contiguous alloc latency after alternating clear/dirty fragmentation
+	 *
+	 * Fill a 4 GiB pool with 4 KiB allocations, partition them into
+	 * alternating cleared and dirty sets, then free both. In the old
+	 * dual-tree design every adjacent buddy pair has one cleared half and
+	 * one dirty half, so the pair sits on opposite sides of the clear/dirty
+	 * merge barrier and cannot be coalesced at free() time. The pool
+	 * stays fully fragmented and the subsequent contiguous 4 GiB allocation
+	 * has to invoke __force_merge() to climb back up to max_order before
+	 * it can succeed. With the clear-tracker design buddy pairs coalesce
+	 * unconditionally during free(), so the pool is already at max_order
+	 * before the timed alloc begins and __force_merge() is not needed.
+	 */
+	KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, SZ_4G, SZ_4K),
+			       "buddy_init failed\n");
+
+	for (i = 0; i < SZ_4G / SZ_4K; i++)
+		KUNIT_ASSERT_FALSE_MSG(test,
+				       gpu_buddy_alloc_blocks(&mm, 0, SZ_4G, SZ_4K, SZ_4K,
+							      &allocated, 0),
+				       "buddy_alloc hit an error size=%u\n", SZ_4K);
+
+	count = 0;
+	list_for_each_entry_safe(block, tmp, &allocated, link) {
+		if (count++ % 2 == 0)
+			list_move_tail(&block->link, &clear_blocks);
+		else
+			list_move_tail(&block->link, &dirty_blocks);
+	}
+
+	gpu_buddy_free_list(&mm, &clear_blocks, GPU_BUDDY_CLEARED);
+	gpu_buddy_free_list(&mm, &dirty_blocks, 0);
+
+	start = ktime_get();
+	KUNIT_ASSERT_FALSE_MSG(test,
+			       gpu_buddy_alloc_blocks(&mm, 0, SZ_4G, SZ_4G, SZ_4K,
+						      &results, 0),
+			       "contiguous alloc failed\n");
+	end = ktime_get();
+	elapsed_ms = ktime_to_ms(ktime_sub(end, start));
+
+	kunit_info(test, "Contiguous alloc after fragmentation: %lu ms\n",
+		   elapsed_ms);
+
+	gpu_buddy_free_list(&mm, &results, 0);
+	gpu_buddy_fini(&mm);
+
+	/*
+	 * Repeated alloc throughput from a maximally fragmented pool
+	 *
+	 * Fill a 4 GiB pool with 4 KiB allocations, free even-indexed blocks
+	 * as cleared and odd-indexed blocks as dirty. The alternating pattern
+	 * ensures every adjacent buddy pair has one cleared half and one dirty
+	 * half, so each pair lands on opposite sides of the old merge barrier.
+	 * Each of the 16384 x 256 KiB allocations in the timed loop has to
+	 * pay the __force_merge() cost on the alloc path under the old design.
+	 * With the clear-tracker design the pool collapses to one max_order
+	 * block during free(), so each alloc is a simple O(log N) split.
+	 */
+	KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, SZ_4G, SZ_4K),
+			       "buddy_init failed\n");
+
+	for (i = 0; i < SZ_4G / SZ_4K; i++)
+		KUNIT_ASSERT_FALSE_MSG(test,
+				       gpu_buddy_alloc_blocks(&mm, 0, SZ_4G, SZ_4K, SZ_4K,
+							      &allocated, 0),
+				       "buddy_alloc hit an error size=%u\n", SZ_4K);
+
+	count = 0;
+	list_for_each_entry_safe(block, tmp, &allocated, link) {
+		if (count++ % 2 == 0)
+			list_move_tail(&block->link, &clear_blocks);
+		else
+			list_move_tail(&block->link, &dirty_blocks);
+	}
+
+	gpu_buddy_free_list(&mm, &clear_blocks, GPU_BUDDY_CLEARED);
+	gpu_buddy_free_list(&mm, &dirty_blocks, 0);
+
+	start = ktime_get();
+	for (i = 0; i < SZ_4G / SZ_256K; i++)
+		KUNIT_ASSERT_FALSE_MSG(test,
+				       gpu_buddy_alloc_blocks(&mm, 0, SZ_4G, SZ_256K, SZ_4K,
+							      &results, 0),
+				       "buddy_alloc hit an error size=%u\n", SZ_256K);
+	end = ktime_get();
+	elapsed_ms = ktime_to_ms(ktime_sub(end, start));
+
+	kunit_info(test, "Repeated 256 KiB allocs from fragmented pool: %lu ms\n",
+		   elapsed_ms);
+
+	gpu_buddy_free_list(&mm, &results, 0);
+	gpu_buddy_fini(&mm);
+}
+
 static void gpu_test_buddy_alloc_range_bias(struct kunit *test)
 {
 	u32 mm_size, size, ps, bias_size, bias_start, bias_end, bias_rem;
@@ -1398,6 +1504,7 @@ static struct kunit_case gpu_buddy_tests[] = {
 	KUNIT_CASE(gpu_test_buddy_alloc_range),
 	KUNIT_CASE(gpu_test_buddy_alloc_range_bias),
 	KUNIT_CASE_SLOW(gpu_test_buddy_fragmentation_performance),
+	KUNIT_CASE_SLOW(gpu_test_buddy_clear_tracker_performance),
 	KUNIT_CASE(gpu_test_buddy_alloc_exceeds_max_order),
 	KUNIT_CASE(gpu_test_buddy_offset_aligned_allocation),
 	KUNIT_CASE(gpu_test_buddy_subtree_offset_alignment_stress),
-- 
2.34.1
