Re: [PATCH v2 3/9] drm/ttm, drm/amdgpu, drm/xe: Consider hitch moves within bulk sublist moves
On Tue, Apr 16, 2024 at 12:07:24PM +0200, Thomas Hellström wrote:
> To address the problem with hitches moving when bulk move
> sublists are lru-bumped, register the list cursors with the
> ttm_lru_bulk_move structure when traversing its list, and
> when lru-bumping the list, move the cursor hitch to the tail.
> This also means it's mandatory for drivers to call
> ttm_lru_bulk_move_init() and ttm_lru_bulk_move_fini() when
> initializing and finalizing the bulk move structure, so add
> those calls to the amdgpu- and xe driver.
>
> Compared to v1 this is slightly more code but less fragile
> and hopefully easier to understand.
>
> Changes in previous series:
> - Completely rework the functionality
> - Avoid a NULL pointer dereference assigning manager->mem_type
> - Remove some leftover code causing build problems
> v2:
> - For hitch bulk tail moves, store the mem_type in the cursor
>   instead of with the manager.
>
> Cc: Christian König
> Cc: Somalapuram Amaranath
> Cc:
> Signed-off-by: Thomas Hellström
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |  4 ++
>  drivers/gpu/drm/ttm/ttm_resource.c     | 92 +-
>  drivers/gpu/drm/xe/xe_vm.c             |  4 ++
>  include/drm/ttm/ttm_resource.h         | 58 ++--
>  4 files changed, 137 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 4299ce386322..18bf174c8d47 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2368,6 +2368,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  	if (r)
>  		return r;
>
> +	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
> +
>  	vm->is_compute_context = false;
>
>  	vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
> @@ -2431,6 +2433,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  error_free_delayed:
>  	dma_fence_put(vm->last_tlb_flush);
>  	dma_fence_put(vm->last_unlocked);
> +	ttm_lru_bulk_move_fini(&adev->mman.bdev, &vm->lru_bulk_move);
>  	amdgpu_vm_fini_entities(vm);
>
>  	return r;
> @@ -2587,6 +2590,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>  		}
>  	}
>
> +	ttm_lru_bulk_move_fini(&adev->mman.bdev, &vm->lru_bulk_move);
>  }
>
>  /**
> diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c
> index 22f8ae4ff4c0..2b93727c78e5 100644
> --- a/drivers/gpu/drm/ttm/ttm_resource.c
> +++ b/drivers/gpu/drm/ttm/ttm_resource.c
> @@ -32,6 +32,49 @@
>
>  #include <drm/ttm/ttm_resource.h>
>
> +/* Detach the cursor from the bulk move list*/
> +static void
> +ttm_resource_cursor_clear_bulk(struct ttm_resource_cursor *cursor)
> +{
> +	cursor->bulk = NULL;
> +	list_del_init(&cursor->bulk_link);
> +}
> +
> +/* Move the cursor to the end of the bulk move list it's in */
> +static void ttm_resource_cursor_move_bulk_tail(struct ttm_lru_bulk_move *bulk,
> +					       struct ttm_resource_cursor *cursor)
> +{
> +	struct ttm_lru_bulk_move_pos *pos;
> +
> +	if (WARN_ON_ONCE(bulk != cursor->bulk)) {
> +		list_del_init(&cursor->bulk_link);
> +		return;
> +	}
> +
> +	pos = &bulk->pos[cursor->mem_type][cursor->priority];
> +	if (pos)
> +		list_move(&cursor->hitch.link, &pos->last->lru.link);
> +	ttm_resource_cursor_clear_bulk(cursor);
> +}
> +
> +/* Move all cursors attached to a bulk move to its end */
> +static void ttm_bulk_move_adjust_cursors(struct ttm_lru_bulk_move *bulk)
> +{
> +	struct ttm_resource_cursor *cursor, *next;
> +
> +	list_for_each_entry_safe(cursor, next, &bulk->cursor_list, bulk_link)
> +		ttm_resource_cursor_move_bulk_tail(bulk, cursor);
> +}
> +
> +/* Remove a cursor from an empty bulk move list */
> +static void ttm_bulk_move_drop_cursors(struct ttm_lru_bulk_move *bulk)
> +{
> +	struct ttm_resource_cursor *cursor, *next;
> +
> +	list_for_each_entry_safe(cursor, next, &bulk->cursor_list, bulk_link)
> +		ttm_resource_cursor_clear_bulk(cursor);
> +}
> +
>  /**
>   * ttm_resource_cursor_fini_locked() - Finalize the LRU list cursor usage
>   * @cursor: The struct ttm_resource_cursor to finalize.
> @@ -44,6 +87,7 @@ void ttm_resource_cursor_fini_locked(struct ttm_resource_cursor *cursor)
>  {
>  	lockdep_assert_held(&cursor->man->bdev->lru_lock);
>  	list_del_init(&cursor->hitch.link);
> +	ttm_resource_cursor_clear_bulk(cursor);
>  }
>
>  /**
> @@ -72,9 +116,27 @@ void ttm_resource_cursor_fini(struct ttm_resource_cursor *cursor)
>  void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk)
>  {
>  	memset(bulk, 0, sizeof(*bulk));
> +	INIT_LIST_HEAD(&bulk->cursor_list);
>  }
>  EXPORT_SYMBOL(ttm_lru_bulk_move_init);
>
> +/**
> + * ttm_lru_bulk_move_fini - finalize a bulk move structure
> + * @bdev: The struct ttm_device
> + * @bulk: the structure to finalize
> + *
> + * Sanity checks
[PATCH v2 3/9] drm/ttm, drm/amdgpu, drm/xe: Consider hitch moves within bulk sublist moves
To address the problem with hitches moving when bulk move
sublists are lru-bumped, register the list cursors with the
ttm_lru_bulk_move structure when traversing its list, and
when lru-bumping the list, move the cursor hitch to the tail.
This also means it's mandatory for drivers to call
ttm_lru_bulk_move_init() and ttm_lru_bulk_move_fini() when
initializing and finalizing the bulk move structure, so add
those calls to the amdgpu- and xe driver.

Compared to v1 this is slightly more code but less fragile
and hopefully easier to understand.

Changes in previous series:
- Completely rework the functionality
- Avoid a NULL pointer dereference assigning manager->mem_type
- Remove some leftover code causing build problems
v2:
- For hitch bulk tail moves, store the mem_type in the cursor
  instead of with the manager.

Cc: Christian König
Cc: Somalapuram Amaranath
Cc:
Signed-off-by: Thomas Hellström
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |  4 ++
 drivers/gpu/drm/ttm/ttm_resource.c     | 92 +-
 drivers/gpu/drm/xe/xe_vm.c             |  4 ++
 include/drm/ttm/ttm_resource.h         | 58 ++--
 4 files changed, 137 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 4299ce386322..18bf174c8d47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2368,6 +2368,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	if (r)
 		return r;

+	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
+
 	vm->is_compute_context = false;

 	vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
@@ -2431,6 +2433,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 error_free_delayed:
 	dma_fence_put(vm->last_tlb_flush);
 	dma_fence_put(vm->last_unlocked);
+	ttm_lru_bulk_move_fini(&adev->mman.bdev, &vm->lru_bulk_move);
 	amdgpu_vm_fini_entities(vm);

 	return r;
@@ -2587,6 +2590,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 		}
 	}

+	ttm_lru_bulk_move_fini(&adev->mman.bdev, &vm->lru_bulk_move);
 }

 /**
diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c
index 22f8ae4ff4c0..2b93727c78e5 100644
--- a/drivers/gpu/drm/ttm/ttm_resource.c
+++ b/drivers/gpu/drm/ttm/ttm_resource.c
@@ -32,6 +32,49 @@

 #include <drm/ttm/ttm_resource.h>

+/* Detach the cursor from the bulk move list*/
+static void
+ttm_resource_cursor_clear_bulk(struct ttm_resource_cursor *cursor)
+{
+	cursor->bulk = NULL;
+	list_del_init(&cursor->bulk_link);
+}
+
+/* Move the cursor to the end of the bulk move list it's in */
+static void ttm_resource_cursor_move_bulk_tail(struct ttm_lru_bulk_move *bulk,
+					       struct ttm_resource_cursor *cursor)
+{
+	struct ttm_lru_bulk_move_pos *pos;
+
+	if (WARN_ON_ONCE(bulk != cursor->bulk)) {
+		list_del_init(&cursor->bulk_link);
+		return;
+	}
+
+	pos = &bulk->pos[cursor->mem_type][cursor->priority];
+	if (pos)
+		list_move(&cursor->hitch.link, &pos->last->lru.link);
+	ttm_resource_cursor_clear_bulk(cursor);
+}
+
+/* Move all cursors attached to a bulk move to its end */
+static void ttm_bulk_move_adjust_cursors(struct ttm_lru_bulk_move *bulk)
+{
+	struct ttm_resource_cursor *cursor, *next;
+
+	list_for_each_entry_safe(cursor, next, &bulk->cursor_list, bulk_link)
+		ttm_resource_cursor_move_bulk_tail(bulk, cursor);
+}
+
+/* Remove a cursor from an empty bulk move list */
+static void ttm_bulk_move_drop_cursors(struct ttm_lru_bulk_move *bulk)
+{
+	struct ttm_resource_cursor *cursor, *next;
+
+	list_for_each_entry_safe(cursor, next, &bulk->cursor_list, bulk_link)
+		ttm_resource_cursor_clear_bulk(cursor);
+}
+
 /**
  * ttm_resource_cursor_fini_locked() - Finalize the LRU list cursor usage
  * @cursor: The struct ttm_resource_cursor to finalize.
@@ -44,6 +87,7 @@ void ttm_resource_cursor_fini_locked(struct ttm_resource_cursor *cursor)
 {
 	lockdep_assert_held(&cursor->man->bdev->lru_lock);
 	list_del_init(&cursor->hitch.link);
+	ttm_resource_cursor_clear_bulk(cursor);
 }

 /**
@@ -72,9 +116,27 @@ void ttm_resource_cursor_fini(struct ttm_resource_cursor *cursor)
 void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk)
 {
 	memset(bulk, 0, sizeof(*bulk));
+	INIT_LIST_HEAD(&bulk->cursor_list);
 }
 EXPORT_SYMBOL(ttm_lru_bulk_move_init);

+/**
+ * ttm_lru_bulk_move_fini - finalize a bulk move structure
+ * @bdev: The struct ttm_device
+ * @bulk: the structure to finalize
+ *
+ * Sanity checks that bulk moves don't have any
+ * resources left and hence no cursors attached.
+ */
+void ttm_lru_bulk_move_fini(struct ttm_device *bdev,
+			    struct ttm_lru_bulk_move *bulk)
+{
+	spin_lock(&bdev->lru_lock);
+	ttm_bulk_move_drop_cursors(bulk);
+