Re: [PATCH v2 3/9] drm/ttm, drm/amdgpu, drm/xe: Consider hitch moves within bulk sublist moves

2024-04-25 Thread Matthew Brost
On Tue, Apr 16, 2024 at 12:07:24PM +0200, Thomas Hellström wrote:
> To address the problem with hitches moving when bulk move
> sublists are lru-bumped, register the list cursors with the
> ttm_lru_bulk_move structure when traversing its list, and
> when lru-bumping the list, move the cursor hitch to the tail.
> This also means it's mandatory for drivers to call
> ttm_lru_bulk_move_init() and ttm_lru_bulk_move_fini() when
> initializing and finalizing the bulk move structure, so add
> those calls to the amdgpu- and xe driver.
> 
> Compared to v1 this is slightly more code but less fragile
> and hopefully easier to understand.
> 
> Changes in previous series:
> - Completely rework the functionality
> - Avoid a NULL pointer dereference assigning manager->mem_type
> - Remove some leftover code causing build problems
> v2:
> - For hitch bulk tail moves, store the mem_type in the cursor
>   instead of with the manager.
> 
> Cc: Christian König 
> Cc: Somalapuram Amaranath 
> Cc: 
> Signed-off-by: Thomas Hellström 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |  4 ++
>  drivers/gpu/drm/ttm/ttm_resource.c | 92 +-
>  drivers/gpu/drm/xe/xe_vm.c |  4 ++
>  include/drm/ttm/ttm_resource.h | 58 ++--
>  4 files changed, 137 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 4299ce386322..18bf174c8d47 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2368,6 +2368,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
> amdgpu_vm *vm,
>   if (r)
>   return r;
>  
> + ttm_lru_bulk_move_init(>lru_bulk_move);
> +
>   vm->is_compute_context = false;
>  
>   vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
> @@ -2431,6 +2433,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
> amdgpu_vm *vm,
>  error_free_delayed:
>   dma_fence_put(vm->last_tlb_flush);
>   dma_fence_put(vm->last_unlocked);
> + ttm_lru_bulk_move_fini(>mman.bdev, >lru_bulk_move);
>   amdgpu_vm_fini_entities(vm);
>  
>   return r;
> @@ -2587,6 +2590,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct 
> amdgpu_vm *vm)
>   }
>   }
>  
> + ttm_lru_bulk_move_fini(>mman.bdev, >lru_bulk_move);
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/ttm/ttm_resource.c 
> b/drivers/gpu/drm/ttm/ttm_resource.c
> index 22f8ae4ff4c0..2b93727c78e5 100644
> --- a/drivers/gpu/drm/ttm/ttm_resource.c
> +++ b/drivers/gpu/drm/ttm/ttm_resource.c
> @@ -32,6 +32,49 @@
>  
>  #include 
>  
> +/* Detach the cursor from the bulk move list*/
> +static void
> +ttm_resource_cursor_clear_bulk(struct ttm_resource_cursor *cursor)
> +{
> + cursor->bulk = NULL;
> + list_del_init(>bulk_link);
> +}
> +
> +/* Move the cursor to the end of the bulk move list it's in */
> +static void ttm_resource_cursor_move_bulk_tail(struct ttm_lru_bulk_move 
> *bulk,
> +struct ttm_resource_cursor 
> *cursor)
> +{
> + struct ttm_lru_bulk_move_pos *pos;
> +
> + if (WARN_ON_ONCE(bulk != cursor->bulk)) {
> + list_del_init(>bulk_link);
> + return;
> + }
> +
> + pos = >pos[cursor->mem_type][cursor->priority];
> + if (pos)
> + list_move(>hitch.link, >last->lru.link);
> + ttm_resource_cursor_clear_bulk(cursor);
> +}
> +
> +/* Move all cursors attached to a bulk move to its end */
> +static void ttm_bulk_move_adjust_cursors(struct ttm_lru_bulk_move *bulk)
> +{
> + struct ttm_resource_cursor *cursor, *next;
> +
> + list_for_each_entry_safe(cursor, next, >cursor_list, bulk_link)
> + ttm_resource_cursor_move_bulk_tail(bulk, cursor);
> +}
> +
> +/* Remove a cursor from an empty bulk move list */
> +static void ttm_bulk_move_drop_cursors(struct ttm_lru_bulk_move *bulk)
> +{
> + struct ttm_resource_cursor *cursor, *next;
> +
> + list_for_each_entry_safe(cursor, next, >cursor_list, bulk_link)
> + ttm_resource_cursor_clear_bulk(cursor);
>
 +}
> +
>  /**
>   * ttm_resource_cursor_fini_locked() - Finalize the LRU list cursor usage
>   * @cursor: The struct ttm_resource_cursor to finalize.
> @@ -44,6 +87,7 @@ void ttm_resource_cursor_fini_locked(struct 
> ttm_resource_cursor *cursor)
>  {
>   lockdep_assert_held(>man->bdev->lru_lock);
>   list_del_init(>hitch.link);
> + ttm_resource_cursor_clear_bulk(cursor);
>  }
>  
>  /**
> @@ -72,9 +116,27 @@ void ttm_resource_cursor_fini(struct ttm_resource_cursor 
> *cursor)
>  void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk)
>  {
>   memset(bulk, 0, sizeof(*bulk));
> + INIT_LIST_HEAD(>cursor_list);
>  }
>  EXPORT_SYMBOL(ttm_lru_bulk_move_init);
>  
> +/**
> + * ttm_lru_bulk_move_fini - finalize a bulk move structure
> + * @bdev: The struct ttm_device
> + * @bulk: the structure to finalize
> + *
> + * Sanity checks 

[PATCH v2 3/9] drm/ttm, drm/amdgpu, drm/xe: Consider hitch moves within bulk sublist moves

2024-04-16 Thread Thomas Hellström
To address the problem with hitches moving when bulk move
sublists are lru-bumped, register the list cursors with the
ttm_lru_bulk_move structure when traversing its list, and
when lru-bumping the list, move the cursor hitch to the tail.
This also means it's mandatory for drivers to call
ttm_lru_bulk_move_init() and ttm_lru_bulk_move_fini() when
initializing and finalizing the bulk move structure, so add
those calls to the amdgpu- and xe driver.

Compared to v1 this is slightly more code but less fragile
and hopefully easier to understand.

Changes in previous series:
- Completely rework the functionality
- Avoid a NULL pointer dereference assigning manager->mem_type
- Remove some leftover code causing build problems
v2:
- For hitch bulk tail moves, store the mem_type in the cursor
  instead of with the manager.

Cc: Christian König 
Cc: Somalapuram Amaranath 
Cc: 
Signed-off-by: Thomas Hellström 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |  4 ++
 drivers/gpu/drm/ttm/ttm_resource.c | 92 +-
 drivers/gpu/drm/xe/xe_vm.c |  4 ++
 include/drm/ttm/ttm_resource.h | 58 ++--
 4 files changed, 137 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 4299ce386322..18bf174c8d47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2368,6 +2368,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
if (r)
return r;
 
+   ttm_lru_bulk_move_init(>lru_bulk_move);
+
vm->is_compute_context = false;
 
vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
@@ -2431,6 +2433,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
 error_free_delayed:
dma_fence_put(vm->last_tlb_flush);
dma_fence_put(vm->last_unlocked);
+   ttm_lru_bulk_move_fini(>mman.bdev, >lru_bulk_move);
amdgpu_vm_fini_entities(vm);
 
return r;
@@ -2587,6 +2590,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
}
}
 
+   ttm_lru_bulk_move_fini(>mman.bdev, >lru_bulk_move);
 }
 
 /**
diff --git a/drivers/gpu/drm/ttm/ttm_resource.c 
b/drivers/gpu/drm/ttm/ttm_resource.c
index 22f8ae4ff4c0..2b93727c78e5 100644
--- a/drivers/gpu/drm/ttm/ttm_resource.c
+++ b/drivers/gpu/drm/ttm/ttm_resource.c
@@ -32,6 +32,49 @@
 
 #include 
 
+/* Detach the cursor from the bulk move list*/
+static void
+ttm_resource_cursor_clear_bulk(struct ttm_resource_cursor *cursor)
+{
+   cursor->bulk = NULL;
+   list_del_init(>bulk_link);
+}
+
+/* Move the cursor to the end of the bulk move list it's in */
+static void ttm_resource_cursor_move_bulk_tail(struct ttm_lru_bulk_move *bulk,
+  struct ttm_resource_cursor 
*cursor)
+{
+   struct ttm_lru_bulk_move_pos *pos;
+
+   if (WARN_ON_ONCE(bulk != cursor->bulk)) {
+   list_del_init(>bulk_link);
+   return;
+   }
+
+   pos = >pos[cursor->mem_type][cursor->priority];
+   if (pos)
+   list_move(>hitch.link, >last->lru.link);
+   ttm_resource_cursor_clear_bulk(cursor);
+}
+
+/* Move all cursors attached to a bulk move to its end */
+static void ttm_bulk_move_adjust_cursors(struct ttm_lru_bulk_move *bulk)
+{
+   struct ttm_resource_cursor *cursor, *next;
+
+   list_for_each_entry_safe(cursor, next, >cursor_list, bulk_link)
+   ttm_resource_cursor_move_bulk_tail(bulk, cursor);
+}
+
+/* Remove a cursor from an empty bulk move list */
+static void ttm_bulk_move_drop_cursors(struct ttm_lru_bulk_move *bulk)
+{
+   struct ttm_resource_cursor *cursor, *next;
+
+   list_for_each_entry_safe(cursor, next, >cursor_list, bulk_link)
+   ttm_resource_cursor_clear_bulk(cursor);
+}
+
 /**
  * ttm_resource_cursor_fini_locked() - Finalize the LRU list cursor usage
  * @cursor: The struct ttm_resource_cursor to finalize.
@@ -44,6 +87,7 @@ void ttm_resource_cursor_fini_locked(struct 
ttm_resource_cursor *cursor)
 {
lockdep_assert_held(>man->bdev->lru_lock);
list_del_init(>hitch.link);
+   ttm_resource_cursor_clear_bulk(cursor);
 }
 
 /**
@@ -72,9 +116,27 @@ void ttm_resource_cursor_fini(struct ttm_resource_cursor 
*cursor)
 void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk)
 {
memset(bulk, 0, sizeof(*bulk));
+   INIT_LIST_HEAD(>cursor_list);
 }
 EXPORT_SYMBOL(ttm_lru_bulk_move_init);
 
+/**
+ * ttm_lru_bulk_move_fini - finalize a bulk move structure
+ * @bdev: The struct ttm_device
+ * @bulk: the structure to finalize
+ *
+ * Sanity checks that bulk moves don't have any
+ * resources left and hence no cursors attached.
+ */
+void ttm_lru_bulk_move_fini(struct ttm_device *bdev,
+   struct ttm_lru_bulk_move *bulk)
+{
+   spin_lock(>lru_lock);
+   ttm_bulk_move_drop_cursors(bulk);
+