Re: [PATCH 26/33] drm/amdkfd: add debug suspend and resume process queues operation

2023-05-30 Thread Felix Kuehling

Am 2023-05-25 um 13:27 schrieb Jonathan Kim:

In order to inspect waves from the saved context at any point during a
debug session, the debugger must be able to preempt queues to trigger
context save by suspending them.

On queue suspend, the KFD will copy the context save header information
so that the debugger can correctly crawl the appropriate size of the saved
context. The debugger must then also be allowed to resume suspended queues.

A queue that is newly created cannot be suspended because queue ids are
recycled after destruction so the debugger needs to know that this has
occurred.  Query functions will be later added that will clear a given
queue of its new queue status.

A queue cannot be destroyed while it is suspended to preserve its saved
context during debugger inspection.  Have queue destruction block while
a queue is suspended and unblocked when it is resumed.  Likewise, if a
queue is about to be destroyed, it cannot be suspended.

Return the number of queues successfully suspended or resumed along with
a per queue status array where the upper bits per queue status show that
the request was invalid (new/destroyed queue suspend request, missing
queue) or an error occurred (HWS in a fatal state so it can't suspend or
resume queues).

v2: fixup new kfd_node struct reference for mes fw check.
also fixup missing EC_QUEUE_NEW flagging on newly created queue.

Signed-off-by: Jonathan Kim 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|   5 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  11 +
  drivers/gpu/drm/amd/amdkfd/kfd_debug.c|   7 +
  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 447 +-
  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  10 +
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  10 +
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  15 +-
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  14 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   5 +-
  .../amd/amdkfd/kfd_process_queue_manager.c|   1 +
  11 files changed, 512 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 98cd52bb005f..b4fcad0e62f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -772,6 +772,11 @@ bool amdgpu_amdkfd_have_atomics_support(struct 
amdgpu_device *adev)
return adev->have_atomics_support;
  }
  
+void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)

+{
+   amdgpu_device_flush_hdp(adev, NULL);
+}
+
  void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, 
bool reset)
  {
amdgpu_umc_poison_handler(adev, reset);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index dd740e64e6e1..2d0406bff84e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -322,6 +322,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device 
*adev,
  uint64_t *mmap_offset);
  int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem,
  struct dma_buf **dmabuf);
+void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
  int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
  void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 4b45d4539d48..adda60273456 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -410,6 +410,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
pr_debug("Write ptr address   == 0x%016llX\n",
args->write_pointer_address);
  
+	kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_NEW), p, dev, queue_id, false, NULL, 0);

return 0;
  
  err_create_queue:

@@ -2996,7 +2997,17 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
args->launch_mode.launch_mode);
break;
case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
+   r = suspend_queues(target,
+   args->suspend_queues.num_queues,
+   args->suspend_queues.grace_period,
+   args->suspend_queues.exception_mask,
+   (uint32_t 
*)args->suspend_queues.queue_array_ptr);
+
+   break;
case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
+   r = resume_queues(target, args->resume_queues.num_queues,
+   (uint32_t 
*)args->resume_queues.queue_array_ptr);
+   break;
case 

[PATCH 26/33] drm/amdkfd: add debug suspend and resume process queues operation

2023-05-25 Thread Jonathan Kim
In order to inspect waves from the saved context at any point during a
debug session, the debugger must be able to preempt queues to trigger
context save by suspending them.

On queue suspend, the KFD will copy the context save header information
so that the debugger can correctly crawl the appropriate size of the saved
context. The debugger must then also be allowed to resume suspended queues.

A queue that is newly created cannot be suspended because queue ids are
recycled after destruction so the debugger needs to know that this has
occurred.  Query functions will be later added that will clear a given
queue of its new queue status.

A queue cannot be destroyed while it is suspended to preserve its saved
context during debugger inspection.  Have queue destruction block while
a queue is suspended and unblocked when it is resumed.  Likewise, if a
queue is about to be destroyed, it cannot be suspended.

Return the number of queues successfully suspended or resumed along with
a per queue status array where the upper bits per queue status show that
the request was invalid (new/destroyed queue suspend request, missing
queue) or an error occurred (HWS in a fatal state so it can't suspend or
resume queues).

v2: fixup new kfd_node struct reference for mes fw check.
also fixup missing EC_QUEUE_NEW flagging on newly created queue.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  11 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c|   7 +
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 447 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  10 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  10 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  15 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  14 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   5 +-
 .../amd/amdkfd/kfd_process_queue_manager.c|   1 +
 11 files changed, 512 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 98cd52bb005f..b4fcad0e62f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -772,6 +772,11 @@ bool amdgpu_amdkfd_have_atomics_support(struct 
amdgpu_device *adev)
return adev->have_atomics_support;
 }
 
+void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
+{
+   amdgpu_device_flush_hdp(adev, NULL);
+}
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, 
bool reset)
 {
amdgpu_umc_poison_handler(adev, reset);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index dd740e64e6e1..2d0406bff84e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -322,6 +322,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device 
*adev,
  uint64_t *mmap_offset);
 int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem,
  struct dma_buf **dmabuf);
+void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 4b45d4539d48..adda60273456 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -410,6 +410,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
pr_debug("Write ptr address   == 0x%016llX\n",
args->write_pointer_address);
 
+   kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_NEW), p, dev, queue_id, false, 
NULL, 0);
return 0;
 
 err_create_queue:
@@ -2996,7 +2997,17 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
args->launch_mode.launch_mode);
break;
case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
+   r = suspend_queues(target,
+   args->suspend_queues.num_queues,
+   args->suspend_queues.grace_period,
+   args->suspend_queues.exception_mask,
+   (uint32_t 
*)args->suspend_queues.queue_array_ptr);
+
+   break;
case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
+   r = resume_queues(target, args->resume_queues.num_queues,
+   (uint32_t 
*)args->resume_queues.queue_array_ptr);
+   break;
case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
case