Re: [PATCH 18/33] drm/amdkfd: add raise exception event function

2023-05-30 Thread Felix Kuehling



Am 2023-05-25 um 13:27 schrieb Jonathan Kim:

Exception events can be generated from interrupts or queue activity.

The raise event function will save exception status of a queue, device
or process then notify the debugger of the status change by writing to
a debugger polled file descriptor that the debugger provides during
debug attach.

For memory violation exceptions, extra exception data will be saved.

The debugger will be able to query the saved exception states via a query
operation that will be provided by follow-up patches.

v2: use new kfd_node struct in prototype.

Signed-off-by: Jonathan Kim 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 104 +++
  drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   7 ++
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 +++
  drivers/gpu/drm/amd/amdkfd/kfd_process.c |   2 +
  4 files changed, 123 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 5e2ee2d1acc4..dccb27fc764b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -24,6 +24,107 @@
  #include "kfd_device_queue_manager.h"
  #include 
  
+void debug_event_write_work_handler(struct work_struct *work)

+{
+   struct kfd_process *process;
+
+   static const char write_data = '.';
+   loff_t pos = 0;
+
+   process = container_of(work,
+   struct kfd_process,
+   debug_event_workarea);
+
+   kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
+}
+
+/* update process/device/queue exception status, write to descriptor
+ * only if the event is enabled in exception_enable_mask.
+ */
+bool kfd_dbg_ev_raise(uint64_t event_mask,
+   struct kfd_process *process, struct kfd_node *dev,
+   unsigned int source_id, bool use_worker,
+   void *exception_data, size_t exception_data_size)
+{
+   struct process_queue_manager *pqm;
+   struct process_queue_node *pqn;
+   int i;
+   static const char write_data = '.';
+   loff_t pos = 0;
+   bool is_subscribed = true;
+
+   if (!(process && process->debug_trap_enabled))
+   return false;
+
+   mutex_lock(&process->event_mutex);
+
+   if (event_mask & KFD_EC_MASK_DEVICE) {
+   for (i = 0; i < process->n_pdds; i++) {
+   struct kfd_process_device *pdd = process->pdds[i];
+
+   if (pdd->dev != dev)
+   continue;
+
+   pdd->exception_status |= event_mask & 
KFD_EC_MASK_DEVICE;
+
+   if (event_mask & 
KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+   if (!pdd->vm_fault_exc_data) {
+   pdd->vm_fault_exc_data = kmemdup(
+   exception_data,
+   exception_data_size,
+   GFP_KERNEL);
+   if (!pdd->vm_fault_exc_data)
+   pr_debug("Failed to allocate 
exception data memory");
+   } else {
+   pr_debug("Debugger exception data not 
saved\n");
+   print_hex_dump_bytes("exception data: ",
+   DUMP_PREFIX_OFFSET,
+   exception_data,
+   exception_data_size);
+   }
+   }
+   break;
+   }
+   } else if (event_mask & KFD_EC_MASK_PROCESS) {
+   process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
+   } else {
+   pqm = &process->pqm;
+   list_for_each_entry(pqn, &pqm->queues,
+   process_queue_list) {
+   int target_id;
+
+   if (!pqn->q)
+   continue;
+
+   target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
+   pqn->q->properties.queue_id :
+   pqn->q->doorbell_id;
+
+   if (pqn->q->device != dev || target_id != source_id)
+   continue;
+
+   pqn->q->properties.exception_status |= event_mask;
+   break;
+   }
+   }
+
+   if (process->exception_enable_mask & event_mask) {
+   if (use_worker)
+   schedule_work(&process->debug_event_workarea);
+   else
+   kernel_write(process->dbg_ev_file,
+  

[PATCH 18/33] drm/amdkfd: add raise exception event function

2023-05-25 Thread Jonathan Kim
Exception events can be generated from interrupts or queue activity.

The raise event function will save exception status of a queue, device
or process then notify the debugger of the status change by writing to
a debugger polled file descriptor that the debugger provides during
debug attach.

For memory violation exceptions, extra exception data will be saved.

The debugger will be able to query the saved exception states via a query
operation that will be provided by follow-up patches.

v2: use new kfd_node struct in prototype.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 104 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 +++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |   2 +
 4 files changed, 123 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 5e2ee2d1acc4..dccb27fc764b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -24,6 +24,107 @@
 #include "kfd_device_queue_manager.h"
 #include 
 
+void debug_event_write_work_handler(struct work_struct *work)
+{
+   struct kfd_process *process;
+
+   static const char write_data = '.';
+   loff_t pos = 0;
+
+   process = container_of(work,
+   struct kfd_process,
+   debug_event_workarea);
+
+   kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
+}
+
+/* update process/device/queue exception status, write to descriptor
+ * only if the event is enabled in exception_enable_mask.
+ */
+bool kfd_dbg_ev_raise(uint64_t event_mask,
+   struct kfd_process *process, struct kfd_node *dev,
+   unsigned int source_id, bool use_worker,
+   void *exception_data, size_t exception_data_size)
+{
+   struct process_queue_manager *pqm;
+   struct process_queue_node *pqn;
+   int i;
+   static const char write_data = '.';
+   loff_t pos = 0;
+   bool is_subscribed = true;
+
+   if (!(process && process->debug_trap_enabled))
+   return false;
+
+   mutex_lock(&process->event_mutex);
+
+   if (event_mask & KFD_EC_MASK_DEVICE) {
+   for (i = 0; i < process->n_pdds; i++) {
+   struct kfd_process_device *pdd = process->pdds[i];
+
+   if (pdd->dev != dev)
+   continue;
+
+   pdd->exception_status |= event_mask & 
KFD_EC_MASK_DEVICE;
+
+   if (event_mask & 
KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+   if (!pdd->vm_fault_exc_data) {
+   pdd->vm_fault_exc_data = kmemdup(
+   exception_data,
+   exception_data_size,
+   GFP_KERNEL);
+   if (!pdd->vm_fault_exc_data)
+   pr_debug("Failed to allocate 
exception data memory");
+   } else {
+   pr_debug("Debugger exception data not 
saved\n");
+   print_hex_dump_bytes("exception data: ",
+   DUMP_PREFIX_OFFSET,
+   exception_data,
+   exception_data_size);
+   }
+   }
+   break;
+   }
+   } else if (event_mask & KFD_EC_MASK_PROCESS) {
+   process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
+   } else {
+   pqm = &process->pqm;
+   list_for_each_entry(pqn, &pqm->queues,
+   process_queue_list) {
+   int target_id;
+
+   if (!pqn->q)
+   continue;
+
+   target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
+   pqn->q->properties.queue_id :
+   pqn->q->doorbell_id;
+
+   if (pqn->q->device != dev || target_id != source_id)
+   continue;
+
+   pqn->q->properties.exception_status |= event_mask;
+   break;
+   }
+   }
+
+   if (process->exception_enable_mask & event_mask) {
+   if (use_worker)
+   schedule_work(&process->debug_event_workarea);
+   else
+   kernel_write(process->dbg_ev_file,
+   &write_data,
+   1,
+